tatami
C++ API for different matrix representations
convert_to_dense.hpp
#ifndef TATAMI_CONVERT_TO_DENSE_H
#define TATAMI_CONVERT_TO_DENSE_H

#include "./DenseMatrix.hpp"
#include "./transpose.hpp"

#include "../utils/consecutive_extractor.hpp"
#include "../utils/parallelize.hpp"
#include "../utils/copy.hpp"

#include <memory>
#include <vector>
#include <deque>
#include <algorithm>
#include <type_traits>

namespace tatami {

/**
 * @brief Options for `convert_to_dense()`.
 */
struct ConvertToDenseOptions {
    /**
     * Number of threads to use for the conversion.
     */
    int num_threads = 1;
};

/**
 * Convert a `Matrix` into a dense array of values.
 *
 * @param matrix The input matrix.
 * @param row_major Whether the output array should be in row-major order.
 * @param[out] store Pointer to an array of length equal to the product of the dimensions of `matrix`.
 * On output, this is filled with values from `matrix`, stored in the order specified by `row_major`.
 * @param options Further options.
 */
template <typename StoredValue_, typename InputValue_, typename InputIndex_>
void convert_to_dense(const Matrix<InputValue_, InputIndex_>& matrix, bool row_major, StoredValue_* store, const ConvertToDenseOptions& options) {
    InputIndex_ NR = matrix.nrow();
    InputIndex_ NC = matrix.ncol();
    bool pref_rows = matrix.prefer_rows();
    size_t primary = (pref_rows ? NR : NC);
    size_t secondary = (pref_rows ? NC : NR);
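    // As an illustration (hypothetical dimensions): a 1000 x 10 matrix that
    // prefers column-major extraction would have primary = 10 (the number of
    // columns to iterate over) and secondary = 1000 (the length of each
    // extracted column).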

    if (row_major == pref_rows) {
        // The input's preferred dimension already matches the requested
        // output order, so each primary vector can be extracted directly
        // into its final position in the output array.
        constexpr bool same_type = std::is_same<InputValue_, StoredValue_>::value;
        parallelize([&](int, InputIndex_ start, InputIndex_ length) -> void {
            std::vector<InputValue_> temp(same_type ? 0 : secondary);
            auto wrk = consecutive_extractor<false>(matrix, pref_rows, start, length);

            for (InputIndex_ x = 0; x < length; ++x) {
                auto store_copy = store + static_cast<size_t>(start + x) * secondary; // cast to avoid overflow.
                if constexpr(same_type) {
                    // Fetch directly into the output; copy_n() only performs
                    // a copy if fetch() returned a pointer to other storage.
                    auto ptr = wrk->fetch(store_copy);
                    copy_n(ptr, secondary, store_copy);
                } else {
                    // The types differ, so we fetch into a temporary buffer
                    // of InputValue_'s and convert while copying to the output.
                    auto ptr = wrk->fetch(temp.data());
                    std::copy_n(ptr, secondary, store_copy);
                }
            }
        }, primary, options.num_threads);

    } else if (matrix.is_sparse()) {
        std::fill_n(store, primary * secondary, 0); // already size_t's, to avoid overflow.

        // We iterate over the input matrix's preferred dimension but split
        // into threads along the non-preferred dimension. This aims to
        // reduce false sharing across threads during writes, as locations
        // for simultaneous writes in the transposed matrix will be
        // separated by around 'primary * length' elements.
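        // As a hypothetical illustration: if two threads split a secondary
        // dimension of 2000 into blocks of length 1000, thread 0 only writes
        // to store[0 .. 1000 * primary) while thread 1 only writes to
        // store[1000 * primary .. 2000 * primary), so their concurrent
        // writes can only share a cache line at the single block boundary.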
        parallelize([&](int, InputIndex_ start, InputIndex_ length) -> void {
            auto wrk = consecutive_extractor<true, InputValue_, InputIndex_>(matrix, pref_rows, 0, primary, start, length);
            std::vector<InputValue_> vtemp(length);
            std::vector<InputIndex_> itemp(length);

            // Note that we don't use the blocked transposition strategy
            // from the dense case, because the overhead of looping is
            // worse than the cache misses for sparse data.
            for (size_t x = 0; x < primary; ++x) {
                auto range = wrk->fetch(vtemp.data(), itemp.data());
                for (InputIndex_ i = 0; i < range.number; ++i) {
                    store[static_cast<size_t>(range.index[i]) * primary + x] = range.value[i]; // cast to size_t to avoid overflow.
                }
            }
        }, secondary, options.num_threads);

    } else {
        // Same logic as described for the sparse case; we iterate along the
        // preferred dimension but split into threads along the non-preferred
        // dimension to reduce false sharing.
        parallelize([&](int, InputIndex_ start, InputIndex_ length) -> void {
            auto wrk = consecutive_extractor<false, InputValue_, InputIndex_>(matrix, pref_rows, 0, primary, start, length);
            const size_t length_as_size_t = length;
            const size_t start_as_size_t = start;

            // Performing a blocked transposition to be more
            // cache-friendly. This involves collecting several
            // consecutive primary dimension elements so that we can
            // transpose by blocks along the secondary dimension.
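            // As a hypothetical illustration: with block_size = 16, each
            // 16 x 16 tile reads 16 contiguous runs of 'bigbuffer' while
            // writing to the same ~16 strided regions of 'store', so the
            // output cache lines are reused across all 16 fetched vectors
            // instead of being evicted between consecutive primary elements.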
            constexpr size_t block_size = 16;
            const size_t alloc = std::min(primary, block_size);
            std::vector<InputValue_> bigbuffer(length_as_size_t * alloc);
            std::vector<const InputValue_*> ptrs(alloc);
            std::vector<InputValue_*> buf_ptrs;
            buf_ptrs.reserve(alloc);
            auto first = bigbuffer.data();
            for (size_t i = 0; i < alloc; ++i, first += length_as_size_t) {
                buf_ptrs.push_back(first);
            }

            size_t prim_i = 0;
            while (prim_i < primary) {
                size_t prim_to_process = std::min(primary - prim_i, block_size);
                for (size_t c = 0; c < prim_to_process; ++c) {
                    ptrs[c] = wrk->fetch(buf_ptrs[c]);
                }

                size_t sec_i = 0;
                while (sec_i < length_as_size_t) {
                    size_t sec_end = sec_i + std::min(block_size, length_as_size_t - sec_i);
                    for (size_t c = 0; c < prim_to_process; ++c) {
                        auto input = ptrs[c];
                        size_t offset = start_as_size_t * primary + (c + prim_i); // already all size_t's, to avoid overflow.
                        for (size_t r = sec_i; r < sec_end; ++r) {
                            store[r * primary + offset] = input[r];
                        }
                    }

                    sec_i = sec_end;
                }
                prim_i += prim_to_process;
            }
        }, secondary, options.num_threads);
    }
}

/**
 * Convert a `Matrix` into a new `DenseMatrix`.
 *
 * @param matrix The input matrix.
 * @param row_major Whether the output matrix should be row-major.
 * @param options Further options.
 *
 * @return A pointer to a new `DenseMatrix` with the same contents as `matrix`,
 * stored in the order specified by `row_major`.
 */
template <
    typename Value_,
    typename Index_,
    typename StoredValue_ = Value_,
    typename InputValue_,
    typename InputIndex_
>
inline std::shared_ptr<Matrix<Value_, Index_> > convert_to_dense(const Matrix<InputValue_, InputIndex_>& matrix, bool row_major, const ConvertToDenseOptions& options) {
    auto NR = matrix.nrow();
    auto NC = matrix.ncol();
    std::vector<StoredValue_> buffer(static_cast<size_t>(NR) * static_cast<size_t>(NC));
    convert_to_dense(matrix, row_major, buffer.data(), options);
    return std::shared_ptr<Matrix<Value_, Index_> >(new DenseMatrix<Value_, Index_, decltype(buffer)>(NR, NC, std::move(buffer), row_major));
}

// Backwards compatibility.
template <typename StoredValue_, typename InputValue_, typename InputIndex_>
void convert_to_dense(const Matrix<InputValue_, InputIndex_>* matrix, bool row_major, StoredValue_* store, int threads = 1) {
    convert_to_dense(
        *matrix,
        row_major,
        store,
        [&]{
            ConvertToDenseOptions options;
            options.num_threads = threads;
            return options;
        }()
    );
}

template <typename Value_ = double, typename Index_ = int, typename StoredValue_ = Value_, typename InputValue_, typename InputIndex_>
inline std::shared_ptr<Matrix<Value_, Index_> > convert_to_dense(const Matrix<InputValue_, InputIndex_>* matrix, bool row_major, int threads = 1) {
    ConvertToDenseOptions options;
    options.num_threads = threads;
    return convert_to_dense<Value_, Index_, StoredValue_>(*matrix, row_major, options);
}

template<bool row_, typename StoredValue_, typename InputValue_, typename InputIndex_>
void convert_to_dense(const Matrix<InputValue_, InputIndex_>* matrix, StoredValue_* store, int threads = 1) {
    convert_to_dense(matrix, row_, store, threads);
}

template<bool row_, typename Value_, typename Index_, typename StoredValue_ = Value_, typename InputValue_, typename InputIndex_>
inline std::shared_ptr<Matrix<Value_, Index_> > convert_to_dense(const Matrix<InputValue_, InputIndex_>* matrix, int threads = 1) {
    return convert_to_dense<Value_, Index_, StoredValue_>(matrix, row_, threads);
}

}

#endif
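
For reference, a minimal usage sketch of the options-based overload above. The 2 x 3 input values and the umbrella header path ("tatami/tatami.hpp") are illustrative assumptions rather than part of this file; `DenseMatrix`, `ConvertToDenseOptions` and `convert_to_dense()` are as defined in the listing:

#include "tatami/tatami.hpp" // assumed umbrella header for the library
#include <iostream>
#include <vector>

int main() {
    // Construct a small 2 x 3 row-major dense matrix as the input.
    std::vector<double> values { 1, 2, 3, 4, 5, 6 };
    tatami::DenseMatrix<double, int, std::vector<double> > mat(2, 3, std::move(values), /* row_major = */ true);

    // Convert it to a column-major copy using two threads.
    tatami::ConvertToDenseOptions options;
    options.num_threads = 2;
    auto converted = tatami::convert_to_dense<double, int>(mat, /* row_major = */ false, options);

    std::cout << converted->nrow() << " x " << converted->ncol() << std::endl;
    return 0;
}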