tatami
C++ API for different matrix representations
Loading...
Searching...
No Matches
convert_to_dense.hpp
Go to the documentation of this file.
1#ifndef TATAMI_CONVERT_TO_DENSE_H
2#define TATAMI_CONVERT_TO_DENSE_H
3
4#include "./DenseMatrix.hpp"
5#include "./transpose.hpp"
6
9#include "../utils/copy.hpp"
11
12#include <memory>
13#include <vector>
14#include <cstddef>
15
16#include "sanisizer/sanisizer.hpp"
17
24namespace tatami {
25
35
39template <typename StoredValue_, typename InputValue_, typename InputIndex_>
40void convert_to_dense_direct(const Matrix<InputValue_, InputIndex_>& matrix, const bool row, StoredValue_* const store, const ConvertToDenseOptions& options) {
41 const InputIndex_ NR = matrix.nrow();
42 const InputIndex_ NC = matrix.ncol();
43 const auto primary = (row ? NR : NC);
44 const auto secondary = (row ? NC : NR);
45
46 constexpr bool same_type = std::is_same<InputValue_, StoredValue_>::value;
48
49 parallelize([&](const int, const InputIndex_ start, const InputIndex_ length) -> void {
50 auto wrk = consecutive_extractor<false, InputValue_, InputIndex_>(matrix, row, start, length);
51 auto temp = [&]{
52 if constexpr(same_type) {
53 return false;
54 } else {
55 return std::vector<InputValue_>(secondary);
56 }
57 }();
58
59 for (InputIndex_ x = 0; x < length; ++x) {
60 const auto store_copy = store + sanisizer::product_unsafe<std::size_t>(secondary, start + x);
61 if constexpr(same_type) {
62 auto ptr = wrk->fetch(store_copy);
63 copy_n(ptr, secondary, store_copy);
64 } else {
65 auto ptr = wrk->fetch(temp.data());
66 std::copy_n(ptr, secondary, store_copy);
67 }
68 }
69 }, primary, options.num_threads);
70}
71
72template <typename StoredValue_, typename InputValue_, typename InputIndex_>
73void convert_to_dense_running_from_sparse(const Matrix<InputValue_, InputIndex_>& matrix, const bool row, StoredValue_* const store, const ConvertToDenseOptions& options) {
74 const InputIndex_ NR = matrix.nrow();
75 const InputIndex_ NC = matrix.ncol();
76 const auto primary = (row ? NR : NC);
77 const auto secondary = (row ? NC : NR);
78
79 // Here, our parallelization strategy is to directly put values into the output store for the main thread,
80 // while all other threads just store the structural non-zeros in a separate per-thread data structure.
81 // This aims to reduce false sharing at the cost of some extra allocations.
82 //
83 // Previously, each thread used to process a slice of each secondary dimension element and store it in the output.
84 // This avoided false sharing without requiring more memory usage, but was suboptimal in terms of memory locality as adjacent memory was processed by different threads.
85 // Each thread also needed to do more calculations to find its slice within each secondary dimension element.
86 struct ThreadSpecificHolder {
87 ThreadSpecificHolder(const InputIndex_ start, const InputIndex_ length) :
88 start(start),
89 values(cast_Index_to_container_size<I<decltype(values)> >(length)),
90 indices(cast_Index_to_container_size<I<decltype(indices)> >(length))
91 {}
92 InputIndex_ start;
93 std::vector<std::vector<InputValue_> > values;
94 std::vector<std::vector<InputIndex_> > indices;
95 };
96
97 const bool do_parallel = options.num_threads > 1;
98 std::optional<std::vector<std::optional<ThreadSpecificHolder> > > all_partial_contents;
99 if (do_parallel) {
100 // -1, as we don't need to store the results of the main thread.
101 all_partial_contents.emplace(sanisizer::cast<I<decltype(all_partial_contents->size())> >(options.num_threads - 1));
102 }
103
104 // We assume that 'store' was allocated correctly, in which case the product of 'primary' and 'secondary' is known to fit inside a std::size_t.
105 // This saves us from various checks when computing related products.
106 std::fill_n(store, sanisizer::product_unsafe<std::size_t>(primary, secondary), 0);
107
108 const auto num_used = parallelize([&](const int thread, const InputIndex_ start, const InputIndex_ length) -> void {
109 auto wrk = consecutive_extractor<true, InputValue_, InputIndex_>(matrix, !row, start, length);
112
113 if (!do_parallel || thread == 0) {
114 for (InputIndex_ x = 0; x < length; ++x) {
115 const auto range = wrk->fetch(vtemp.data(), itemp.data());
116 for (InputIndex_ i = 0; i < range.number; ++i) {
117 store[sanisizer::nd_offset<std::size_t>(start + x, secondary, range.index[i])] = range.value[i];
118 }
119 }
120
121 } else {
122 ThreadSpecificHolder tmp(start, length);
123 for (InputIndex_ x = 0; x < length; ++x) {
124 const auto range = wrk->fetch(vtemp.data(), itemp.data());
125 tmp.values[x] = std::vector<InputValue_>(range.value, range.value + range.number);
126 tmp.indices[x] = std::vector<InputIndex_>(range.index, range.index + range.number);
127 }
128 (*all_partial_contents)[thread - 1] = std::move(tmp);
129 }
130 }, secondary, options.num_threads);
131
132 // Our reduction step takes the structural non-zeros from other threads and adds them to the output store.
133 if (do_parallel) {
134 for (int u = 1; u < num_used; ++u) {
135 const auto start = (*all_partial_contents)[u - 1]->start;
136 const auto& tmp_values = (*all_partial_contents)[u - 1]->values;
137 const auto& tmp_indices = (*all_partial_contents)[u - 1]->indices;
138 const auto length = tmp_indices.size();
139 for (I<decltype(length)> x = 0; x < length ; ++x) {
140 const auto& cur_values = tmp_values[x];
141 const auto& cur_indices = tmp_indices[x];
142 const auto cur_count = cur_indices.size();
143 for (I<decltype(cur_count)> i = 0; i < cur_count; ++i) {
144 store[sanisizer::nd_offset<std::size_t>(start + x, secondary, cur_indices[i])] = cur_values[i];
145 }
146 }
147 }
148 }
149}
150
151template <typename StoredValue_, typename InputValue_, typename InputIndex_>
152void convert_to_dense_running_from_dense(const Matrix<InputValue_, InputIndex_>& matrix, const bool row, StoredValue_* const store, const ConvertToDenseOptions& options) {
153 const InputIndex_ NR = matrix.nrow();
154 const InputIndex_ NC = matrix.ncol();
155 const auto primary = (row ? NR : NC);
156 const auto secondary = (row ? NC : NR);
157
158 // Here, our parallelization strategy is to directly put values into the output store for the main thread,
159 // while all other threads just store the submatrix values in a separate per-thread data structure.
160 // This aims to reduce false sharing at the cost of some extra allocations.
161 //
162 // Previously, each thread used to process a slice of each secondary dimension element and store it in the output.
163 // This avoided false sharing without requiring more memory usage, but was suboptimal in terms of memory locality as adjacent memory was processed by different threads.
164 // Each thread also needed to do more calculations to find its slice within each secondary dimension element.
165 const bool do_parallel = options.num_threads > 1;
166 struct ThreadSpecificHolder {
167 ThreadSpecificHolder(const InputIndex_ start, const InputIndex_ length, const InputIndex_ primary) :
168 start(start),
169 length(length),
170 values(sanisizer::product<I<decltype(values.size())> >(length, primary)) // still need to check size here, as vector's size_type might be less than size_t.
171 {}
172 InputIndex_ start, length;
173 std::vector<InputValue_> values;
174 };
175 std::optional<std::vector<std::optional<ThreadSpecificHolder> > > all_partial_contents;
176 if (do_parallel) {
177 all_partial_contents.emplace(sanisizer::cast<I<decltype(all_partial_contents->size())> >(options.num_threads - 1));
178 }
179
180 // We assume that 'store' was allocated correctly, in which case the product of 'primary' and 'secondary' is known to fit inside a std::size_t.
181 // This saves us from various checks when computing related products.
182 std::fill_n(store, sanisizer::product_unsafe<std::size_t>(primary, secondary), 0);
183
184 const auto num_used = parallelize([&](const int thread, const InputIndex_ start, const InputIndex_ length) -> void {
185 auto wrk = consecutive_extractor<false, InputValue_, InputIndex_>(matrix, !row, start, length);
186
187 if (!do_parallel || thread == 0) {
188 // Performing a blocked transposition to be more cache-friendly.
189 // This involves collecting several consecutive primary dimension elements so that we can transpose by blocks along the secondary dimension.
190 constexpr InputIndex_ block_size = 16;
191 const InputIndex_ alloc = std::min(length, block_size);
192 std::vector<InputValue_> bigbuffer(sanisizer::product_unsafe<typename std::vector<InputValue_>::size_type>(primary, alloc));
193 std::vector<const InputValue_*> ptrs(alloc); // no need for protection here, we know that alloc <= 16.
194
195 InputIndex_ sec_i = 0;
196 while (sec_i < length) {
197 const InputIndex_ sec_to_process = std::min(static_cast<InputIndex_>(length - sec_i), block_size);
198 for (InputIndex_ x = 0; x < sec_to_process; ++x) {
199 ptrs[x] = wrk->fetch(bigbuffer.data() + sanisizer::product_unsafe<std::size_t>(primary, x));
200 }
201
202 InputIndex_ prim_i = 0;
203 while (prim_i < primary) {
204 const InputIndex_ prim_end = prim_i + std::min(static_cast<InputIndex_>(primary - prim_i), block_size);
205 for (InputIndex_ x = 0; x < sec_to_process; ++x) {
206 const auto input = ptrs[x];
207 for (InputIndex_ p = prim_i; p < prim_end; ++p) {
208 store[sanisizer::nd_offset<std::size_t>(start + sec_i + x, secondary, p)] = input[p];
209 }
210 }
211 prim_i = prim_end;
212 }
213 sec_i += sec_to_process;
214 }
215
216 } else {
217 ThreadSpecificHolder tmp(start, length, primary);
218 for (InputIndex_ x = 0; x < length; ++x) {
219 const auto buffer = tmp.values.data() + sanisizer::product_unsafe<std::size_t>(x, primary);
220 const auto ptr = wrk->fetch(buffer);
221 copy_n(ptr, primary, buffer);
222 }
223 (*all_partial_contents)[thread - 1] = std::move(tmp);
224 }
225 }, secondary, options.num_threads);
226
227 // Our reduction step performs the transposition to the output array.
228 if (do_parallel) {
229 for (int u = 1; u < num_used; ++u) {
230 const auto tmp_start = (*all_partial_contents)[u - 1]->start;
231 const auto tmp_length = (*all_partial_contents)[u - 1]->length;
232 const auto& tmp_values = (*all_partial_contents)[u - 1]->values;
233 transpose(tmp_values.data(), tmp_length, primary, primary, store + tmp_start, secondary);
234 }
235 }
236}
252template <typename StoredValue_, typename InputValue_, typename InputIndex_>
253void convert_to_dense(const Matrix<InputValue_, InputIndex_>& matrix, const bool row_major, StoredValue_* const store, const ConvertToDenseOptions& options) {
254 if (row_major == matrix.prefer_rows()) {
255 convert_to_dense_direct(matrix, row_major, store, options);
256 } else if (matrix.is_sparse()) {
257 convert_to_dense_running_from_sparse(matrix, row_major, store, options);
258 } else {
259 convert_to_dense_running_from_dense(matrix, row_major, store, options);
260 }
261}
262
277template <
278 typename Value_,
279 typename Index_,
280 typename StoredValue_ = Value_,
281 typename InputValue_,
282 typename InputIndex_
283>
284std::shared_ptr<Matrix<Value_, Index_> > convert_to_dense(const Matrix<InputValue_, InputIndex_>& matrix, const bool row_major, const ConvertToDenseOptions& options) {
285 const auto NR = matrix.nrow();
286 const auto NC = matrix.ncol();
287 const auto buffer_size = sanisizer::product<typename std::vector<StoredValue_>::size_type>(attest_for_Index(NR), attest_for_Index(NC));
288 std::vector<StoredValue_> buffer(buffer_size);
289 convert_to_dense(matrix, row_major, buffer.data(), options);
290
291 return std::shared_ptr<Matrix<Value_, Index_> >(
292 new DenseMatrix<Value_, Index_, I<decltype(buffer)> >(
293 sanisizer::cast<Index_>(attest_for_Index(NR)),
294 sanisizer::cast<Index_>(attest_for_Index(NC)),
295 std::move(buffer),
296 row_major
297 )
298 );
299}
300
304// Backwards compatbility.
305template <typename StoredValue_, typename InputValue_, typename InputIndex_>
306void convert_to_dense(const Matrix<InputValue_, InputIndex_>* matrix, bool row_major, StoredValue_* store, int threads = 1) {
308 *matrix,
309 row_major,
310 store,
311 [&]{
312 ConvertToDenseOptions options;
313 options.num_threads = threads;
314 return options;
315 }()
316 );
317}
318
319template <typename Value_ = double, typename Index_ = int, typename StoredValue_ = Value_, typename InputValue_, typename InputIndex_>
320inline std::shared_ptr<Matrix<Value_, Index_> > convert_to_dense(const Matrix<InputValue_, InputIndex_>* matrix, bool row_major, int threads = 1) {
321 ConvertToDenseOptions options;
322 options.num_threads = threads;
324 *matrix,
325 row_major,
326 [&]{
327 ConvertToDenseOptions options;
328 options.num_threads = threads;
329 return options;
330 }()
331 );
332}
333
334template<bool row_, typename StoredValue_, typename InputValue_, typename InputIndex_>
335void convert_to_dense(const Matrix<InputValue_, InputIndex_>* matrix, StoredValue_* store, int threads = 1) {
336 convert_to_dense(matrix, row_, store, threads);
337}
338
339template<bool row_, typename Value_, typename Index_, typename StoredValue_ = Value_, typename InputValue_, typename InputIndex_>
340inline std::shared_ptr<Matrix<Value_, Index_> > convert_to_dense(const Matrix<InputValue_, InputIndex_>* matrix, int threads = 1) {
341 return convert_to_dense<Value_, Index_, StoredValue_>(matrix, row_, threads);
342}
347}
348
349#endif
Dense matrix representation.
Convert index type to container size.
Dense matrix representation.
Definition DenseMatrix.hpp:180
Virtual class for a matrix.
Definition Matrix.hpp:59
virtual Index_ ncol() const =0
virtual Index_ nrow() const =0
virtual bool prefer_rows() const =0
virtual bool is_sparse() const =0
Templated construction of a new consecutive extractor.
Copy data from one buffer to another.
Flexible representations for matrix data.
Definition Extractor.hpp:15
Index_ can_cast_Index_to_container_size(const Index_ x)
Definition Index_to_container.hpp:49
void transpose(const Input_ *const input, const std::size_t nrow, const std::size_t ncol, const std::size_t input_stride, Output_ *const output, const std::size_t output_stride)
Definition transpose.hpp:38
int parallelize(Function_ fun, const Index_ tasks, const int workers)
Definition parallelize.hpp:58
void convert_to_dense(const Matrix< InputValue_, InputIndex_ > &matrix, const bool row_major, StoredValue_ *const store, const ConvertToDenseOptions &options)
Definition convert_to_dense.hpp:253
I< decltype(std::declval< Container_ >().size())> cast_Index_to_container_size(const Index_ x)
Definition Index_to_container.hpp:65
Value_ * copy_n(const Value_ *const input, const Size_ n, Value_ *const output)
Definition copy.hpp:37
Container_ create_container_of_Index_size(const Index_ x, Args_ &&... args)
Definition Index_to_container.hpp:82
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, const bool row, const Index_ iter_start, const Index_ iter_length, Args_ &&... args)
Definition consecutive_extractor.hpp:35
Parallelized iteration over a tatami::Matrix.
Options for convert_to_dense().
Definition convert_to_dense.hpp:29
int num_threads
Definition convert_to_dense.hpp:33
Transpose a dense array.