tatami
C++ API for different matrix representations
convert_to_dense.hpp
#ifndef TATAMI_CONVERT_TO_DENSE_H
#define TATAMI_CONVERT_TO_DENSE_H

#include "./DenseMatrix.hpp"
#include "./transpose.hpp"

#include "../utils/consecutive_extractor.hpp"
#include "../utils/parallelize.hpp"
#include "../utils/copy.hpp"

#include <memory>
#include <vector>
#include <deque>
#include <algorithm> // for std::copy_n, std::fill_n, std::min
#include <type_traits> // for std::is_same

namespace tatami {

template <typename StoredValue_, typename InputValue_, typename InputIndex_>
void convert_to_dense(const Matrix<InputValue_, InputIndex_>* matrix, bool row_major, StoredValue_* store, int threads = 1) {
    InputIndex_ NR = matrix->nrow();
    InputIndex_ NC = matrix->ncol();
    bool pref_rows = matrix->prefer_rows();
    size_t primary = (pref_rows ? NR : NC);
    size_t secondary = (pref_rows ? NC : NR);

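    // 'primary' is the extent of the dimension that the input matrix prefers
    // to iterate over, and 'secondary' is the extent of the other dimension.
    // When the requested output layout follows the preferred dimension, each
    // fetched vector can be copied straight into 'store'; otherwise we need
    // to transpose.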
    if (row_major == pref_rows) {
        constexpr bool same_type = std::is_same<InputValue_, StoredValue_>::value;
        parallelize([&](int, InputIndex_ start, InputIndex_ length) -> void {
            std::vector<InputValue_> temp(same_type ? 0 : secondary);
            auto wrk = consecutive_extractor<false>(matrix, pref_rows, start, length);

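            // When the input and stored types are identical, fetch() is given
            // the output location directly; tatami's copy_n() then skips the
            // copy if the returned pointer is already the output pointer.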
            for (InputIndex_ x = 0; x < length; ++x) {
                auto store_copy = store + static_cast<size_t>(start + x) * secondary; // cast to avoid overflow.
                if constexpr(same_type) {
                    auto ptr = wrk->fetch(store_copy);
                    copy_n(ptr, secondary, store_copy);
                } else {
                    auto ptr = wrk->fetch(temp.data());
                    std::copy_n(ptr, secondary, store_copy);
                }
            }
        }, primary, threads);

    } else if (matrix->is_sparse()) {
        std::fill_n(store, primary * secondary, 0); // already cast to size_t to avoid overflow.

        // We iterate over the input matrix's preferred dimension but split
        // into threads along the non-preferred dimension. This aims to
        // reduce false sharing across threads during writes, as locations
        // for simultaneous writes in the transposed matrix will be
        // separated by around 'primary * length' elements.
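        // For example, with two threads and a secondary extent of 1000,
        // thread 0 writes to 'store[s * primary + x]' for s in [0, 500) while
        // thread 1 handles s in [500, 1000), so simultaneous writes are about
        // '500 * primary' elements apart and rarely share a cache line.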
        parallelize([&](int, InputIndex_ start, InputIndex_ length) -> void {
            auto wrk = consecutive_extractor<true>(matrix, pref_rows, static_cast<InputIndex_>(0), static_cast<InputIndex_>(primary), start, length);
            std::vector<InputValue_> vtemp(length);
            std::vector<InputIndex_> itemp(length);

            // Note that we don't use the blocked transposition strategy
            // from the dense case, because the overhead of looping is
            // worse than the cache misses for sparse data.
            for (size_t x = 0; x < primary; ++x) {
                auto range = wrk->fetch(vtemp.data(), itemp.data());
                for (InputIndex_ i = 0; i < range.number; ++i) {
                    store[static_cast<size_t>(range.index[i]) * primary + x] = range.value[i]; // cast to size_t to avoid overflow.
                }
            }
        }, secondary, threads);

    } else {
        // Same logic as described for the sparse case; we iterate along the
        // preferred dimension but split into threads along the non-preferred
        // dimension to reduce false sharing.
        parallelize([&](int, InputIndex_ start, InputIndex_ length) -> void {
            auto wrk = consecutive_extractor<false>(matrix, pref_rows, static_cast<InputIndex_>(0), static_cast<InputIndex_>(primary), start, length);
            const size_t length_as_size_t = length;
            const size_t start_as_size_t = start;

            // Performing a blocked transposition to be more
            // cache-friendly. This involves collecting several
            // consecutive primary dimension elements so that we can
            // transpose by blocks along the secondary dimension.
            constexpr size_t block_size = 16;
            const size_t alloc = std::min(primary, block_size);
            std::vector<InputValue_> bigbuffer(length_as_size_t * alloc);
            std::vector<const InputValue_*> ptrs(alloc);
            std::vector<InputValue_*> buf_ptrs;
            buf_ptrs.reserve(alloc);
            auto first = bigbuffer.data();
            for (size_t i = 0; i < alloc; ++i, first += length_as_size_t) {
                buf_ptrs.push_back(first);
            }
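            // 'bigbuffer' is partitioned into 'alloc' stretches of
            // 'length_as_size_t' elements each; 'buf_ptrs[c]' points to the
            // start of the c-th stretch, which holds one fetched
            // primary-dimension vector during each pass of the loop below.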

            size_t prim_i = 0;
            while (prim_i < primary) {
                size_t prim_to_process = std::min(primary - prim_i, block_size);
                for (size_t c = 0; c < prim_to_process; ++c) {
                    ptrs[c] = wrk->fetch(buf_ptrs[c]);
                }

                size_t sec_i = 0;
                while (sec_i < length_as_size_t) {
                    size_t sec_end = sec_i + std::min(block_size, length_as_size_t - sec_i);
                    for (size_t c = 0; c < prim_to_process; ++c) {
                        auto input = ptrs[c];
                        size_t offset = start_as_size_t * primary + (c + prim_i); // already all size_t's, to avoid overflow.
                        for (size_t r = sec_i; r < sec_end; ++r) {
                            store[r * primary + offset] = input[r];
                        }
                    }

                    sec_i = sec_end;
                }

                prim_i += prim_to_process;
            }
        }, secondary, threads);
    }

    return;
}

template <
    typename Value_ = double,
    typename Index_ = int,
    typename StoredValue_ = Value_,
    typename InputValue_,
    typename InputIndex_
>
inline std::shared_ptr<Matrix<Value_, Index_> > convert_to_dense(const Matrix<InputValue_, InputIndex_>* matrix, bool row_major, int threads = 1) {
    auto NR = matrix->nrow();
    auto NC = matrix->ncol();
    std::vector<StoredValue_> buffer(static_cast<size_t>(NR) * static_cast<size_t>(NC));
    convert_to_dense(matrix, row_major, buffer.data(), threads);
    return std::shared_ptr<Matrix<Value_, Index_> >(new DenseMatrix<Value_, Index_, decltype(buffer)>(NR, NC, std::move(buffer), row_major));
}
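
// Note that the interface type 'Value_' may differ from the storage type
// 'StoredValue_': the returned DenseMatrix owns a std::vector<StoredValue_>
// and converts its contents to 'Value_' upon extraction.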

// Backwards compatibility.
template <bool row_, typename StoredValue_, typename InputValue_, typename InputIndex_>
void convert_to_dense(const Matrix<InputValue_, InputIndex_>* matrix, StoredValue_* store, int threads = 1) {
    convert_to_dense(matrix, row_, store, threads);
}

template <bool row_, typename Value_, typename Index_, typename StoredValue_ = Value_, typename InputValue_, typename InputIndex_>
inline std::shared_ptr<Matrix<Value_, Index_> > convert_to_dense(const Matrix<InputValue_, InputIndex_>* matrix, int threads = 1) {
    return convert_to_dense<Value_, Index_, StoredValue_>(matrix, row_, threads);
}

}

#endif
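
As a quick usage sketch (not part of the header itself; the 4x3 matrix below is a hypothetical input, and the umbrella include assumes the usual "tatami/tatami.hpp" entry point):

#include "tatami/tatami.hpp"
#include <memory>
#include <vector>

int main() {
    // Wrap a 4x3 column-major array in the Matrix interface.
    std::vector<double> values(4 * 3);
    std::shared_ptr<tatami::Matrix<double, int> > mat(
        new tatami::DenseMatrix<double, int, std::vector<double> >(4, 3, std::move(values), /* row_major = */ false));

    // First overload: fill a caller-allocated buffer in row-major order.
    std::vector<double> store(4 * 3);
    tatami::convert_to_dense(mat.get(), /* row_major = */ true, store.data(), /* threads = */ 2);

    // Second overload: allocate the buffer and wrap it in a new DenseMatrix.
    auto converted = tatami::convert_to_dense(mat.get(), /* row_major = */ true, /* threads = */ 2);
    return (converted->nrow() == 4 && converted->ncol() == 3) ? 0 : 1;
}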