36 InputIndex_ NR = matrix->
nrow();
37 InputIndex_ NC = matrix->
ncol();
39 size_t primary = (pref_rows ? NR : NC);
40 size_t secondary = (pref_rows ? NC : NR);
42 if (row_major == pref_rows) {
43 constexpr bool same_type = std::is_same<InputValue_, StoredValue_>::value;
44 parallelize([&](
int, InputIndex_ start, InputIndex_ length) ->
void {
45 std::vector<InputValue_> temp(same_type ? 0 : secondary);
48 for (InputIndex_ x = 0; x < length; ++x) {
49 auto store_copy = store +
static_cast<size_t>(start + x) * secondary;
50 if constexpr(same_type) {
51 auto ptr = wrk->fetch(store_copy);
52 copy_n(ptr, secondary, store_copy);
54 auto ptr = wrk->fetch(temp.data());
55 std::copy_n(ptr, secondary, store_copy);
61 std::fill_n(store, primary * secondary, 0);
68 parallelize([&](
int, InputIndex_ start, InputIndex_ length) ->
void {
70 std::vector<InputValue_> vtemp(length);
71 std::vector<InputIndex_> itemp(length);
76 for (
size_t x = 0; x < primary; ++x) {
77 auto range = wrk->fetch(vtemp.data(), itemp.data());
78 for (InputIndex_ i = 0; i < range.number; ++i) {
79 store[
static_cast<size_t>(range.index[i]) * primary + x] = range.value[i];
82 }, secondary, threads);
88 parallelize([&](
int, InputIndex_ start, InputIndex_ length) ->
void {
90 const size_t length_as_size_t = length;
91 const size_t start_as_size_t = start;
97 constexpr size_t block_size = 16;
98 const size_t alloc = std::min(primary, block_size);
99 std::vector<InputValue_> bigbuffer(length_as_size_t * alloc);
100 std::vector<const InputValue_*> ptrs(alloc);
101 std::vector<InputValue_*> buf_ptrs;
102 buf_ptrs.reserve(alloc);
103 auto first = bigbuffer.data();
104 for (
size_t i = 0; i < alloc; ++i, first += length_as_size_t) {
105 buf_ptrs.push_back(first);
109 while (prim_i < primary) {
110 size_t prim_to_process = std::min(primary - prim_i, block_size);
111 for (
size_t c = 0; c < prim_to_process; ++c) {
112 ptrs[c] = wrk->fetch(buf_ptrs[c]);
116 while (sec_i < length_as_size_t) {
117 size_t sec_end = sec_i + std::min(block_size, length_as_size_t - sec_i);
118 for (
size_t c = 0; c < prim_to_process; ++c) {
119 auto input = ptrs[c];
120 size_t offset = start_as_size_t * primary + (c + prim_i);
121 for (
size_t r = sec_i; r < sec_end; ++r) {
122 store[r * primary + offset] = input[r];
128 prim_i += prim_to_process;
130 }, secondary, threads);