tatami_tiledb
tatami bindings for TileDB-backed matrices
SparseMatrix.hpp
1#ifndef TATAMI_TILEDB_SPARSE_MATRIX_HPP
2#define TATAMI_TILEDB_SPARSE_MATRIX_HPP
3
4#include "tatami_chunked/tatami_chunked.hpp"
5#include <tiledb/tiledb>
6
7#include "serialize.hpp"
8
9#include <string>
10#include <memory>
11#include <vector>
12#include <stdexcept>
13#include <type_traits>
14
20namespace tatami_tiledb {
21
27struct SparseMatrixOptions {
37 size_t maximum_cache_size = 100000000;
38
44 bool require_minimum_cache = true;
45};
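// As a brief illustration of these options (the 256 MB figure is just an example),
// a caller might configure the matrix as follows before construction:
//
//     tatami_tiledb::SparseMatrixOptions opt;
//     opt.maximum_cache_size = 256000000;
//     opt.require_minimum_cache = true;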
46
50namespace SparseMatrix_internal {
51
52typedef ::tatami_tiledb::internal::Components Components;
53typedef ::tatami_tiledb::internal::VariablyTypedDimension Dimension;
54typedef ::tatami_tiledb::internal::VariablyTypedVector CacheBuffer;
55
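// Reusable scratch buffers holding the attribute values, the target-dimension
// coordinates and the non-target-dimension coordinates retrieved by each TileDB query.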
56struct Workspace {
57 CacheBuffer values;
58 CacheBuffer target_indices;
59 CacheBuffer non_target_indices;
60};
61
62inline size_t execute_query(
63 const Components& tdb_comp,
64 tiledb::Subarray& subarray,
65 const std::string& attribute,
66 bool row,
67 const std::string& target_dimname,
68 const std::string& non_target_dimname,
69 Workspace& work,
70 size_t general_work_offset,
71 size_t target_index_work_offset,
72 size_t work_length,
73 bool needs_value,
74 bool needs_index)
75{
76 tiledb::Query query(tdb_comp.ctx, tdb_comp.array);
77 query.set_subarray(subarray);
78 query.set_layout(row ? TILEDB_ROW_MAJOR : TILEDB_COL_MAJOR);
79
80 work.target_indices.set_data_buffer(query, target_dimname, target_index_work_offset, work_length);
81 if (needs_value) {
82 work.values.set_data_buffer(query, attribute, general_work_offset, work_length);
83 }
84 if (needs_index) {
85 work.non_target_indices.set_data_buffer(query, non_target_dimname, general_work_offset, work_length);
86 }
87
88 if (query.submit() != tiledb::Query::Status::COMPLETE) {
89 throw std::runtime_error("failed to read sparse data from TileDB");
90 }
91
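 // result_buffer_elements() reports, for each buffer, a pair whose second member
 // is the number of elements actually written; the entry for the target dimension
 // therefore gives the number of non-zero entries retrieved by this query.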
92 return query.result_buffer_elements()[target_dimname].second;
93}
94
95/********************
96 *** Core classes ***
97 ********************/
98
99template<typename Index_>
100struct MyopicCacheParameters {
101 Index_ chunk_length;
102 size_t slab_size_in_elements;
103 size_t max_slabs_in_cache;
104};
105
106template<typename Index_>
107class MyopicCore {
108public:
109 MyopicCore(
110 const Components& tdb_comp,
111 const std::string& attribute,
112 bool row,
113 Index_ target_dim_extent,
114 const std::string& target_dimname,
115 const Dimension& tdb_target_dim,
116 const std::string& non_target_dimname,
117 const Dimension& tdb_non_target_dim,
118 tiledb_datatype_t tdb_type,
119 [[maybe_unused]] Index_ non_target_length, // provided for consistency with the other constructors.
120 [[maybe_unused]] tatami::MaybeOracle<false, Index_> oracle,
121 const MyopicCacheParameters<Index_>& cache_stats,
122 bool needs_value,
123 bool needs_index) :
124 my_tdb_comp(tdb_comp),
125 my_attribute(attribute),
126 my_row(row),
127 my_target_dim_extent(target_dim_extent),
128 my_tdb_target_dim(tdb_target_dim),
129 my_target_dimname(target_dimname),
130 my_tdb_non_target_dim(tdb_non_target_dim),
131 my_non_target_dimname(non_target_dimname),
132 my_target_chunk_length(cache_stats.chunk_length),
133 my_slab_size(cache_stats.slab_size_in_elements),
134 my_needs_value(needs_value),
135 my_needs_index(needs_index),
136 my_cache(cache_stats.max_slabs_in_cache)
137 {
138 // Only storing one slab at a time for the target indices.
139 my_work.target_indices.reset(my_tdb_target_dim.type(), my_slab_size);
140
141 size_t total_cache_size = my_slab_size * cache_stats.max_slabs_in_cache;
142 if (my_needs_value) {
143 my_work.values.reset(tdb_type, total_cache_size);
144 }
145 if (my_needs_index) {
146 my_work.non_target_indices.reset(my_tdb_non_target_dim.type(), total_cache_size);
147 }
148 }
149
150private:
151 const Components& my_tdb_comp;
152 const std::string& my_attribute;
153
154 bool my_row;
155 Index_ my_target_dim_extent;
156 const Dimension& my_tdb_target_dim;
157 const std::string& my_target_dimname;
158 const Dimension& my_tdb_non_target_dim;
159 const std::string& my_non_target_dimname;
160
161 Index_ my_target_chunk_length;
162 size_t my_slab_size;
163 bool my_needs_value;
164 bool my_needs_index;
165 Workspace my_work;
166 std::vector<std::pair<Index_, Index_> > my_counts;
167
168 struct Slab {
169 size_t offset;
170 std::vector<size_t> indptrs;
171 };
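 // 'offset' is the position of the slab's first non-zero element within the
 // workspace buffers, while 'indptrs' stores cumulative non-zero counts for each
 // target-dimension element of the chunk, in the manner of compressed sparse
 // row/column pointers.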
172 size_t my_offset = 0;
173 tatami_chunked::LruSlabCache<Index_, Slab> my_cache;
174
175private:
176 template<class Configure_>
177 std::pair<size_t, size_t> fetch_raw(Index_ i, Configure_ configure) {
178 Index_ chunk = i / my_target_chunk_length;
179 Index_ index = i % my_target_chunk_length;
180
181 const auto& info = my_cache.find(
182 chunk,
183 /* create = */ [&]() -> Slab {
184 Slab output;
185 output.offset = my_offset;
186 my_offset += my_slab_size;
187 return output;
188 },
189 /* populate = */ [&](Index_ id, Slab& contents) -> void {
190 Index_ chunk_start = id * my_target_chunk_length;
191 Index_ chunk_length = std::min(my_target_dim_extent - chunk_start, my_target_chunk_length);
192
193 size_t num_nonzero = 0;
194 serialize([&]() {
195 tiledb::Subarray subarray(my_tdb_comp.ctx, my_tdb_comp.array);
196 int rowdex = my_row;
197 my_tdb_target_dim.add_range(subarray, 1 - rowdex, chunk_start, chunk_length);
198 configure(subarray, rowdex);
199 num_nonzero = execute_query(
200 my_tdb_comp,
201 subarray,
202 my_attribute,
203 my_row,
204 my_target_dimname,
205 my_non_target_dimname,
206 my_work,
207 contents.offset,
208 0,
209 my_slab_size,
210 my_needs_value,
211 my_needs_index
212 );
213 });
214
215 auto& indptrs = contents.indptrs;
216 indptrs.clear();
217 indptrs.resize(chunk_length + 1);
218
219 if (num_nonzero) {
220 my_work.target_indices.compact(0, num_nonzero, my_tdb_target_dim, my_counts);
221 for (const auto& cnts : my_counts) {
222 indptrs[cnts.first - chunk_start + 1] = cnts.second;
223 }
224 for (Index_ i = 1; i <= chunk_length; ++i) {
225 indptrs[i] += indptrs[i - 1];
226 }
227 }
228 }
229 );
230
231 auto start = info.indptrs[index];
232 return std::make_pair(info.offset + start, info.indptrs[index + 1] - start);
233 }
234
235public:
236 std::pair<size_t, size_t> fetch_block(Index_ i, Index_ block_start, Index_ block_length) {
237 return fetch_raw(i, [&](tiledb::Subarray& subarray, int rowdex) {
238 my_tdb_non_target_dim.add_range(subarray, rowdex, block_start, block_length);
239 });
240 }
241
242 std::pair<size_t, size_t> fetch_indices(Index_ i, const std::vector<Index_>& indices) {
243 return fetch_raw(i, [&](tiledb::Subarray& subarray, int rowdex) {
244 tatami::process_consecutive_indices<Index_>(indices.data(), indices.size(), [&](Index_ s, Index_ l) {
245 my_tdb_non_target_dim.add_range(subarray, rowdex, s, l);
246 });
247 });
248 }
249
250public:
251 const Workspace& get_workspace() const {
252 return my_work;
253 }
254
255 bool get_needs_value() const {
256 return my_needs_value;
257 }
258
259 bool get_needs_index() const {
260 return my_needs_index;
261 }
262
263 const Dimension& get_tdb_non_target_dim() const {
264 return my_tdb_non_target_dim;
265 }
266};
267
268// The general idea with the oracular extractors is to either:
269//
270// - Extract each target dimension element directly, if the cell order within each tile corresponds to the desired target dimension (i.e., 'row').
271// - Extract the tile-wise chunk of target dimension elements, if the cell order within each tile is not the same as the target dimension.
272//
273// This means that the chunk length of each slab is either 1 or the tile extent, depending on the cell order of the TileDB array.
274// In addition, we use a variable slab cache that adjusts to the number of non-zero elements in each slab.
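//
// For example, given (hypothetical) 100-by-200 space tiles with a TILEDB_ROW_MAJOR
// cell order: row-based extraction can treat each row as its own slab (chunk length
// of 1), whereas column-based extraction would use the 200-column tile extent as the
// chunk length so that each query pulls out an entire space tile's worth of columns.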
275
276template<typename Index_>
277struct OracularCacheParameters {
278 Index_ chunk_length;
279 size_t max_cache_size_in_elements;
280};
281
282template<typename Index_>
283class OracularCore {
284public:
285 OracularCore(
286 const Components& tdb_comp,
287 const std::string& attribute,
288 bool row,
289 Index_ target_dim_extent,
290 const std::string& target_dimname,
291 const Dimension& tdb_target_dim,
292 const std::string& non_target_dimname,
293 const Dimension& tdb_non_target_dim,
294 tiledb_datatype_t tdb_type,
295 Index_ non_target_length,
296 tatami::MaybeOracle<true, Index_> oracle,
297 const OracularCacheParameters<Index_>& cache_stats,
298 bool needs_value,
299 bool needs_index) :
300 my_tdb_comp(tdb_comp),
301 my_attribute(attribute),
302 my_row(row),
303 my_target_dim_extent(target_dim_extent),
304 my_tdb_target_dim(tdb_target_dim),
305 my_target_dimname(target_dimname),
306 my_tdb_non_target_dim(tdb_non_target_dim),
307 my_non_target_dimname(non_target_dimname),
308 my_target_chunk_length(cache_stats.chunk_length),
309 my_max_slab_size(static_cast<size_t>(non_target_length) * my_target_chunk_length),
310 my_needs_value(needs_value),
311 my_needs_index(needs_index),
312 my_cache(std::move(oracle), cache_stats.max_cache_size_in_elements)
313 {
314 my_work.target_indices.reset(my_tdb_target_dim.type(), cache_stats.max_cache_size_in_elements);
315 if (my_needs_value) {
316 my_work.values.reset(tdb_type, cache_stats.max_cache_size_in_elements);
317 }
318 if (my_needs_index) {
319 my_work.non_target_indices.reset(my_tdb_non_target_dim.type(), cache_stats.max_cache_size_in_elements);
320 }
321 }
322
323private:
324 const Components& my_tdb_comp;
325 const std::string& my_attribute;
326
327 bool my_row;
328 Index_ my_target_dim_extent;
329 const Dimension& my_tdb_target_dim;
330 const std::string& my_target_dimname;
331 const Dimension& my_tdb_non_target_dim;
332 const std::string& my_non_target_dimname;
333
334 Index_ my_target_chunk_length;
335 size_t my_max_slab_size;
336 bool my_needs_value;
337 bool my_needs_index;
338 Workspace my_work;
339 std::vector<std::pair<Index_, Index_> > my_counts;
340
341 struct Slab {
342 size_t offset;
343 std::vector<size_t> indptrs;
344 };
345 tatami_chunked::OracularVariableSlabCache<Index_, Index_, Slab, size_t> my_cache;
346
347private:
348 template<class Function_>
349 static void sort_by_field(std::vector<std::pair<Index_, size_t> >& indices, Function_ field) {
350 auto comp = [&field](const std::pair<Index_, size_t>& l, const std::pair<Index_, size_t>& r) -> bool {
351 return field(l) < field(r);
352 };
353 if (!std::is_sorted(indices.begin(), indices.end(), comp)) {
354 std::sort(indices.begin(), indices.end(), comp);
355 }
356 }
357
358 template<class Configure_>
359 std::pair<size_t, size_t> fetch_raw([[maybe_unused]] Index_ i, Configure_ configure) {
360 auto info = my_cache.next(
361 /* identify = */ [&](Index_ current) -> std::pair<Index_, Index_> {
362 return std::pair<Index_, Index_>(current / my_target_chunk_length, current % my_target_chunk_length);
363 },
364 /* upper_size = */ [&](Index_) -> size_t {
365 return my_max_slab_size;
366 },
367 /* actual_size = */ [&](Index_, const Slab& slab) -> size_t {
368 return slab.indptrs.back();
369 },
370 /* create = */ [&]() -> Slab {
371 return Slab();
372 },
373 /* populate = */ [&](std::vector<std::pair<Index_, size_t> >& to_populate, std::vector<std::pair<Index_, size_t> >& to_reuse, std::vector<Slab>& all_slabs) {
374 // Defragmenting the existing chunks. We sort by offset to make
375 // sure that we're not clobbering in-use slabs during the shift().
376 sort_by_field(to_reuse, [&](const std::pair<Index_, size_t>& x) -> size_t { return all_slabs[x.second].offset; });
377 size_t running_offset = 0;
378 for (auto& x : to_reuse) {
379 auto& reused_slab = all_slabs[x.second];
380 auto& cur_offset = reused_slab.offset;
381 auto num_nonzero = reused_slab.indptrs.back();
382 if (cur_offset != running_offset) {
383 if (my_needs_value) {
384 my_work.values.shift(cur_offset, num_nonzero, running_offset);
385 }
386 if (my_needs_index) {
387 my_work.non_target_indices.shift(cur_offset, num_nonzero, running_offset);
388 }
389 cur_offset = running_offset;
390 }
391 running_offset += num_nonzero;
392 }
393
394 // Collapsing runs of consecutive chunks into a single range, and
395 // otherwise taking the union of ranges. This allows a single TileDB query
396 // to populate the contiguous memory pool that we made available after
397 // defragmentation; we then just update the slab pointers to refer
398 // to the slices of memory corresponding to each slab.
399 sort_by_field(to_populate, [](const std::pair<Index_, size_t>& x) -> Index_ { return x.first; });
400
401 size_t num_nonzero = 0;
402 serialize([&]() -> void {
403 tiledb::Subarray subarray(my_tdb_comp.ctx, my_tdb_comp.array);
404 int rowdex = my_row;
405 configure(subarray, rowdex);
406
407 Index_ run_chunk_id = to_populate.front().first;
408 Index_ run_chunk_start = run_chunk_id * my_target_chunk_length;
409 Index_ run_length = std::min(my_target_dim_extent - run_chunk_start, my_target_chunk_length);
410
411 int dimdex = 1 - rowdex;
412 for (size_t ci = 1, cend = to_populate.size(); ci < cend; ++ci) {
413 Index_ current_chunk_id = to_populate[ci].first;
414 Index_ current_chunk_start = current_chunk_id * my_target_chunk_length;
415
416 if (current_chunk_id - run_chunk_id > 1) { // save the existing run of to_populate as one range, and start a new run.
417 my_tdb_target_dim.add_range(subarray, dimdex, run_chunk_start, run_length);
418 run_chunk_id = current_chunk_id;
419 run_chunk_start = current_chunk_start;
420 run_length = 0;
421 }
422
423 run_length += std::min(my_target_dim_extent - current_chunk_start, my_target_chunk_length);
424 }
425
426 my_tdb_target_dim.add_range(subarray, dimdex, run_chunk_start, run_length);
427 num_nonzero = execute_query(
428 my_tdb_comp,
429 subarray,
430 my_attribute,
431 my_row,
432 my_target_dimname,
433 my_non_target_dimname,
434 my_work,
435 running_offset,
436 running_offset,
437 to_populate.size() * my_max_slab_size,
438 my_needs_value,
439 my_needs_index
440 );
441 });
442
443 my_work.target_indices.compact(running_offset, num_nonzero, my_tdb_target_dim, my_counts);
444
445 auto cIt = my_counts.begin(), cEnd = my_counts.end();
446 for (auto& si : to_populate) {
447 auto& populate_slab = all_slabs[si.second];
448 populate_slab.offset = running_offset;
449
450 Index_ chunk_start = si.first * my_target_chunk_length;
451 Index_ chunk_length = std::min(my_target_dim_extent - chunk_start, my_target_chunk_length);
452 Index_ chunk_end = chunk_start + chunk_length;
453
454 auto& slab_indptrs = populate_slab.indptrs;
455 slab_indptrs.clear();
456 slab_indptrs.resize(chunk_length + 1);
457
458 while (cIt != cEnd && cIt->first < chunk_end) {
459 slab_indptrs[cIt->first - chunk_start + 1] = cIt->second;
460 ++cIt;
461 }
462
463 for (Index_ i = 1; i <= chunk_length; ++i) {
464 slab_indptrs[i] += slab_indptrs[i - 1];
465 }
466 running_offset += slab_indptrs.back();
467 }
468 }
469 );
470
471 const auto& indptrs = info.first->indptrs;
472 auto start = indptrs[info.second];
473 return std::make_pair(info.first->offset + start, indptrs[info.second + 1] - start);
474 }
475
476public:
477 std::pair<size_t, size_t> fetch_block(Index_ i, Index_ block_start, Index_ block_length) {
478 return fetch_raw(i, [&](tiledb::Subarray& subarray, int rowdex) {
479 my_tdb_non_target_dim.add_range(subarray, rowdex, block_start, block_length);
480 });
481 }
482
483 std::pair<size_t, size_t> fetch_indices(Index_ i, const std::vector<Index_>& indices) {
484 return fetch_raw(i, [&](tiledb::Subarray& subarray, int rowdex) {
485 tatami::process_consecutive_indices<Index_>(indices.data(), indices.size(), [&](Index_ s, Index_ l) {
486 my_tdb_non_target_dim.add_range(subarray, rowdex, s, l);
487 });
488 });
489 }
490
491public:
492 const Workspace& get_workspace() const {
493 return my_work;
494 }
495
496 bool get_needs_value() const {
497 return my_needs_value;
498 }
499
500 bool get_needs_index() const {
501 return my_needs_index;
502 }
503
504 const Dimension& get_tdb_non_target_dim() const {
505 return my_tdb_non_target_dim;
506 }
507};
508
509template<bool oracle_, typename Index_>
510using SparseCore = typename std::conditional<oracle_, OracularCore<Index_>, MyopicCore<Index_> >::type;
511
512template<bool oracle_, typename Index_>
513using CacheParameters = typename std::conditional<oracle_, OracularCacheParameters<Index_>, MyopicCacheParameters<Index_> >::type;
514
515/*************************
516 *** Sparse subclasses ***
517 *************************/
518
519template<typename Value_, typename Index_>
520tatami::SparseRange<Value_, Index_> fill_sparse_range(
521 const Workspace& work,
522 size_t work_start,
523 size_t work_length,
524 const Dimension& non_target_dim,
525 Value_* vbuffer,
526 Index_* ibuffer,
527 bool needs_value,
528 bool needs_index)
529{
530 tatami::SparseRange<Value_, Index_> output;
531 output.number = work_length;
532 if (needs_value) {
533 work.values.copy(work_start, work_length, vbuffer);
534 output.value = vbuffer;
535 }
536 if (needs_index) {
537 work.non_target_indices.copy(work_start, work_length, non_target_dim, ibuffer);
538 output.index = ibuffer;
539 }
540 return output;
541}
542
543template<bool oracle_, typename Value_, typename Index_>
544class SparseFull : public tatami::SparseExtractor<oracle_, Value_, Index_> {
545public:
546 SparseFull(
547 const Components& tdb_comp,
548 const std::string& attribute,
549 bool row,
550 Index_ target_dim_extent,
551 const std::string& target_dimname,
552 const Dimension& tdb_target_dim,
553 const std::string& non_target_dimname,
554 const Dimension& tdb_non_target_dim,
555 tiledb_datatype_t tdb_type,
556 tatami::MaybeOracle<oracle_, Index_> oracle,
557 Index_ non_target_dim,
558 const CacheParameters<oracle_, Index_>& cache_parameters,
559 bool needs_value,
560 bool needs_index) :
561 my_core(
562 tdb_comp,
563 attribute,
564 row,
565 target_dim_extent,
566 target_dimname,
567 tdb_target_dim,
568 non_target_dimname,
569 tdb_non_target_dim,
570 tdb_type,
571 non_target_dim,
572 std::move(oracle),
573 cache_parameters,
574 needs_value,
575 needs_index
576 ),
577 my_non_target_dim(non_target_dim)
578 {}
579
580 tatami::SparseRange<Value_, Index_> fetch(Index_ i, Value_* vbuffer, Index_* ibuffer) {
581 auto info = my_core.fetch_block(i, 0, my_non_target_dim);
582 return fill_sparse_range(my_core.get_workspace(), info.first, info.second, my_core.get_tdb_non_target_dim(), vbuffer, ibuffer, my_core.get_needs_value(), my_core.get_needs_index());
583 }
584
585private:
586 SparseCore<oracle_, Index_> my_core;
587 Index_ my_non_target_dim;
588};
589
590template<bool oracle_, typename Value_, typename Index_>
591class SparseBlock : public tatami::SparseExtractor<oracle_, Value_, Index_> {
592public:
593 SparseBlock(
594 const Components& tdb_comp,
595 const std::string& attribute,
596 bool row,
597 Index_ target_dim_extent,
598 const std::string& target_dimname,
599 const Dimension& tdb_target_dim,
600 const std::string& non_target_dimname,
601 const Dimension& tdb_non_target_dim,
602 tiledb_datatype_t tdb_type,
603 tatami::MaybeOracle<oracle_, Index_> oracle,
604 Index_ block_start,
605 Index_ block_length,
606 const CacheParameters<oracle_, Index_>& cache_parameters,
607 bool needs_value,
608 bool needs_index) :
609 my_core(
610 tdb_comp,
611 attribute,
612 row,
613 target_dim_extent,
614 target_dimname,
615 tdb_target_dim,
616 non_target_dimname,
617 tdb_non_target_dim,
618 tdb_type,
619 block_length,
620 std::move(oracle),
621 cache_parameters,
622 needs_value,
623 needs_index
624 ),
625 my_block_start(block_start),
626 my_block_length(block_length)
627 {}
628
629 tatami::SparseRange<Value_, Index_> fetch(Index_ i, Value_* vbuffer, Index_* ibuffer) {
630 auto info = my_core.fetch_block(i, my_block_start, my_block_length);
631 return fill_sparse_range(my_core.get_workspace(), info.first, info.second, my_core.get_tdb_non_target_dim(), vbuffer, ibuffer, my_core.get_needs_value(), my_core.get_needs_index());
632 }
633
634private:
635 SparseCore<oracle_, Index_> my_core;
636 Index_ my_block_start, my_block_length;
637};
638
639template<bool oracle_, typename Value_, typename Index_>
640class SparseIndex : public tatami::SparseExtractor<oracle_, Value_, Index_> {
641public:
642 SparseIndex(
643 const Components& tdb_comp,
644 const std::string& attribute,
645 bool row,
646 Index_ target_dim_extent,
647 const std::string& target_dimname,
648 const Dimension& tdb_target_dim,
649 const std::string& non_target_dimname,
650 const Dimension& tdb_non_target_dim,
651 tiledb_datatype_t tdb_type,
652 tatami::MaybeOracle<oracle_, Index_> oracle,
653 tatami::VectorPtr<Index_> indices_ptr,
654 const CacheParameters<oracle_, Index_>& cache_parameters,
655 bool needs_value,
656 bool needs_index) :
657 my_core(
658 tdb_comp,
659 attribute,
660 row,
661 target_dim_extent,
662 target_dimname,
663 tdb_target_dim,
664 non_target_dimname,
665 tdb_non_target_dim,
666 tdb_type,
667 indices_ptr->size(),
668 std::move(oracle),
669 cache_parameters,
670 needs_value,
671 needs_index
672 ),
673 my_indices_ptr(std::move(indices_ptr))
674 {}
675
676 tatami::SparseRange<Value_, Index_> fetch(Index_ i, Value_* vbuffer, Index_* ibuffer) {
677 auto info = my_core.fetch_indices(i, *my_indices_ptr);
678 return fill_sparse_range(my_core.get_workspace(), info.first, info.second, my_core.get_tdb_non_target_dim(), vbuffer, ibuffer, my_core.get_needs_value(), my_core.get_needs_index());
679 }
680
681private:
682 SparseCore<oracle_, Index_> my_core;
683 tatami::VectorPtr<Index_> my_indices_ptr;
684};
685
686/************************
687 *** Dense subclasses ***
688 ************************/
689
690template<bool oracle_, typename Value_, typename Index_>
691class DenseFull : public tatami::DenseExtractor<oracle_, Value_, Index_> {
692public:
693 DenseFull(
694 const Components& tdb_comp,
695 const std::string& attribute,
696 bool row,
697 Index_ target_dim_extent,
698 const std::string& target_dimname,
699 const Dimension& tdb_target_dim,
700 const std::string& non_target_dimname,
701 const Dimension& tdb_non_target_dim,
702 tiledb_datatype_t tdb_type,
703 tatami::MaybeOracle<oracle_, Index_> oracle,
704 Index_ non_target_dim_extent,
705 const CacheParameters<oracle_, Index_>& cache_parameters,
706 [[maybe_unused]] bool needs_value, // for consistency with Sparse* constructors.
707 [[maybe_unused]] bool needs_index) :
708 my_core(
709 tdb_comp,
710 attribute,
711 row,
712 target_dim_extent,
713 target_dimname,
714 tdb_target_dim,
715 non_target_dimname,
716 tdb_non_target_dim,
717 tdb_type,
718 non_target_dim_extent,
719 std::move(oracle),
720 cache_parameters,
721 /* needs_value = */ true,
722 /* needs_index = */ true
723 ),
724 my_non_target_dim_extent(non_target_dim_extent),
725 my_holding_value(my_non_target_dim_extent),
726 my_holding_index(my_non_target_dim_extent)
727 {}
728
729 const Value_* fetch(Index_ i, Value_* buffer) {
730 auto info = my_core.fetch_block(i, 0, my_non_target_dim_extent);
731 const auto& work = my_core.get_workspace();
732 work.values.copy(info.first, info.second, my_holding_value.data());
733 work.non_target_indices.copy(info.first, info.second, my_core.get_tdb_non_target_dim(), my_holding_index.data());
734 std::fill_n(buffer, my_non_target_dim_extent, 0);
735 for (size_t i = 0; i < info.second; ++i) {
736 buffer[my_holding_index[i]] = my_holding_value[i];
737 }
738 return buffer;
739 }
740
741private:
742 SparseCore<oracle_, Index_> my_core;
743 Index_ my_non_target_dim_extent;
744 std::vector<Value_> my_holding_value;
745 std::vector<Index_> my_holding_index;
746};
747
748template<bool oracle_, typename Value_, typename Index_>
749class DenseBlock : public tatami::DenseExtractor<oracle_, Value_, Index_> {
750public:
751 DenseBlock(
752 const Components& tdb_comp,
753 const std::string& attribute,
754 bool row,
755 Index_ target_dim_extent,
756 const std::string& target_dimname,
757 const Dimension& tdb_target_dim,
758 const std::string& non_target_dimname,
759 const Dimension& tdb_non_target_dim,
760 tiledb_datatype_t tdb_type,
761 tatami::MaybeOracle<oracle_, Index_> oracle,
762 Index_ block_start,
763 Index_ block_length,
764 const CacheParameters<oracle_, Index_>& cache_parameters,
765 [[maybe_unused]] bool needs_value, // for consistency with Sparse* constructors.
766 [[maybe_unused]] bool needs_index) :
767 my_core(
768 tdb_comp,
769 attribute,
770 row,
771 target_dim_extent,
772 target_dimname,
773 tdb_target_dim,
774 non_target_dimname,
775 tdb_non_target_dim,
776 tdb_type,
777 block_length,
778 std::move(oracle),
779 cache_parameters,
780 /* needs_value = */ true,
781 /* needs_index = */ true
782 ),
783 my_block_start(block_start),
784 my_block_length(block_length),
785 my_holding_value(block_length),
786 my_holding_index(block_length)
787 {}
788
789 const Value_* fetch(Index_ i, Value_* buffer) {
790 auto info = my_core.fetch_block(i, my_block_start, my_block_length);
791 const auto& work = my_core.get_workspace();
792 work.values.copy(info.first, info.second, my_holding_value.data());
793 work.non_target_indices.copy(info.first, info.second, my_core.get_tdb_non_target_dim(), my_holding_index.data());
794 std::fill_n(buffer, my_block_length, 0);
795 for (size_t i = 0; i < info.second; ++i) {
796 buffer[my_holding_index[i] - my_block_start] = my_holding_value[i];
797 }
798 return buffer;
799 }
800
801private:
802 SparseCore<oracle_, Index_> my_core;
803 Index_ my_block_start, my_block_length;
804 std::vector<Value_> my_holding_value;
805 std::vector<Index_> my_holding_index;
806};
807
808template<bool oracle_, typename Value_, typename Index_>
809class DenseIndex : public tatami::DenseExtractor<oracle_, Value_, Index_> {
810public:
811 DenseIndex(
812 const Components& tdb_comp,
813 const std::string& attribute,
814 bool row,
815 Index_ target_dim_extent,
816 const std::string& target_dimname,
817 const Dimension& tdb_target_dim,
818 const std::string& non_target_dimname,
819 const Dimension& tdb_non_target_dim,
820 tiledb_datatype_t tdb_type,
821 tatami::MaybeOracle<oracle_, Index_> oracle,
822 tatami::VectorPtr<Index_> indices_ptr,
823 const CacheParameters<oracle_, Index_>& cache_parameters,
824 [[maybe_unused]] bool needs_value, // for consistency with Sparse* constructors.
825 [[maybe_unused]] bool needs_index) :
826 my_core(
827 tdb_comp,
828 attribute,
829 row,
830 target_dim_extent,
831 target_dimname,
832 tdb_target_dim,
833 non_target_dimname,
834 tdb_non_target_dim,
835 tdb_type,
836 indices_ptr->size(),
837 std::move(oracle),
838 cache_parameters,
839 /* needs_value = */ true,
840 /* needs_index = */ true
841 ),
842 my_indices_ptr(std::move(indices_ptr)),
843 my_holding_value(my_indices_ptr->size()),
844 my_holding_index(my_indices_ptr->size())
845 {
846 const auto& indices = *my_indices_ptr;
847 if (!indices.empty()) {
848 auto idx_start = indices.front();
849 my_remapping.resize(indices.back() - idx_start + 1);
850 for (size_t j = 0, end = indices.size(); j < end; ++j) {
851 my_remapping[indices[j] - idx_start] = j;
852 }
853 }
854 }
855
856 const Value_* fetch(Index_ i, Value_* buffer) {
857 const auto& indices = *my_indices_ptr;
858
859 if (!indices.empty()) {
860 auto info = my_core.fetch_indices(i, indices);
861 const auto& work = my_core.get_workspace();
862 work.values.copy(info.first, info.second, my_holding_value.data());
863 work.non_target_indices.copy(info.first, info.second, my_core.get_tdb_non_target_dim(), my_holding_index.data());
864 auto idx_start = indices.front();
865 std::fill_n(buffer, indices.size(), 0);
866 for (size_t i = 0; i < info.second; ++i) {
867 buffer[my_remapping[my_holding_index[i] - idx_start]] = my_holding_value[i];
868 }
869 }
870
871 return buffer;
872 }
873
874private:
875 SparseCore<oracle_, Index_> my_core;
876 tatami::VectorPtr<Index_> my_indices_ptr;
877 std::vector<Index_> my_remapping;
878 std::vector<Value_> my_holding_value;
879 std::vector<Index_> my_holding_index;
880};
881
882}
901template<typename Value_, typename Index_>
902class SparseMatrix : public tatami::Matrix<Value_, Index_> {
903public:
910 SparseMatrix(const std::string& uri, std::string attribute, tiledb::Context ctx, const SparseMatrixOptions& options) : my_attribute(std::move(attribute)) {
911 initialize(uri, std::move(ctx), options);
912 }
913
919 SparseMatrix(const std::string& uri, std::string attribute, const SparseMatrixOptions& options) : my_attribute(std::move(attribute)) {
920 initialize(uri, false, options);
921 }
922
927 SparseMatrix(const std::string& uri, std::string attribute) : SparseMatrix(uri, std::move(attribute), SparseMatrixOptions()) {}
928
929private:
930 template<class PossibleContext_>
931 void initialize(const std::string& uri, PossibleContext_ ctx, const SparseMatrixOptions& options) {
932 serialize([&]() {
933 my_tdb_comp.reset(
934 [&]{
935 // If we have to create our own Context_ object, we do so inside the serialized
936 // section, rather than using a delegating constructor.
937 if constexpr(std::is_same<PossibleContext_, tiledb::Context>::value) {
938 return new SparseMatrix_internal::Components(std::move(ctx), uri);
939 } else {
940 return new SparseMatrix_internal::Components(uri);
941 }
942 }(),
943 [](SparseMatrix_internal::Components* ptr) {
944 // Serializing the deleter, for completeness's sake.
945 serialize([&]() {
946 delete ptr;
947 });
948 }
949 );
950
951 auto schema = my_tdb_comp->array.schema();
952 if (schema.array_type() != TILEDB_SPARSE) {
953 throw std::runtime_error("TileDB array should be sparse");
954 }
955 my_cell_order = schema.cell_order();
956
957 my_cache_size_in_bytes = options.maximum_cache_size;
958 my_require_minimum_cache = options.require_minimum_cache;
959
960 if (!schema.has_attribute(my_attribute)) {
961 throw std::runtime_error("no attribute '" + my_attribute + "' is present in the TileDB array");
962 }
963 auto attr = schema.attribute(my_attribute);
964 my_tdb_type = attr.type();
965
966 tiledb::Domain domain = schema.domain();
967 if (domain.ndim() != 2) {
968 throw std::runtime_error("TileDB array should have exactly two dimensions");
969 }
970
971 tiledb::Dimension first_dim = domain.dimension(0);
972 my_first_dimname = first_dim.name();
973 my_tdb_first_dim.reset(first_dim);
974 Index_ first_extent = my_tdb_first_dim.extent<Index_>();
975 Index_ first_tile = my_tdb_first_dim.tile<Index_>();
976 my_firstdim_stats = tatami_chunked::ChunkDimensionStats<Index_>(first_extent, first_tile);
977
978 tiledb::Dimension second_dim = domain.dimension(1);
979 my_second_dimname = second_dim.name();
980 my_tdb_second_dim.reset(second_dim);
981 Index_ second_extent = my_tdb_second_dim.extent<Index_>();
982 Index_ second_tile = my_tdb_second_dim.tile<Index_>();
983 my_seconddim_stats = tatami_chunked::ChunkDimensionStats<Index_>(second_extent, second_tile);
984
985 // Favoring extraction on the dimension that involves pulling out fewer chunks per dimension element.
986 auto tiles_per_firstdim = (second_extent / second_tile) + (second_extent % second_tile > 0);
987 auto tiles_per_seconddim = (first_extent / first_tile) + (first_extent % first_tile > 0);
988 my_prefer_firstdim = tiles_per_firstdim <= tiles_per_seconddim;
989 });
990 }
991
992private:
993 std::shared_ptr<SparseMatrix_internal::Components> my_tdb_comp;
994 tiledb_layout_t my_cell_order;
995 tiledb_datatype_t my_tdb_type;
996
997 std::string my_attribute;
998 size_t my_cache_size_in_bytes;
999 bool my_require_minimum_cache;
1000
1001 std::string my_first_dimname, my_second_dimname;
1002 SparseMatrix_internal::Dimension my_tdb_first_dim, my_tdb_second_dim;
1003 tatami_chunked::ChunkDimensionStats<Index_> my_firstdim_stats, my_seconddim_stats;
1004
1005 bool my_prefer_firstdim;
1006
1007private:
1008 Index_ nrow_internal() const {
1009 return my_firstdim_stats.dimension_extent;
1010 }
1011
1012 Index_ ncol_internal() const {
1013 return my_seconddim_stats.dimension_extent;
1014 }
1015
1016public:
1017 Index_ nrow() const {
1018 return nrow_internal();
1019 }
1020
1021 Index_ ncol() const {
1022 return ncol_internal();
1023 }
1024
1025 bool is_sparse() const {
1026 return true;
1027 }
1028
1029 double is_sparse_proportion() const {
1030 return 1;
1031 }
1032
1033 bool prefer_rows() const {
1034 return my_prefer_firstdim;
1035 }
1036
1037 double prefer_rows_proportion() const {
1038 return static_cast<double>(my_prefer_firstdim);
1039 }
1040
1041 bool uses_oracle(bool) const {
1042 // It won't necessarily be used, but if the cache is empty,
1043 // the oracle definitely _won't_ be used.
1044 return my_cache_size_in_bytes > 0;
1045 }
1046
1047private:
1048 template<
1049 bool oracle_,
1050 template<typename, typename> class Interface_,
1051 template<bool, typename, typename> class Extractor_,
1052 typename ... Args_
1053 >
1054 std::unique_ptr<Interface_<Value_, Index_> > populate(
1055 bool row,
1056 Index_ non_target_length,
1057 tatami::MaybeOracle<oracle_, Index_> oracle,
1058 const tatami::Options& opt,
1059 Args_&& ... args)
1060 const {
1061 const auto& target_dim_stats = (row ? my_firstdim_stats : my_seconddim_stats);
1062 const auto& target_dimname = (row ? my_first_dimname : my_second_dimname);
1063 const auto& non_target_dimname = (row ? my_second_dimname : my_first_dimname);
1064 const auto& tdb_target_dim = (row ? my_tdb_first_dim : my_tdb_second_dim);
1065 const auto& tdb_non_target_dim = (row ? my_tdb_second_dim : my_tdb_first_dim);
1066
1067 size_t nonzero_size = 0;
1068 if (opt.sparse_extract_value) {
1069 nonzero_size += ::tatami_tiledb::internal::determine_type_size(my_tdb_type);
1070 }
1071 if (opt.sparse_extract_index) {
1072 nonzero_size += ::tatami_tiledb::internal::determine_type_size(tdb_non_target_dim.type());
1073 }
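 // To illustrate the arithmetic: with a double-precision attribute (8 bytes) and
 // 32-bit non-target indices (4 bytes), nonzero_size is 12 bytes when both value
 // and index are requested, so the default 100000000-byte cache holds roughly 8
 // million non-zero elements (before adding the target indices in the oracular case).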
1074
1075 if constexpr(oracle_) {
1076 // Add the target index size because we always need it for bulk
1077 // reads in the oracular case. This is not needed in the
1078 // myopic case because we only read one slab at a time.
1079 nonzero_size += ::tatami_tiledb::internal::determine_type_size(tdb_target_dim.type());
1080
1081 SparseMatrix_internal::OracularCacheParameters<Index_> cache_params;
1082 cache_params.max_cache_size_in_elements = my_cache_size_in_bytes / nonzero_size;
1083
1084 // If we're asking for rows and the cell order is row-major or
1085 // we want columns and the cell order is column-major, each
1086 // element of the target dimension has its contents stored
1087 // contiguously in TileDB's data tiles and can be easily
1088 // extracted on an individual basis; thus each element is
1089 // considered a separate slab and we set the chunk_length to 1.
1090 //
1091 // Otherwise, it's likely that an element of the target
1092 // dimension will overlap multiple data tiles within each space
1093 // tile, so we might as well extract the entire space tile's
1094 // elements on the target dimension.
1095 cache_params.chunk_length = (row == (my_cell_order == TILEDB_ROW_MAJOR) ? 1 : target_dim_stats.chunk_length);
1096
1097 // Ensure that there's enough space for every dimension element.
1098 // If this can't be guaranteed, we set the cache to only be able to
1099 // hold a single dimension element. This is effectively the same as
1100 // not doing any caching at all, as a hypothetical SoloCore would
1101 // still need to allocate enough memory for a single dimension
1102 // element to create a buffer for the TileDB library.
1103 size_t max_slab_size = static_cast<size_t>(non_target_length) * cache_params.chunk_length; // cast to avoid overflow.
1104 if (my_require_minimum_cache) {
1105 cache_params.max_cache_size_in_elements = std::max(cache_params.max_cache_size_in_elements, max_slab_size);
1106 } else if (cache_params.max_cache_size_in_elements < max_slab_size) {
1107 cache_params.max_cache_size_in_elements = non_target_length;
1108 cache_params.chunk_length = 1;
1109 }
1110
1111 return std::make_unique<Extractor_<oracle_, Value_, Index_> >(
1112 *my_tdb_comp,
1113 my_attribute,
1114 row,
1115 target_dim_stats.dimension_extent,
1116 target_dimname,
1117 tdb_target_dim,
1118 non_target_dimname,
1119 tdb_non_target_dim,
1120 my_tdb_type,
1121 std::move(oracle),
1122 std::forward<Args_>(args)...,
1123 cache_params,
1124 opt.sparse_extract_value,
1125 opt.sparse_extract_index
1126 );
1127
1128 } else {
1129 tatami_chunked::SlabCacheStats raw_params(
1130 target_dim_stats.chunk_length,
1131 non_target_length,
1132 target_dim_stats.num_chunks,
1133 my_cache_size_in_bytes,
1134 nonzero_size,
1135 my_require_minimum_cache
1136 );
1137
1138 // No need to have a dedicated SoloCore for uncached extraction,
1139 // because it would still need to hold a single Workspace. We
1140 // instead reuse the MyopicCore's code with a chunk length of 1 to
1141 // achieve the same memory usage. This has a mild perf hit from the
1142 // LRU but perf already sucks without caching so who cares.
1143 SparseMatrix_internal::MyopicCacheParameters<Index_> cache_params;
1144 if (raw_params.max_slabs_in_cache > 0) {
1145 cache_params.chunk_length = target_dim_stats.chunk_length;
1146 cache_params.slab_size_in_elements = raw_params.slab_size_in_elements;
1147 cache_params.max_slabs_in_cache = raw_params.max_slabs_in_cache;
1148 } else {
1149 cache_params.chunk_length = 1;
1150 cache_params.slab_size_in_elements = non_target_length;
1151 cache_params.max_slabs_in_cache = 1;
1152 }
1153
1154 return std::make_unique<Extractor_<oracle_, Value_, Index_> >(
1155 *my_tdb_comp,
1156 my_attribute,
1157 row,
1158 target_dim_stats.dimension_extent,
1159 target_dimname,
1160 tdb_target_dim,
1161 non_target_dimname,
1162 tdb_non_target_dim,
1163 my_tdb_type,
1164 std::move(oracle),
1165 std::forward<Args_>(args)...,
1166 cache_params,
1167 opt.sparse_extract_value,
1168 opt.sparse_extract_index
1169 );
1170 }
1171 }
1172
1173 static tatami::Options set_extract_all(tatami::Options opt) {
1174 // Enforcing both of these options so that the slab sizes are estimated
1175 // correctly for dense extractors, regardless of 'opt'.
1176 opt.sparse_extract_value = true;
1177 opt.sparse_extract_index = true;
1178 return opt;
1179 }
1180
1181 /********************
1182 *** Myopic dense ***
1183 ********************/
1184public:
1185 std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(bool row, const tatami::Options& opt) const {
1186 Index_ full_non_target = (row ? ncol_internal() : nrow_internal());
1187 return populate<false, tatami::MyopicDenseExtractor, SparseMatrix_internal::DenseFull>(row, full_non_target, false, set_extract_all(opt), full_non_target);
1188 }
1189
1190 std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(bool row, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
1191 return populate<false, tatami::MyopicDenseExtractor, SparseMatrix_internal::DenseBlock>(row, block_length, false, set_extract_all(opt), block_start, block_length);
1192 }
1193
1194 std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(bool row, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
1195 auto nidx = indices_ptr->size();
1196 return populate<false, tatami::MyopicDenseExtractor, SparseMatrix_internal::DenseIndex>(row, nidx, false, set_extract_all(opt), std::move(indices_ptr));
1197 }
1198
1199 /*********************
1200 *** Myopic sparse ***
1201 *********************/
1202public:
1203 std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(bool row, const tatami::Options& opt) const {
1204 Index_ full_non_target = (row ? ncol_internal() : nrow_internal());
1205 return populate<false, tatami::MyopicSparseExtractor, SparseMatrix_internal::SparseFull>(row, full_non_target, false, opt, full_non_target);
1206 }
1207
1208 std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(bool row, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
1209 return populate<false, tatami::MyopicSparseExtractor, SparseMatrix_internal::SparseBlock>(row, block_length, false, opt, block_start, block_length);
1210 }
1211
1212 std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(bool row, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
1213 auto nidx = indices_ptr->size();
1214 return populate<false, tatami::MyopicSparseExtractor, SparseMatrix_internal::SparseIndex>(row, nidx, false, opt, std::move(indices_ptr));
1215 }
1216
1217 /**********************
1218 *** Oracular dense ***
1219 **********************/
1220public:
1221 std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(
1222 bool row,
1223 std::shared_ptr<const tatami::Oracle<Index_> > oracle,
1224 const tatami::Options& opt)
1225 const {
1226 Index_ full_non_target = (row ? ncol_internal() : nrow_internal());
1227 return populate<true, tatami::OracularDenseExtractor, SparseMatrix_internal::DenseFull>(row, full_non_target, std::move(oracle), set_extract_all(opt), full_non_target);
1228 }
1229
1230 std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(
1231 bool row,
1232 std::shared_ptr<const tatami::Oracle<Index_> > oracle,
1233 Index_ block_start,
1234 Index_ block_length,
1235 const tatami::Options& opt)
1236 const {
1237 return populate<true, tatami::OracularDenseExtractor, SparseMatrix_internal::DenseBlock>(row, block_length, std::move(oracle), set_extract_all(opt), block_start, block_length);
1238 }
1239
1240 std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(
1241 bool row,
1242 std::shared_ptr<const tatami::Oracle<Index_> > oracle,
1243 tatami::VectorPtr<Index_> indices_ptr,
1244 const tatami::Options& opt)
1245 const {
1246 auto nidx = indices_ptr->size();
1247 return populate<true, tatami::OracularDenseExtractor, SparseMatrix_internal::DenseIndex>(row, nidx, std::move(oracle), set_extract_all(opt), std::move(indices_ptr));
1248 }
1249
1250 /***********************
1251 *** Oracular sparse ***
1252 ***********************/
1253public:
1254 std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(
1255 bool row,
1256 std::shared_ptr<const tatami::Oracle<Index_> > oracle,
1257 const tatami::Options& opt)
1258 const {
1259 Index_ full_non_target = (row ? ncol_internal() : nrow_internal());
1260 return populate<true, tatami::OracularSparseExtractor, SparseMatrix_internal::SparseFull>(row, full_non_target, std::move(oracle), opt, full_non_target);
1261 }
1262
1263 std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(
1264 bool row,
1265 std::shared_ptr<const tatami::Oracle<Index_> > oracle,
1266 Index_ block_start,
1267 Index_ block_length,
1268 const tatami::Options& opt)
1269 const {
1270 return populate<true, tatami::OracularSparseExtractor, SparseMatrix_internal::SparseBlock>(row, block_length, std::move(oracle), opt, block_start, block_length);
1271 }
1272
1273 std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(
1274 bool row,
1275 std::shared_ptr<const tatami::Oracle<Index_> > oracle,
1276 tatami::VectorPtr<Index_> indices_ptr,
1277 const tatami::Options& opt)
1278 const {
1279 auto nidx = indices_ptr->size();
1280 return populate<true, tatami::OracularSparseExtractor, SparseMatrix_internal::SparseIndex>(row, nidx, std::move(oracle), opt, std::move(indices_ptr));
1281 }
1282};
1283
1284}
1285
1286#endif
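Example usage, as a minimal sketch: the array path, attribute name and element types below are placeholders, and the umbrella header path assumes the usual tatami_tiledb installation layout.

#include "tatami_tiledb/tatami_tiledb.hpp"
#include <vector>
#include <iostream>

int main() {
    // Open a 2-dimensional sparse TileDB array as a double-precision tatami matrix.
    tatami_tiledb::SparseMatrix<double, int> mat("my_matrix", "counts");

    // Dense extraction of the first row.
    auto dwork = mat.dense(/* row = */ true, tatami::Options());
    std::vector<double> dbuffer(mat.ncol());
    const double* dptr = dwork->fetch(0, dbuffer.data());
    std::cout << "first value: " << dptr[0] << "\n";

    // Sparse extraction of the first row.
    auto swork = mat.sparse(/* row = */ true, tatami::Options());
    std::vector<double> vbuffer(mat.ncol());
    std::vector<int> ibuffer(mat.ncol());
    auto range = swork->fetch(0, vbuffer.data(), ibuffer.data());
    std::cout << "non-zeros in first row: " << range.number << "\n";

    return 0;
}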