tatami_tiledb
tatami bindings for TileDB-backed matrices
SparseMatrix.hpp
1#ifndef TATAMI_TILEDB_SPARSE_MATRIX_HPP
2#define TATAMI_TILEDB_SPARSE_MATRIX_HPP
3
4#include "tatami_chunked/tatami_chunked.hpp"
5#include <tiledb/tiledb>
6
7#include "serialize.hpp"
8
9#include <string>
10#include <memory>
11#include <vector>
12#include <stdexcept>
13#include <type_traits>
14
20namespace tatami_tiledb {
21
27struct SparseMatrixOptions {
37 size_t maximum_cache_size = 100000000;
38
44 bool require_minimum_cache = true;
45};
46
50namespace SparseMatrix_internal {
51
52typedef ::tatami_tiledb::internal::Components Components;
53typedef ::tatami_tiledb::internal::VariablyTypedDimension Dimension;
54typedef ::tatami_tiledb::internal::VariablyTypedVector CacheBuffer;
55
56struct Workspace {
57 CacheBuffer values;
58 CacheBuffer target_indices;
59 CacheBuffer non_target_indices;
60};
61
62inline size_t execute_query(
63 const Components& tdb_comp,
64 tiledb::Subarray& subarray,
65 const std::string& attribute,
66 bool row,
67 const std::string& target_dimname,
68 const std::string& non_target_dimname,
69 Workspace& work,
70 size_t general_work_offset,
71 size_t target_index_work_offset,
72 size_t work_length,
73 bool needs_value,
74 bool needs_index)
75{
76 tiledb::Query query(tdb_comp.ctx, tdb_comp.array);
77 query.set_subarray(subarray);
78 query.set_layout(row ? TILEDB_ROW_MAJOR : TILEDB_COL_MAJOR);
79
80 work.target_indices.set_data_buffer(query, target_dimname, target_index_work_offset, work_length);
81 if (needs_value) {
82 work.values.set_data_buffer(query, attribute, general_work_offset, work_length);
83 }
84 if (needs_index) {
85 work.non_target_indices.set_data_buffer(query, non_target_dimname, general_work_offset, work_length);
86 }
87
88 if (query.submit() != tiledb::Query::Status::COMPLETE) {
89 throw std::runtime_error("failed to read sparse data from TileDB");
90 }
91
92 return query.result_buffer_elements()[target_dimname].second;
93}
94
95/********************
96 *** Core classes ***
97 ********************/
98
99template<typename Index_>
100struct MyopicCacheParameters {
101 Index_ chunk_length;
102 size_t slab_size_in_elements;
103 size_t max_slabs_in_cache;
104};
105
106template<typename Index_>
107class MyopicCore {
108public:
109 MyopicCore(
110 const Components& tdb_comp,
111 const std::string& attribute,
112 bool row,
113 Index_ target_dim_extent,
114 const std::string& target_dimname,
115 const Dimension& tdb_target_dim,
116 const std::string& non_target_dimname,
117 const Dimension& tdb_non_target_dim,
118 tiledb_datatype_t tdb_type,
119 [[maybe_unused]] Index_ non_target_length, // provided for consistency with the other constructors.
120 [[maybe_unused]] tatami::MaybeOracle<false, Index_> oracle,
121 const MyopicCacheParameters<Index_>& cache_stats,
122 bool needs_value,
123 bool needs_index) :
124 my_tdb_comp(tdb_comp),
125 my_attribute(attribute),
126 my_row(row),
127 my_target_dim_extent(target_dim_extent),
128 my_tdb_target_dim(tdb_target_dim),
129 my_target_dimname(target_dimname),
130 my_tdb_non_target_dim(tdb_non_target_dim),
131 my_non_target_dimname(non_target_dimname),
132 my_target_chunk_length(cache_stats.chunk_length),
133 my_slab_size(cache_stats.slab_size_in_elements),
134 my_needs_value(needs_value),
135 my_needs_index(needs_index),
136 my_cache(cache_stats.max_slabs_in_cache)
137 {
138 // Only storing one slab at a time for the target indices.
139 my_work.target_indices.reset(my_tdb_target_dim.type(), my_slab_size);
140
141 size_t total_cache_size = my_slab_size * cache_stats.max_slabs_in_cache;
142 if (my_needs_value) {
143 my_work.values.reset(tdb_type, total_cache_size);
144 }
145 if (my_needs_index) {
146 my_work.non_target_indices.reset(my_tdb_non_target_dim.type(), total_cache_size);
147 }
148 }
149
150private:
151 const Components& my_tdb_comp;
152 const std::string& my_attribute;
153
154 bool my_row;
155 Index_ my_target_dim_extent;
156 const Dimension& my_tdb_target_dim;
157 const std::string& my_target_dimname;
158 const Dimension& my_tdb_non_target_dim;
159 const std::string& my_non_target_dimname;
160
161 Index_ my_target_chunk_length;
162 size_t my_slab_size;
163 bool my_needs_value;
164 bool my_needs_index;
165 Workspace my_work;
166 std::vector<std::pair<Index_, Index_> > my_counts;
167
168 struct Slab {
169 size_t offset;
170 std::vector<size_t> indptrs;
171 };
172 size_t my_offset = 0;
173 tatami_chunked::LruSlabCache<Index_, Slab> my_cache;
174
175private:
176 template<class Configure_>
177 std::pair<size_t, size_t> fetch_raw(Index_ i, Configure_ configure) {
178 Index_ chunk = i / my_target_chunk_length;
179 Index_ index = i % my_target_chunk_length;
180
181 const auto& info = my_cache.find(
182 chunk,
183 /* create = */ [&]() -> Slab {
184 Slab output;
185 output.offset = my_offset;
186 my_offset += my_slab_size;
187 return output;
188 },
189 /* populate = */ [&](Index_ id, Slab& contents) -> void {
190 Index_ chunk_start = id * my_target_chunk_length;
191 Index_ chunk_length = std::min(my_target_dim_extent - chunk_start, my_target_chunk_length);
192
193 size_t num_nonzero = 0;
194 serialize([&]() -> void {
195 tiledb::Subarray subarray(my_tdb_comp.ctx, my_tdb_comp.array);
196 int rowdex = my_row;
197 my_tdb_target_dim.add_range(subarray, 1 - rowdex, chunk_start, chunk_length);
198 configure(subarray, rowdex);
199 num_nonzero = execute_query(
200 my_tdb_comp,
201 subarray,
202 my_attribute,
203 my_row,
204 my_target_dimname,
205 my_non_target_dimname,
206 my_work,
207 contents.offset,
208 0,
209 my_slab_size,
210 my_needs_value,
211 my_needs_index
212 );
213 });
214
215 auto& indptrs = contents.indptrs;
216 indptrs.clear();
217 indptrs.resize(chunk_length + 1);
218
219 if (num_nonzero) {
220 my_work.target_indices.compact(0, num_nonzero, my_tdb_target_dim, my_counts);
221 for (const auto& cnts : my_counts) {
222 indptrs[cnts.first - chunk_start + 1] = cnts.second;
223 }
224 for (Index_ i = 1; i <= chunk_length; ++i) {
225 indptrs[i] += indptrs[i - 1];
226 }
227 }
228 }
229 );
230
231 auto start = info.indptrs[index];
232 return std::make_pair(info.offset + start, info.indptrs[index + 1] - start);
233 }
234
235public:
236 std::pair<size_t, size_t> fetch_block(Index_ i, Index_ block_start, Index_ block_length) {
237 return fetch_raw(
238 i,
239 [&](tiledb::Subarray& subarray, int rowdex) -> void {
240 my_tdb_non_target_dim.add_range(subarray, rowdex, block_start, block_length);
241 }
242 );
243 }
244
245 std::pair<size_t, size_t> fetch_indices(Index_ i, const std::vector<Index_>& indices) {
246 return fetch_raw(
247 i,
248   [&](tiledb::Subarray& subarray, int rowdex) -> void {
249    tatami::process_consecutive_indices<Index_>(
250 indices.data(),
251 indices.size(),
252 [&](Index_ s, Index_ l) -> void {
253 my_tdb_non_target_dim.add_range(subarray, rowdex, s, l);
254 }
255 );
256 }
257 );
258 }
259
260public:
261 const Workspace& get_workspace() const {
262 return my_work;
263 }
264
265 bool get_needs_value() const {
266 return my_needs_value;
267 }
268
269 bool get_needs_index() const {
270 return my_needs_index;
271 }
272
273 const Dimension& get_tdb_non_target_dim() const {
274 return my_tdb_non_target_dim;
275 }
276};
277
278// The general idea with the oracular extractors is to either:
279//
280// - Extract each target dimension element directly, if the cell order within each tile corresponds to the desired target dimension (i.e., 'row').
281// - Extract the tile-wise chunk of target dimension elements, if the cell order within each tile is not the same as the target dimension.
282//
283// This means that we need to set the chunk length of each slab to either 1 or the tile extent, depending on the cell order of the TileDB array.
284// In addition, we use a variable slab cache that adjusts to the number of non-zero elements in each slab.
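//
// For example, consider an array with 100x100 space tiles and TILEDB_ROW_MAJOR cell order:
// a row-oriented oracular extractor would use a chunk length of 1 so that each slab holds the
// non-zeros of a single row, whereas a column-oriented extractor on the same array would use
// a chunk length of 100 so that each slab spans a whole space tile's worth of columns
// (see the chunk_length selection in SparseMatrix::populate() below).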
285
286template<typename Index_>
287struct OracularCacheParameters {
288 Index_ chunk_length;
289 size_t max_cache_size_in_elements;
290};
291
292template<typename Index_>
293class OracularCore {
294public:
295 OracularCore(
296 const Components& tdb_comp,
297 const std::string& attribute,
298 bool row,
299 Index_ target_dim_extent,
300 const std::string& target_dimname,
301 const Dimension& tdb_target_dim,
302 const std::string& non_target_dimname,
303 const Dimension& tdb_non_target_dim,
304 tiledb_datatype_t tdb_type,
305  Index_ non_target_length,
306  tatami::MaybeOracle<true, Index_> oracle,
307 const OracularCacheParameters<Index_>& cache_stats,
308 bool needs_value,
309 bool needs_index) :
310 my_tdb_comp(tdb_comp),
311 my_attribute(attribute),
312 my_row(row),
313 my_target_dim_extent(target_dim_extent),
314 my_tdb_target_dim(tdb_target_dim),
315 my_target_dimname(target_dimname),
316 my_tdb_non_target_dim(tdb_non_target_dim),
317 my_non_target_dimname(non_target_dimname),
318 my_target_chunk_length(cache_stats.chunk_length),
319 my_max_slab_size(static_cast<size_t>(non_target_length) * my_target_chunk_length),
320 my_needs_value(needs_value),
321 my_needs_index(needs_index),
322 my_cache(std::move(oracle), cache_stats.max_cache_size_in_elements)
323 {
324 my_work.target_indices.reset(my_tdb_target_dim.type(), cache_stats.max_cache_size_in_elements);
325 if (my_needs_value) {
326 my_work.values.reset(tdb_type, cache_stats.max_cache_size_in_elements);
327 }
328 if (my_needs_index) {
329 my_work.non_target_indices.reset(my_tdb_non_target_dim.type(), cache_stats.max_cache_size_in_elements);
330 }
331 }
332
333private:
334 const Components& my_tdb_comp;
335 const std::string& my_attribute;
336
337 bool my_row;
338 Index_ my_target_dim_extent;
339 const Dimension& my_tdb_target_dim;
340 const std::string& my_target_dimname;
341 const Dimension& my_tdb_non_target_dim;
342 const std::string& my_non_target_dimname;
343
344 Index_ my_target_chunk_length;
345 size_t my_max_slab_size;
346 bool my_needs_value;
347 bool my_needs_index;
348 Workspace my_work;
349 std::vector<std::pair<Index_, Index_> > my_counts;
350
351 struct Slab {
352 size_t offset;
353 std::vector<size_t> indptrs;
354 };
355 tatami_chunked::OracularVariableSlabCache<Index_, Index_, Slab, size_t> my_cache;
356
357private:
358 template<class Function_>
359 static void sort_by_field(std::vector<std::pair<Index_, size_t> >& indices, Function_ field) {
360 auto comp = [&field](const std::pair<Index_, size_t>& l, const std::pair<Index_, size_t>& r) -> bool {
361 return field(l) < field(r);
362 };
363 if (!std::is_sorted(indices.begin(), indices.end(), comp)) {
364 std::sort(indices.begin(), indices.end(), comp);
365 }
366 }
367
368 template<class Configure_>
369 std::pair<size_t, size_t> fetch_raw([[maybe_unused]] Index_ i, Configure_ configure) {
370 auto info = my_cache.next(
371 /* identify = */ [&](Index_ current) -> std::pair<Index_, Index_> {
372 return std::pair<Index_, Index_>(current / my_target_chunk_length, current % my_target_chunk_length);
373 },
374 /* upper_size = */ [&](Index_) -> size_t {
375 return my_max_slab_size;
376 },
377 /* actual_size = */ [&](Index_, const Slab& slab) -> size_t {
378 return slab.indptrs.back();
379 },
380 /* create = */ [&]() -> Slab {
381 return Slab();
382 },
383 /* populate = */ [&](std::vector<std::pair<Index_, size_t> >& to_populate, std::vector<std::pair<Index_, size_t> >& to_reuse, std::vector<Slab>& all_slabs) -> void {
384 // Defragmenting the existing chunks. We sort by offset to make
385 // sure that we're not clobbering in-use slabs during the copy().
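    // For example, reusable slabs at offsets {0, 200, 450} holding {100, 50, 30} non-zeros
    // are shifted down to offsets {0, 100, 150}, leaving a contiguous free region from
    // offset 180 onwards for the newly populated slabs.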
386 sort_by_field(to_reuse, [&](const std::pair<Index_, size_t>& x) -> size_t { return all_slabs[x.second].offset; });
387 size_t running_offset = 0;
388 for (auto& x : to_reuse) {
389 auto& reused_slab = all_slabs[x.second];
390 auto& cur_offset = reused_slab.offset;
391 auto num_nonzero = reused_slab.indptrs.back();
392 if (cur_offset != running_offset) {
393 if (my_needs_value) {
394 my_work.values.shift(cur_offset, num_nonzero, running_offset);
395 }
396 if (my_needs_index) {
397 my_work.non_target_indices.shift(cur_offset, num_nonzero, running_offset);
398 }
399 cur_offset = running_offset;
400 }
401 running_offset += num_nonzero;
402 }
403
404 // Collapsing runs of consecutive ranges into a single range;
405    // otherwise, taking the union of the ranges. This allows a single TileDB call
406 // to populate the contiguous memory pool that we made available after
407 // defragmentation; then we just update the slab pointers to refer
408 // to the slices of memory corresponding to each slab.
409 sort_by_field(to_populate, [](const std::pair<Index_, size_t>& x) -> Index_ { return x.first; });
410
411 size_t num_nonzero = 0;
412 serialize([&]() -> void {
413 tiledb::Subarray subarray(my_tdb_comp.ctx, my_tdb_comp.array);
414 int rowdex = my_row;
415 configure(subarray, rowdex);
416
417 Index_ run_chunk_id = to_populate.front().first;
418 Index_ run_chunk_start = run_chunk_id * my_target_chunk_length;
419 Index_ run_length = std::min(my_target_dim_extent - run_chunk_start, my_target_chunk_length);
420
421 int dimdex = 1 - rowdex;
422 for (size_t ci = 1, cend = to_populate.size(); ci < cend; ++ci) {
423 Index_ current_chunk_id = to_populate[ci].first;
424 Index_ current_chunk_start = current_chunk_id * my_target_chunk_length;
425
426 if (current_chunk_id - run_chunk_id > 1) { // save the existing run of to_populate as one range, and start a new run.
427 my_tdb_target_dim.add_range(subarray, dimdex, run_chunk_start, run_length);
428 run_chunk_id = current_chunk_id;
429 run_chunk_start = current_chunk_start;
430 run_length = 0;
431 }
432
433 run_length += std::min(my_target_dim_extent - current_chunk_start, my_target_chunk_length);
434 }
435
436 my_tdb_target_dim.add_range(subarray, dimdex, run_chunk_start, run_length);
437 num_nonzero = execute_query(
438 my_tdb_comp,
439 subarray,
440 my_attribute,
441 my_row,
442 my_target_dimname,
443 my_non_target_dimname,
444 my_work,
445 running_offset,
446 running_offset,
447 to_populate.size() * my_max_slab_size,
448 my_needs_value,
449 my_needs_index
450 );
451 });
452
453 my_work.target_indices.compact(running_offset, num_nonzero, my_tdb_target_dim, my_counts);
454
455 auto cIt = my_counts.begin(), cEnd = my_counts.end();
456 for (auto& si : to_populate) {
457 auto& populate_slab = all_slabs[si.second];
458 populate_slab.offset = running_offset;
459
460 Index_ chunk_start = si.first * my_target_chunk_length;
461 Index_ chunk_length = std::min(my_target_dim_extent - chunk_start, my_target_chunk_length);
462 Index_ chunk_end = chunk_start + chunk_length;
463
464 auto& slab_indptrs = populate_slab.indptrs;
465 slab_indptrs.clear();
466 slab_indptrs.resize(chunk_length + 1);
467
468 while (cIt != cEnd && cIt->first < chunk_end) {
469 slab_indptrs[cIt->first - chunk_start + 1] = cIt->second;
470 ++cIt;
471 }
472
473 for (Index_ i = 1; i <= chunk_length; ++i) {
474 slab_indptrs[i] += slab_indptrs[i - 1];
475 }
476 running_offset += slab_indptrs.back();
477 }
478 }
479 );
480
481 const auto& indptrs = info.first->indptrs;
482 auto start = indptrs[info.second];
483 return std::make_pair(info.first->offset + start, indptrs[info.second + 1] - start);
484 }
485
486public:
487 std::pair<size_t, size_t> fetch_block(Index_ i, Index_ block_start, Index_ block_length) {
488 return fetch_raw(
489 i,
490 [&](tiledb::Subarray& subarray, int rowdex) -> void {
491 my_tdb_non_target_dim.add_range(subarray, rowdex, block_start, block_length);
492 }
493 );
494 }
495
496 std::pair<size_t, size_t> fetch_indices(Index_ i, const std::vector<Index_>& indices) {
497 return fetch_raw(
498 i,
499   [&](tiledb::Subarray& subarray, int rowdex) -> void {
500    tatami::process_consecutive_indices<Index_>(
501 indices.data(),
502 indices.size(),
503 [&](Index_ s, Index_ l) -> void {
504 my_tdb_non_target_dim.add_range(subarray, rowdex, s, l);
505 }
506 );
507 }
508 );
509 }
510
511public:
512 const Workspace& get_workspace() const {
513 return my_work;
514 }
515
516 bool get_needs_value() const {
517 return my_needs_value;
518 }
519
520 bool get_needs_index() const {
521 return my_needs_index;
522 }
523
524 const Dimension& get_tdb_non_target_dim() const {
525 return my_tdb_non_target_dim;
526 }
527};
528
529template<bool oracle_, typename Index_>
530using SparseCore = typename std::conditional<oracle_, OracularCore<Index_>, MyopicCore<Index_> >::type;
531
532template<bool oracle_, typename Index_>
533using CacheParameters = typename std::conditional<oracle_, OracularCacheParameters<Index_>, MyopicCacheParameters<Index_> >::type;
534
535/*************************
536 *** Sparse subclasses ***
537 *************************/
538
539template<typename Value_, typename Index_>
540tatami::SparseRange<Value_, Index_> fill_sparse_range(
541 const Workspace& work,
542 size_t work_start,
543 size_t work_length,
544 const Dimension& non_target_dim,
545 Value_* vbuffer,
546 Index_* ibuffer,
547 bool needs_value,
548 bool needs_index)
549{
550 tatami::SparseRange<Value_, Index_> output;
551 output.number = work_length;
552 if (needs_value) {
553 work.values.copy(work_start, work_length, vbuffer);
554 output.value = vbuffer;
555 }
556 if (needs_index) {
557 work.non_target_indices.copy(work_start, work_length, non_target_dim, ibuffer);
558 output.index = ibuffer;
559 }
560 return output;
561}
562
563template<bool oracle_, typename Value_, typename Index_>
564class SparseFull : public tatami::SparseExtractor<oracle_, Value_, Index_> {
565public:
566 SparseFull(
567 const Components& tdb_comp,
568 const std::string& attribute,
569 bool row,
570 Index_ target_dim_extent,
571 const std::string& target_dimname,
572 const Dimension& tdb_target_dim,
573 const std::string& non_target_dimname,
574 const Dimension& tdb_non_target_dim,
575  tiledb_datatype_t tdb_type,
576  tatami::MaybeOracle<oracle_, Index_> oracle,
577 Index_ non_target_dim,
578 const CacheParameters<oracle_, Index_>& cache_parameters,
579 bool needs_value,
580 bool needs_index) :
581 my_core(
582 tdb_comp,
583 attribute,
584 row,
585 target_dim_extent,
586 target_dimname,
587 tdb_target_dim,
588 non_target_dimname,
589 tdb_non_target_dim,
590 tdb_type,
591 non_target_dim,
592 std::move(oracle),
593 cache_parameters,
594 needs_value,
595 needs_index
596 ),
597 my_non_target_dim(non_target_dim)
598 {}
599
600 tatami::SparseRange<Value_, Index_> fetch(Index_ i, Value_* vbuffer, Index_* ibuffer) {
601 auto info = my_core.fetch_block(i, 0, my_non_target_dim);
602 return fill_sparse_range(my_core.get_workspace(), info.first, info.second, my_core.get_tdb_non_target_dim(), vbuffer, ibuffer, my_core.get_needs_value(), my_core.get_needs_index());
603 }
604
605private:
606 SparseCore<oracle_, Index_> my_core;
607 Index_ my_non_target_dim;
608};
609
610template<bool oracle_, typename Value_, typename Index_>
611class SparseBlock : public tatami::SparseExtractor<oracle_, Value_, Index_> {
612public:
613 SparseBlock(
614 const Components& tdb_comp,
615 const std::string& attribute,
616 bool row,
617 Index_ target_dim_extent,
618 const std::string& target_dimname,
619 const Dimension& tdb_target_dim,
620 const std::string& non_target_dimname,
621 const Dimension& tdb_non_target_dim,
622  tiledb_datatype_t tdb_type,
623  tatami::MaybeOracle<oracle_, Index_> oracle,
624 Index_ block_start,
625 Index_ block_length,
626 const CacheParameters<oracle_, Index_>& cache_parameters,
627 bool needs_value,
628 bool needs_index) :
629 my_core(
630 tdb_comp,
631 attribute,
632 row,
633 target_dim_extent,
634 target_dimname,
635 tdb_target_dim,
636 non_target_dimname,
637 tdb_non_target_dim,
638 tdb_type,
639 block_length,
640 std::move(oracle),
641 cache_parameters,
642 needs_value,
643 needs_index
644 ),
645 my_block_start(block_start),
646 my_block_length(block_length)
647 {}
648
649 tatami::SparseRange<Value_, Index_> fetch(Index_ i, Value_* vbuffer, Index_* ibuffer) {
650 auto info = my_core.fetch_block(i, my_block_start, my_block_length);
651 return fill_sparse_range(my_core.get_workspace(), info.first, info.second, my_core.get_tdb_non_target_dim(), vbuffer, ibuffer, my_core.get_needs_value(), my_core.get_needs_index());
652 }
653
654private:
655 SparseCore<oracle_, Index_> my_core;
656 Index_ my_block_start, my_block_length;
657};
658
659template<bool oracle_, typename Value_, typename Index_>
660class SparseIndex : public tatami::SparseExtractor<oracle_, Value_, Index_> {
661public:
662 SparseIndex(
663 const Components& tdb_comp,
664 const std::string& attribute,
665 bool row,
666 Index_ target_dim_extent,
667 const std::string& target_dimname,
668 const Dimension& tdb_target_dim,
669 const std::string& non_target_dimname,
670 const Dimension& tdb_non_target_dim,
671  tiledb_datatype_t tdb_type,
672  tatami::MaybeOracle<oracle_, Index_> oracle,
673 tatami::VectorPtr<Index_> indices_ptr,
674 const CacheParameters<oracle_, Index_>& cache_parameters,
675 bool needs_value,
676 bool needs_index) :
677 my_core(
678 tdb_comp,
679 attribute,
680 row,
681 target_dim_extent,
682 target_dimname,
683 tdb_target_dim,
684 non_target_dimname,
685 tdb_non_target_dim,
686 tdb_type,
687 indices_ptr->size(),
688 std::move(oracle),
689 cache_parameters,
690 needs_value,
691 needs_index
692 ),
693 my_indices_ptr(std::move(indices_ptr))
694 {}
695
696 tatami::SparseRange<Value_, Index_> fetch(Index_ i, Value_* vbuffer, Index_* ibuffer) {
697 auto info = my_core.fetch_indices(i, *my_indices_ptr);
698 return fill_sparse_range(my_core.get_workspace(), info.first, info.second, my_core.get_tdb_non_target_dim(), vbuffer, ibuffer, my_core.get_needs_value(), my_core.get_needs_index());
699 }
700
701private:
702 SparseCore<oracle_, Index_> my_core;
703 tatami::VectorPtr<Index_> my_indices_ptr;
704};
705
706/************************
707 *** Dense subclasses ***
708 ************************/
709
710template<bool oracle_, typename Value_, typename Index_>
711class DenseFull : public tatami::DenseExtractor<oracle_, Value_, Index_> {
712public:
713 DenseFull(
714 const Components& tdb_comp,
715 const std::string& attribute,
716 bool row,
717 Index_ target_dim_extent,
718 const std::string& target_dimname,
719 const Dimension& tdb_target_dim,
720 const std::string& non_target_dimname,
721 const Dimension& tdb_non_target_dim,
722  tiledb_datatype_t tdb_type,
723  tatami::MaybeOracle<oracle_, Index_> oracle,
724 Index_ non_target_dim_extent,
725 const CacheParameters<oracle_, Index_>& cache_parameters,
726 [[maybe_unused]] bool needs_value, // for consistency with Sparse* constructors.
727 [[maybe_unused]] bool needs_index) :
728 my_core(
729 tdb_comp,
730 attribute,
731 row,
732 target_dim_extent,
733 target_dimname,
734 tdb_target_dim,
735 non_target_dimname,
736 tdb_non_target_dim,
737 tdb_type,
738 non_target_dim_extent,
739 std::move(oracle),
740 cache_parameters,
741 /* needs_value = */ true,
742 /* needs_index = */ true
743 ),
744 my_non_target_dim_extent(non_target_dim_extent),
745 my_holding_value(my_non_target_dim_extent),
746 my_holding_index(my_non_target_dim_extent)
747 {}
748
749 const Value_* fetch(Index_ i, Value_* buffer) {
750 auto info = my_core.fetch_block(i, 0, my_non_target_dim_extent);
751 const auto& work = my_core.get_workspace();
752 work.values.copy(info.first, info.second, my_holding_value.data());
753 work.non_target_indices.copy(info.first, info.second, my_core.get_tdb_non_target_dim(), my_holding_index.data());
754 std::fill_n(buffer, my_non_target_dim_extent, 0);
755 for (size_t i = 0; i < info.second; ++i) {
756 buffer[my_holding_index[i]] = my_holding_value[i];
757 }
758 return buffer;
759 }
760
761private:
762 SparseCore<oracle_, Index_> my_core;
763 Index_ my_non_target_dim_extent;
764 std::vector<Value_> my_holding_value;
765 std::vector<Index_> my_holding_index;
766};
767
768template<bool oracle_, typename Value_, typename Index_>
769class DenseBlock : public tatami::DenseExtractor<oracle_, Value_, Index_> {
770public:
771 DenseBlock(
772 const Components& tdb_comp,
773 const std::string& attribute,
774 bool row,
775 Index_ target_dim_extent,
776 const std::string& target_dimname,
777 const Dimension& tdb_target_dim,
778 const std::string& non_target_dimname,
779 const Dimension& tdb_non_target_dim,
780  tiledb_datatype_t tdb_type,
781  tatami::MaybeOracle<oracle_, Index_> oracle,
782 Index_ block_start,
783 Index_ block_length,
784 const CacheParameters<oracle_, Index_>& cache_parameters,
785 [[maybe_unused]] bool needs_value, // for consistency with Sparse* constructors.
786 [[maybe_unused]] bool needs_index) :
787 my_core(
788 tdb_comp,
789 attribute,
790 row,
791 target_dim_extent,
792 target_dimname,
793 tdb_target_dim,
794 non_target_dimname,
795 tdb_non_target_dim,
796 tdb_type,
797 block_length,
798 std::move(oracle),
799 cache_parameters,
800 /* needs_value = */ true,
801 /* needs_index = */ true
802 ),
803 my_block_start(block_start),
804 my_block_length(block_length),
805 my_holding_value(block_length),
806 my_holding_index(block_length)
807 {}
808
809 const Value_* fetch(Index_ i, Value_* buffer) {
810 auto info = my_core.fetch_block(i, my_block_start, my_block_length);
811 const auto& work = my_core.get_workspace();
812 work.values.copy(info.first, info.second, my_holding_value.data());
813 work.non_target_indices.copy(info.first, info.second, my_core.get_tdb_non_target_dim(), my_holding_index.data());
814 std::fill_n(buffer, my_block_length, 0);
815 for (size_t i = 0; i < info.second; ++i) {
816 buffer[my_holding_index[i] - my_block_start] = my_holding_value[i];
817 }
818 return buffer;
819 }
820
821private:
822 SparseCore<oracle_, Index_> my_core;
823 Index_ my_block_start, my_block_length;
824 std::vector<Value_> my_holding_value;
825 std::vector<Index_> my_holding_index;
826};
827
828template<bool oracle_, typename Value_, typename Index_>
829class DenseIndex : public tatami::DenseExtractor<oracle_, Value_, Index_> {
830public:
831 DenseIndex(
832 const Components& tdb_comp,
833 const std::string& attribute,
834 bool row,
835 Index_ target_dim_extent,
836 const std::string& target_dimname,
837 const Dimension& tdb_target_dim,
838 const std::string& non_target_dimname,
839 const Dimension& tdb_non_target_dim,
840  tiledb_datatype_t tdb_type,
841  tatami::MaybeOracle<oracle_, Index_> oracle,
842 tatami::VectorPtr<Index_> indices_ptr,
843 const CacheParameters<oracle_, Index_>& cache_parameters,
844 [[maybe_unused]] bool needs_value, // for consistency with Sparse* constructors.
845 [[maybe_unused]] bool needs_index) :
846 my_core(
847 tdb_comp,
848 attribute,
849 row,
850 target_dim_extent,
851 target_dimname,
852 tdb_target_dim,
853 non_target_dimname,
854 tdb_non_target_dim,
855 tdb_type,
856 indices_ptr->size(),
857 std::move(oracle),
858 cache_parameters,
859 /* needs_value = */ true,
860 /* needs_index = */ true
861 ),
862 my_indices_ptr(std::move(indices_ptr)),
863 my_holding_value(my_indices_ptr->size()),
864 my_holding_index(my_indices_ptr->size())
865 {
866 const auto& indices = *my_indices_ptr;
867 if (!indices.empty()) {
868 auto idx_start = indices.front();
869 my_remapping.resize(indices.back() - idx_start + 1);
870 for (size_t j = 0, end = indices.size(); j < end; ++j) {
871 my_remapping[indices[j] - idx_start] = j;
872 }
873 }
874 }
875
876 const Value_* fetch(Index_ i, Value_* buffer) {
877 const auto& indices = *my_indices_ptr;
878
879 if (!indices.empty()) {
880 auto info = my_core.fetch_indices(i, indices);
881 const auto& work = my_core.get_workspace();
882 work.values.copy(info.first, info.second, my_holding_value.data());
883 work.non_target_indices.copy(info.first, info.second, my_core.get_tdb_non_target_dim(), my_holding_index.data());
884 auto idx_start = indices.front();
885 std::fill_n(buffer, indices.size(), 0);
886 for (size_t i = 0; i < info.second; ++i) {
887 buffer[my_remapping[my_holding_index[i] - idx_start]] = my_holding_value[i];
888 }
889 }
890
891 return buffer;
892 }
893
894private:
895 SparseCore<oracle_, Index_> my_core;
896 tatami::VectorPtr<Index_> my_indices_ptr;
897 std::vector<Index_> my_remapping;
898 std::vector<Value_> my_holding_value;
899 std::vector<Index_> my_holding_index;
900};
901
902}
921template<typename Value_, typename Index_>
922class SparseMatrix : public tatami::Matrix<Value_, Index_> {
923public:
930 SparseMatrix(const std::string& uri, std::string attribute, tiledb::Context ctx, const SparseMatrixOptions& options) : my_attribute(std::move(attribute)) {
931 initialize(uri, std::move(ctx), options);
932 }
933
939 SparseMatrix(const std::string& uri, std::string attribute, const SparseMatrixOptions& options) : my_attribute(std::move(attribute)) {
940 initialize(uri, false, options);
941 }
942
947 SparseMatrix(const std::string& uri, std::string attribute) : SparseMatrix(uri, std::move(attribute), SparseMatrixOptions()) {}
948
949private:
950 template<class PossibleContext_>
951 void initialize(const std::string& uri, PossibleContext_ ctx, const SparseMatrixOptions& options) {
952 serialize([&]() -> void {
953 my_tdb_comp.reset(
954 [&]{
955 // If we have to create our own Context_ object, we do so inside the serialized
956 // section, rather than using a delegating constructor.
957 if constexpr(std::is_same<PossibleContext_, tiledb::Context>::value) {
958 return new SparseMatrix_internal::Components(std::move(ctx), uri);
959 } else {
960 return new SparseMatrix_internal::Components(uri);
961 }
962 }(),
963 [](SparseMatrix_internal::Components* ptr) {
964 // Serializing the deleter, for completeness's sake.
965 serialize([&]() -> void {
966 delete ptr;
967 });
968 }
969 );
970
971 auto schema = my_tdb_comp->array.schema();
972 if (schema.array_type() != TILEDB_SPARSE) {
973 throw std::runtime_error("TileDB array should be sparse");
974 }
975 my_cell_order = schema.cell_order();
976
977 my_cache_size_in_bytes = options.maximum_cache_size;
978 my_require_minimum_cache = options.require_minimum_cache;
979
980 if (!schema.has_attribute(my_attribute)) {
981 throw std::runtime_error("no attribute '" + my_attribute + "' is present in the TileDB array");
982 }
983 auto attr = schema.attribute(my_attribute);
984 my_tdb_type = attr.type();
985
986 tiledb::Domain domain = schema.domain();
987 if (domain.ndim() != 2) {
988 throw std::runtime_error("TileDB array should have exactly two dimensions");
989 }
990
991 tiledb::Dimension first_dim = domain.dimension(0);
992 my_first_dimname = first_dim.name();
993 my_tdb_first_dim.reset(first_dim);
994 Index_ first_extent = my_tdb_first_dim.extent<Index_>();
995 Index_ first_tile = my_tdb_first_dim.tile<Index_>();
996 my_firstdim_stats = tatami_chunked::ChunkDimensionStats<Index_>(first_extent, first_tile);
997
998 tiledb::Dimension second_dim = domain.dimension(1);
999 my_second_dimname = second_dim.name();
1000 my_tdb_second_dim.reset(second_dim);
1001 Index_ second_extent = my_tdb_second_dim.extent<Index_>();
1002 Index_ second_tile = my_tdb_second_dim.tile<Index_>();
1003 my_seconddim_stats = tatami_chunked::ChunkDimensionStats<Index_>(second_extent, second_tile);
1004
1005 // Favoring extraction on the dimension that involves pulling out fewer chunks per dimension element.
1006 auto tiles_per_firstdim = (second_extent / second_tile) + (second_extent % second_tile > 0);
1007 auto tiles_per_seconddim = (first_extent / first_tile) + (first_extent % first_tile > 0);
1008 my_prefer_firstdim = tiles_per_firstdim <= tiles_per_seconddim;
1009 });
1010 }
1011
1012private:
1013 std::shared_ptr<SparseMatrix_internal::Components> my_tdb_comp;
1014 tiledb_layout_t my_cell_order;
1015 tiledb_datatype_t my_tdb_type;
1016
1017 std::string my_attribute;
1018 size_t my_cache_size_in_bytes;
1019 bool my_require_minimum_cache;
1020
1021 std::string my_first_dimname, my_second_dimname;
1022 SparseMatrix_internal::Dimension my_tdb_first_dim, my_tdb_second_dim;
1023 tatami_chunked::ChunkDimensionStats<Index_> my_firstdim_stats, my_seconddim_stats;
1024
1025 bool my_prefer_firstdim;
1026
1027private:
1028 Index_ nrow_internal() const {
1029 return my_firstdim_stats.dimension_extent;
1030 }
1031
1032 Index_ ncol_internal() const {
1033 return my_seconddim_stats.dimension_extent;
1034 }
1035
1036public:
1037 Index_ nrow() const {
1038 return nrow_internal();
1039 }
1040
1041 Index_ ncol() const {
1042 return ncol_internal();
1043 }
1044
1045 bool is_sparse() const {
1046 return true;
1047 }
1048
1049 double is_sparse_proportion() const {
1050 return 1;
1051 }
1052
1053 bool prefer_rows() const {
1054 return my_prefer_firstdim;
1055 }
1056
1057 double prefer_rows_proportion() const {
1058 return static_cast<double>(my_prefer_firstdim);
1059 }
1060
1061 bool uses_oracle(bool) const {
1062 // It won't necessarily be used, but if the cache is empty,
1063 // the oracle definitely _won't_ be used.
1064 return my_cache_size_in_bytes > 0;
1065 }
1066
1067private:
1068 template<
1069 bool oracle_,
1070 template<typename, typename> class Interface_,
1071 template<bool, typename, typename> class Extractor_,
1072 typename ... Args_
1073 >
1074 std::unique_ptr<Interface_<Value_, Index_> > populate(
1075 bool row,
1076  Index_ non_target_length,
1077  tatami::MaybeOracle<oracle_, Index_> oracle,
1078 const tatami::Options& opt,
1079 Args_&& ... args)
1080 const {
1081 const auto& target_dim_stats = (row ? my_firstdim_stats : my_seconddim_stats);
1082 const auto& target_dimname = (row ? my_first_dimname : my_second_dimname);
1083 const auto& non_target_dimname = (row ? my_second_dimname : my_first_dimname);
1084 const auto& tdb_target_dim = (row ? my_tdb_first_dim : my_tdb_second_dim);
1085 const auto& tdb_non_target_dim = (row ? my_tdb_second_dim : my_tdb_first_dim);
1086
1087 size_t nonzero_size = 0;
1088 if (opt.sparse_extract_value) {
1089 nonzero_size += ::tatami_tiledb::internal::determine_type_size(my_tdb_type);
1090 }
1091 if (opt.sparse_extract_index) {
1092 nonzero_size += ::tatami_tiledb::internal::determine_type_size(tdb_non_target_dim.type());
1093 }
1094
1095 if constexpr(oracle_) {
1096 // Add the target index size because we always need it for bulk
1097 // reads in the oracular case. This is not needed in the
1098 // myopic case because we only read one slab at a time.
1099 nonzero_size += ::tatami_tiledb::internal::determine_type_size(tdb_target_dim.type());
1100
1101 SparseMatrix_internal::OracularCacheParameters<Index_> cache_params;
1102 cache_params.max_cache_size_in_elements = my_cache_size_in_bytes / nonzero_size;
1103
1104 // If we're asking for rows and the cell order is row-major or
1105 // we want columns and the cell order is column-major, each
1106 // element of the target dimension has its contents stored
1107 // contiguously in TileDB's data tiles and can be easily
1108 // extracted on an individual basis; thus each element is
1109 // considered a separate slab and we set the chunk_length to 1.
1110 //
1111 // Otherwise, it's likely that an element of the target
1112 // dimension will overlap multiple data tiles within each space
1113 // tile, so we might as well extract the entire space tile's
1114 // elements on the target dimension.
1115 cache_params.chunk_length = (row == (my_cell_order == TILEDB_ROW_MAJOR) ? 1 : target_dim_stats.chunk_length);
1116
1117 // Ensure that there's enough space for every dimension element.
1118 // If this can't be guaranteed, we set the cache to only be able to
1119 // hold a single dimension element. This is effectively the same as
1120 // not doing any caching at all, as a hypothetical SoloCore would
1121 // still need to allocate enough memory for a single dimension
1122   // element to create a buffer for the TileDB library.
1123 size_t max_slab_size = static_cast<size_t>(non_target_length) * cache_params.chunk_length; // cast to avoid overflow.
1124 if (my_require_minimum_cache) {
1125 cache_params.max_cache_size_in_elements = std::max(cache_params.max_cache_size_in_elements, max_slab_size);
1126 } else if (cache_params.max_cache_size_in_elements < max_slab_size) {
1127 cache_params.max_cache_size_in_elements = non_target_length;
1128 cache_params.chunk_length = 1;
1129 }
1130
1131 return std::make_unique<Extractor_<oracle_, Value_, Index_> >(
1132 *my_tdb_comp,
1133 my_attribute,
1134 row,
1135 target_dim_stats.dimension_extent,
1136 target_dimname,
1137 tdb_target_dim,
1138 non_target_dimname,
1139 tdb_non_target_dim,
1140 my_tdb_type,
1141 std::move(oracle),
1142 std::forward<Args_>(args)...,
1143    cache_params,
1144    opt.sparse_extract_value,
1145    opt.sparse_extract_index
1146 );
1147
1148 } else {
1149 tatami_chunked::SlabCacheStats raw_params(
1150 target_dim_stats.chunk_length,
1151 non_target_length,
1152 target_dim_stats.num_chunks,
1153 my_cache_size_in_bytes,
1154 nonzero_size,
1155 my_require_minimum_cache
1156 );
1157
1158 // No need to have a dedicated SoloCore for uncached extraction,
1159 // because it would still need to hold a single Workspace. We
1160 // instead reuse the MyopicCore's code with a chunk length of 1 to
1161 // achieve the same memory usage. This has a mild perf hit from the
1162 // LRU but perf already sucks without caching so who cares.
1163 SparseMatrix_internal::MyopicCacheParameters<Index_> cache_params;
1164 if (raw_params.max_slabs_in_cache > 0) {
1165 cache_params.chunk_length = target_dim_stats.chunk_length;
1166 cache_params.slab_size_in_elements = raw_params.slab_size_in_elements;
1167 cache_params.max_slabs_in_cache = raw_params.max_slabs_in_cache;
1168 } else {
1169 cache_params.chunk_length = 1;
1170 cache_params.slab_size_in_elements = non_target_length;
1171 cache_params.max_slabs_in_cache = 1;
1172 }
1173
1174 return std::make_unique<Extractor_<oracle_, Value_, Index_> >(
1175 *my_tdb_comp,
1176 my_attribute,
1177 row,
1178 target_dim_stats.dimension_extent,
1179 target_dimname,
1180 tdb_target_dim,
1181 non_target_dimname,
1182 tdb_non_target_dim,
1183 my_tdb_type,
1184 std::move(oracle),
1185 std::forward<Args_>(args)...,
1186    cache_params,
1187    opt.sparse_extract_value,
1188    opt.sparse_extract_index
1189 );
1190 }
1191 }
1192
1193 static tatami::Options set_extract_all(tatami::Options opt) {
1194  // Resetting these options so that the slab sizes are
1195  // estimated correctly for dense extractors, regardless of 'opt'.
1196 opt.sparse_extract_value = true;
1197 opt.sparse_extract_index = true;
1198 return opt;
1199 }
1200
1201 /********************
1202 *** Myopic dense ***
1203 ********************/
1204public:
1205 std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(bool row, const tatami::Options& opt) const {
1206 Index_ full_non_target = (row ? ncol_internal() : nrow_internal());
1207 return populate<false, tatami::MyopicDenseExtractor, SparseMatrix_internal::DenseFull>(row, full_non_target, false, set_extract_all(opt), full_non_target);
1208 }
1209
1210 std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(bool row, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
1211 return populate<false, tatami::MyopicDenseExtractor, SparseMatrix_internal::DenseBlock>(row, block_length, false, set_extract_all(opt), block_start, block_length);
1212 }
1213
1214 std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(bool row, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
1215 auto nidx = indices_ptr->size();
1216 return populate<false, tatami::MyopicDenseExtractor, SparseMatrix_internal::DenseIndex>(row, nidx, false, set_extract_all(opt), std::move(indices_ptr));
1217 }
1218
1219 /*********************
1220 *** Myopic sparse ***
1221 *********************/
1222public:
1223 std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(bool row, const tatami::Options& opt) const {
1224 Index_ full_non_target = (row ? ncol_internal() : nrow_internal());
1225 return populate<false, tatami::MyopicSparseExtractor, SparseMatrix_internal::SparseFull>(row, full_non_target, false, opt, full_non_target);
1226 }
1227
1228 std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(bool row, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
1229 return populate<false, tatami::MyopicSparseExtractor, SparseMatrix_internal::SparseBlock>(row, block_length, false, opt, block_start, block_length);
1230 }
1231
1232 std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(bool row, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
1233 auto nidx = indices_ptr->size();
1234 return populate<false, tatami::MyopicSparseExtractor, SparseMatrix_internal::SparseIndex>(row, nidx, false, opt, std::move(indices_ptr));
1235 }
1236
1237 /**********************
1238 *** Oracular dense ***
1239 **********************/
1240public:
1241 std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(
1242 bool row,
1243 std::shared_ptr<const tatami::Oracle<Index_> > oracle,
1244 const tatami::Options& opt)
1245 const {
1246 Index_ full_non_target = (row ? ncol_internal() : nrow_internal());
1247 return populate<true, tatami::OracularDenseExtractor, SparseMatrix_internal::DenseFull>(row, full_non_target, std::move(oracle), set_extract_all(opt), full_non_target);
1248 }
1249
1250 std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(
1251 bool row,
1252 std::shared_ptr<const tatami::Oracle<Index_> > oracle,
1253 Index_ block_start,
1254 Index_ block_length,
1255 const tatami::Options& opt)
1256 const {
1257 return populate<true, tatami::OracularDenseExtractor, SparseMatrix_internal::DenseBlock>(row, block_length, std::move(oracle), set_extract_all(opt), block_start, block_length);
1258 }
1259
1260 std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(
1261 bool row,
1262 std::shared_ptr<const tatami::Oracle<Index_> > oracle,
1263 tatami::VectorPtr<Index_> indices_ptr,
1264 const tatami::Options& opt)
1265 const {
1266 auto nidx = indices_ptr->size();
1267 return populate<true, tatami::OracularDenseExtractor, SparseMatrix_internal::DenseIndex>(row, nidx, std::move(oracle), set_extract_all(opt), std::move(indices_ptr));
1268 }
1269
1270 /***********************
1271 *** Oracular sparse ***
1272 ***********************/
1273public:
1274 std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(
1275 bool row,
1276 std::shared_ptr<const tatami::Oracle<Index_> > oracle,
1277 const tatami::Options& opt)
1278 const {
1279 Index_ full_non_target = (row ? ncol_internal() : nrow_internal());
1280 return populate<true, tatami::OracularSparseExtractor, SparseMatrix_internal::SparseFull>(row, full_non_target, std::move(oracle), opt, full_non_target);
1281 }
1282
1283 std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(
1284 bool row,
1285 std::shared_ptr<const tatami::Oracle<Index_> > oracle,
1286 Index_ block_start,
1287 Index_ block_length,
1288 const tatami::Options& opt)
1289 const {
1290 return populate<true, tatami::OracularSparseExtractor, SparseMatrix_internal::SparseBlock>(row, block_length, std::move(oracle), opt, block_start, block_length);
1291 }
1292
1293 std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(
1294 bool row,
1295 std::shared_ptr<const tatami::Oracle<Index_> > oracle,
1296 tatami::VectorPtr<Index_> indices_ptr,
1297 const tatami::Options& opt)
1298 const {
1299 auto nidx = indices_ptr->size();
1300 return populate<true, tatami::OracularSparseExtractor, SparseMatrix_internal::SparseIndex>(row, nidx, std::move(oracle), opt, std::move(indices_ptr));
1301 }
1302};
1303
1304}
1305
1306#endif
TileDB-backed sparse matrix.
Definition SparseMatrix.hpp:922
SparseMatrix(const std::string &uri, std::string attribute, const SparseMatrixOptions &options)
Definition SparseMatrix.hpp:939
SparseMatrix(const std::string &uri, std::string attribute)
Definition SparseMatrix.hpp:947
SparseMatrix(const std::string &uri, std::string attribute, tiledb::Context ctx, const SparseMatrixOptions &options)
Definition SparseMatrix.hpp:930
tatami bindings for TileDB matrices.
Definition DenseMatrix.hpp:20
void serialize(Function_ fun)
Definition serialize.hpp:20
std::shared_ptr< const std::vector< Index_ > > VectorPtr
typename std::conditional< oracle_, OracularSparseExtractor< Value_, Index_ >, MyopicSparseExtractor< Value_, Index_ > >::type SparseExtractor
typename std::conditional< oracle_, std::shared_ptr< const Oracle< Index_ > >, bool >::type MaybeOracle
typename std::conditional< oracle_, OracularDenseExtractor< Value_, Index_ >, MyopicDenseExtractor< Value_, Index_ > >::type DenseExtractor
void process_consecutive_indices(const Index_ *indices, Index_ length, Function_ fun)
Locking for serial access.
bool sparse_extract_index
bool sparse_extract_value
const Value_ * value
const Index_ * index
Options for sparse TileDB extraction.
Definition SparseMatrix.hpp:27
size_t maximum_cache_size
Definition SparseMatrix.hpp:37
bool require_minimum_cache
Definition SparseMatrix.hpp:44
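As a usage sketch, a SparseMatrix can be constructed from an existing two-dimensional sparse TileDB array and read through the usual tatami extractor interface. The array URI ("my_matrix"), attribute name ("counts") and cache size below are hypothetical placeholders, and the umbrella header path is assumed to follow the usual tatami_tiledb include layout.

#include "tatami_tiledb/tatami_tiledb.hpp"
#include <vector>

int main() {
    // Cap the cache at ~50 MB of non-zero payload; see SparseMatrixOptions::maximum_cache_size.
    tatami_tiledb::SparseMatrixOptions opt;
    opt.maximum_cache_size = 50000000;

    // Open a 2-dimensional sparse TileDB array with a numeric attribute.
    tatami_tiledb::SparseMatrix<double, int> mat("my_matrix", "counts", opt);

    // Dense extraction of each row in order.
    auto dwork = mat.dense(/* row = */ true, tatami::Options());
    std::vector<double> dbuffer(mat.ncol());
    for (int r = 0, nr = mat.nrow(); r < nr; ++r) {
        const double* rowptr = dwork->fetch(r, dbuffer.data());
        // ... use 'rowptr', which holds mat.ncol() values for row 'r'.
    }

    // Sparse extraction of a single column.
    auto swork = mat.sparse(/* row = */ false, tatami::Options());
    std::vector<double> vbuffer(mat.nrow());
    std::vector<int> ibuffer(mat.nrow());
    auto range = swork->fetch(0, vbuffer.data(), ibuffer.data());
    // 'range.number' non-zeros for column 0, with values in 'range.value' and row indices in 'range.index'.

    return 0;
}

Oracle-aware extraction follows the same pattern via the dense() and sparse() overloads that accept a std::shared_ptr<const tatami::Oracle<Index_> >, allowing the matrix to batch TileDB reads across the predicted access sequence.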