tatami_python
Python bindings to tatami matrices
UnknownMatrix.hpp
#ifndef TATAMI_PYTHON_UNKNOWNMATRIX_HPP
#define TATAMI_PYTHON_UNKNOWNMATRIX_HPP

#include "pybind11/pybind11.h"
#include "pybind11/numpy.h"
#include "tatami/tatami.hpp"
#include "tatami_chunked/tatami_chunked.hpp" // for tatami_chunked::SlabCacheStats (assumed umbrella header).
#include "sanisizer/sanisizer.hpp" // for sanisizer::cap, sum, is_equal (assumed umbrella header).

#include "dense_extractor.hpp"
#include "sparse_extractor.hpp"
#include "parallelize.hpp"

#include <vector>
#include <memory>
#include <string>
#include <stdexcept>
#include <optional>
#include <cstddef>
namespace tatami_python {

/**
 * @brief Options for data extraction from an UnknownMatrix.
 */
struct UnknownMatrixOptions {
    /**
     * Maximum size of the cache, in bytes.
     */
    std::size_t maximum_cache_size = sanisizer::cap<std::size_t>(100000000);

    /**
     * Whether to automatically enforce a minimum size for the cache,
     * even if it exceeds `maximum_cache_size`.
     */
    bool require_minimum_cache = true;
};
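
// A minimal sketch of overriding the defaults (values are illustrative only):
//
//     UnknownMatrixOptions opt;
//     opt.maximum_cache_size = 10000000;  // 10 MB
//     opt.require_minimum_cache = false;  // tolerate cache-less extraction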

/**
 * @brief Unknown matrix-like object in Python.
 *
 * Pulls data out of an arbitrary matrix-like Python object, using the
 * generics of the Python delayedarray package for extraction.
 */
template<typename Value_, typename Index_, typename CachedValue_ = Value_, typename CachedIndex_ = Index_>
class UnknownMatrix : public tatami::Matrix<Value_, Index_> {
public:
    /**
     * @param seed Matrix-like object in Python.
     * @param opt Options for data extraction.
     */
    UnknownMatrix(pybind11::object seed, const UnknownMatrixOptions& opt) :
        my_seed(std::move(seed)),
        my_module(pybind11::module::import("delayedarray")),
        my_dense_extractor(my_module.attr("extract_dense_array")),
        my_sparse_extractor(my_module.attr("extract_sparse_array")),
        my_cache_size_in_bytes(opt.maximum_cache_size),
        my_require_minimum_cache(opt.require_minimum_cache)
    {
        // We assume that the constructor is only called on the main thread,
        // so we won't bother locking things up. I'm also not sure that the
        // operations in the initialization list are thread-safe.

        const auto shape = get_shape<Index_>(my_seed);
        my_nrow = shape.first;
        my_ncol = shape.second;

        // Checking that we can safely create a pybind11::array_t<Index_> without overflow.
        // We do it here once, so that we don't need to check in each call to create_indexing_array().

        auto sparse = my_module.attr("is_sparse")(my_seed);
        my_sparse = sparse.template cast<bool>();

        auto grid = my_module.attr("chunk_grid")(my_seed);
        auto bounds = grid.attr("boundaries").template cast<pybind11::tuple>();
        if (bounds.size() != 2) {
            auto ctype = get_class_name(my_seed);
            throw std::runtime_error("'chunk_grid(<" + ctype + ">).boundaries' should be a tuple of length 2");
        }

        auto np = pybind11::module::import("numpy");
        auto arrayfun = np.attr("array");
        auto populate = [&](Index_ extent, const pybind11::object& raw_ticks, std::vector<Index_>& map, std::vector<Index_>& new_ticks, Index_& max_chunk_size) {
            // Force realization of the Iterable into a NumPy array on the Python side.
            // Assume that casting to Index_ is safe, given that we were able to cast the extent.
            auto tick_array = arrayfun(raw_ticks, pybind11::dtype::of<Index_>());
            auto ticks = tick_array.template cast<pybind11::array_t<Index_> >();
            const auto tptr = static_cast<Index_*>(ticks.request().ptr);
            const auto nticks = ticks.size();

            new_ticks.reserve(sanisizer::sum<decltype(new_ticks.size())>(nticks, 1));
            new_ticks.push_back(0);
            tatami::resize_container_to_Index_size(map, extent); // size the map to cover the full extent before filling.
            Index_ counter = 0;
            max_chunk_size = 0;
            for (I<decltype(nticks)> i = 0; i < nticks; ++i) {
                const auto latest = tptr[i];
                const auto previous = new_ticks.back();
                if (latest <= previous) {
                    auto ctype = get_class_name(my_seed);
                    throw std::runtime_error("boundaries are not strictly increasing in the output of 'chunk_grid(<" + ctype + ">).boundaries'");
                }
                new_ticks.push_back(latest);

                std::fill(map.begin() + previous, map.begin() + latest, counter);
                ++counter;
                const auto to_fill = latest - previous;
                if (to_fill > max_chunk_size) {
                    max_chunk_size = to_fill;
                }
            }

            if (!sanisizer::is_equal(new_ticks.back(), extent)) {
                auto ctype = get_class_name(my_seed);
                throw std::runtime_error("invalid ticks returned in 'chunk_grid(<" + ctype + ">).boundaries'");
            }
        };
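
        // For example, given an extent of 12 and boundaries of (5, 10, 12),
        // 'new_ticks' becomes [0, 5, 10, 12], 'map' becomes
        // [0,0,0,0,0,1,1,1,1,1,2,2] (i.e., the chunk covering each position
        // along that dimension), and 'max_chunk_size' is 5.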

        populate(my_nrow, bounds[0], my_row_chunk_map, my_row_chunk_ticks, my_row_max_chunk_size);
        populate(my_ncol, bounds[1], my_col_chunk_map, my_col_chunk_ticks, my_col_max_chunk_size);

        // Choose the dimension that requires pulling out fewer chunks.
        auto chunks_per_row = my_col_chunk_ticks.size() - 1;
        auto chunks_per_col = my_row_chunk_ticks.size() - 1;
        my_prefer_rows = chunks_per_row <= chunks_per_col;
    }

private:
    Index_ my_nrow, my_ncol;
    bool my_sparse, my_prefer_rows;

    std::vector<Index_> my_row_chunk_map, my_col_chunk_map;
    std::vector<Index_> my_row_chunk_ticks, my_col_chunk_ticks;

    // To decide how many chunks to store in the cache, we pretend that the
    // largest chunk is a good representative. This is a bit suboptimal for
    // irregular chunks, but the LruSlabCache class doesn't have a good way
    // of dealing with this right now. The fundamental problem is that
    // variable slabs will either (i) all reach the maximum allocation
    // eventually, if slabs are reused; (ii) require lots of allocations, if
    // slabs are not reused; or (iii) require manual defragmentation, if
    // slabs are reused in a manner that avoids inflation to the maximum
    // allocation.
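    //
    // For example, with the default 100 MB cache, 8-byte cached values and a
    // maximum chunk size of 100 rows, a slab spanning 10000 columns occupies
    // 100 * 10000 * 8 = 8 MB, so up to 12 such slabs can be cached at once.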
    Index_ my_row_max_chunk_size, my_col_max_chunk_size;

    pybind11::object my_seed;
    pybind11::module my_module;
    pybind11::object my_dense_extractor, my_sparse_extractor;

    std::size_t my_cache_size_in_bytes;
    bool my_require_minimum_cache;

public:
    Index_ nrow() const {
        return my_nrow;
    }

    Index_ ncol() const {
        return my_ncol;
    }

    bool is_sparse() const {
        return my_sparse;
    }

    double is_sparse_proportion() const {
        return static_cast<double>(my_sparse);
    }

    bool prefer_rows() const {
        return my_prefer_rows;
    }

    double prefer_rows_proportion() const {
        return static_cast<double>(my_prefer_rows);
    }

    bool uses_oracle(bool) const {
        return true;
    }

private:
    Index_ max_primary_chunk_length(bool row) const {
        return (row ? my_row_max_chunk_size : my_col_max_chunk_size);
    }

    Index_ primary_num_chunks(bool row, Index_ primary_chunk_length) const {
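        // A zero chunk length should only happen for a zero-extent dimension,
        // so we return the dimension extent directly rather than dividing by zero.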
        auto primary_dim = (row ? my_nrow : my_ncol);
        if (primary_chunk_length == 0) {
            return primary_dim;
        } else {
            return primary_dim / primary_chunk_length;
        }
    }

    Index_ secondary_dim(bool row) const {
        return (row ? my_ncol : my_nrow);
    }

    const std::vector<Index_>& chunk_ticks(bool row) const {
        if (row) {
            return my_row_chunk_ticks;
        } else {
            return my_col_chunk_ticks;
        }
    }

    const std::vector<Index_>& chunk_map(bool row) const {
        if (row) {
            return my_row_chunk_map;
        } else {
            return my_col_chunk_map;
        }
    }

    /********************
     *** Myopic dense ***
     ********************/
private:
    template<
        bool oracle_,
        template<bool, bool, typename, typename, typename> class FromDense_,
        template<bool, bool, typename, typename, typename, typename> class FromSparse_,
        typename ... Args_
    >
    std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > populate_dense_internal(
        bool row,
        Index_ non_target_length,
        tatami::MaybeOracle<oracle_, Index_> oracle,
        Args_&& ... args
    ) const {
        Index_ max_target_chunk_length = max_primary_chunk_length(row);
        tatami_chunked::SlabCacheStats<Index_> stats(
            /* target_length = */ max_target_chunk_length,
            /* non_target_length = */ non_target_length,
            /* target_num_slabs = */ primary_num_chunks(row, max_target_chunk_length),
            /* cache_size_in_bytes = */ my_cache_size_in_bytes,
            /* element_size = */ sizeof(CachedValue_),
            /* require_minimum_cache = */ my_require_minimum_cache
        );

        const auto& map = chunk_map(row);
        const auto& ticks = chunk_ticks(row);
        const bool solo = (stats.max_slabs_in_cache == 0);
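        // In "solo" mode, the cache cannot hold even one slab, so each
        // request must be fetched from Python on the spot.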

        std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > output;
#ifdef TATAMI_PYTHON_PARALLELIZE_UNKNOWN
        TATAMI_PYTHON_SERIALIZE([&]() -> void {
#endif

        if (!my_sparse) {
            if (solo) {
                output.reset(
                    new FromDense_<true, oracle_, Value_, Index_, CachedValue_>(
                        my_seed,
                        my_dense_extractor,
                        row,
                        std::move(oracle),
                        std::forward<Args_>(args)...,
                        ticks,
                        map,
                        stats
                    )
                );

            } else {
                output.reset(
                    new FromDense_<false, oracle_, Value_, Index_, CachedValue_>(
                        my_seed,
                        my_dense_extractor,
                        row,
                        std::move(oracle),
                        std::forward<Args_>(args)...,
                        ticks,
                        map,
                        stats
                    )
                );
            }

        } else {
            if (solo) {
                output.reset(
                    new FromSparse_<true, oracle_, Value_, Index_, CachedValue_, CachedIndex_>(
                        my_seed,
                        my_sparse_extractor,
                        row,
                        std::move(oracle),
                        std::forward<Args_>(args)...,
                        max_target_chunk_length,
                        ticks,
                        map,
                        stats
                    )
                );

            } else {
                output.reset(
                    new FromSparse_<false, oracle_, Value_, Index_, CachedValue_, CachedIndex_>(
                        my_seed,
                        my_sparse_extractor,
                        row,
                        std::move(oracle),
                        std::forward<Args_>(args)...,
                        max_target_chunk_length,
                        ticks,
                        map,
                        stats
                    )
                );
            }
        }

#ifdef TATAMI_PYTHON_PARALLELIZE_UNKNOWN
        });
#endif

        return output;
    }

    template<bool oracle_>
    std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > populate_dense(
        bool row,
        tatami::MaybeOracle<oracle_, Index_> ora,
        const tatami::Options&
    ) const {
        Index_ non_target_dim = secondary_dim(row);
        return populate_dense_internal<oracle_, DenseFull, DensifiedSparseFull>(
            row,
            non_target_dim,
            std::move(ora),
            non_target_dim
        );
    }

    template<bool oracle_>
    std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > populate_dense(
        bool row,
        tatami::MaybeOracle<oracle_, Index_> ora,
        Index_ block_start,
        Index_ block_length,
        const tatami::Options&
    ) const {
        return populate_dense_internal<oracle_, DenseBlock, DensifiedSparseBlock>(
            row,
            block_length,
            std::move(ora),
            block_start,
            block_length
        );
    }

    template<bool oracle_>
    std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > populate_dense(
        bool row,
        tatami::MaybeOracle<oracle_, Index_> ora,
        tatami::VectorPtr<Index_> indices_ptr,
        const tatami::Options&
    ) const {
        Index_ nidx = indices_ptr->size();
        return populate_dense_internal<oracle_, DenseIndexed, DensifiedSparseIndexed>(
            row,
            nidx,
            std::move(ora),
            std::move(indices_ptr)
        );
    }

public:
    std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(
        bool row,
        const tatami::Options& opt
    ) const {
        return populate_dense<false>(row, false, opt);
    }

    std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(
        bool row,
        Index_ block_start,
        Index_ block_length,
        const tatami::Options& opt
    ) const {
        return populate_dense<false>(row, false, block_start, block_length, opt);
    }

    std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(
        bool row,
        tatami::VectorPtr<Index_> indices_ptr,
        const tatami::Options& opt
    ) const {
        return populate_dense<false>(row, false, std::move(indices_ptr), opt);
    }

    /**********************
     *** Oracular dense ***
     **********************/
public:
    std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(
        bool row,
        std::shared_ptr<const tatami::Oracle<Index_> > ora,
        const tatami::Options& opt
    ) const {
        return populate_dense<true>(row, std::move(ora), opt);
    }

    std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(
        bool row,
        std::shared_ptr<const tatami::Oracle<Index_> > ora,
        Index_ block_start,
        Index_ block_length,
        const tatami::Options& opt
    ) const {
        return populate_dense<true>(row, std::move(ora), block_start, block_length, opt);
    }

    std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(
        bool row,
        std::shared_ptr<const tatami::Oracle<Index_> > ora,
        tatami::VectorPtr<Index_> indices_ptr,
        const tatami::Options& opt
    ) const {
        return populate_dense<true>(row, std::move(ora), std::move(indices_ptr), opt);
    }

    /*********************
     *** Myopic sparse ***
     *********************/
public:
    template<
        bool oracle_,
        template<bool, bool, typename, typename, typename, typename> class FromSparse_,
        typename ... Args_
    >
    std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > populate_sparse_internal(
        bool row,
        Index_ non_target_length,
        tatami::MaybeOracle<oracle_, Index_> oracle,
        const tatami::Options& opt,
        Args_&& ... args
    ) const {
        Index_ max_target_chunk_length = max_primary_chunk_length(row);
        tatami_chunked::SlabCacheStats<Index_> stats(
            /* target_length = */ max_target_chunk_length,
            /* non_target_length = */ non_target_length,
            /* target_num_slabs = */ primary_num_chunks(row, max_target_chunk_length),
            /* cache_size_in_bytes = */ my_cache_size_in_bytes,
            /* element_size = */ (opt.sparse_extract_index ? sizeof(CachedIndex_) : 0) + (opt.sparse_extract_value ? sizeof(CachedValue_) : 0),
            /* require_minimum_cache = */ my_require_minimum_cache
        );

        const auto& map = chunk_map(row);
        const auto& ticks = chunk_ticks(row);
        const bool needs_value = opt.sparse_extract_value;
        const bool needs_index = opt.sparse_extract_index;
        const bool solo = (stats.max_slabs_in_cache == 0);

        std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > output;
#ifdef TATAMI_PYTHON_PARALLELIZE_UNKNOWN
        TATAMI_PYTHON_SERIALIZE([&]() -> void {
#endif

        if (solo) {
            output.reset(
                new FromSparse_<true, oracle_, Value_, Index_, CachedValue_, CachedIndex_>(
                    my_seed,
                    my_sparse_extractor,
                    row,
                    std::move(oracle),
                    std::forward<Args_>(args)...,
                    max_target_chunk_length,
                    ticks,
                    map,
                    stats,
                    needs_value,
                    needs_index
                )
            );

        } else {
            output.reset(
                new FromSparse_<false, oracle_, Value_, Index_, CachedValue_, CachedIndex_>(
                    my_seed,
                    my_sparse_extractor,
                    row,
                    std::move(oracle),
                    std::forward<Args_>(args)...,
                    max_target_chunk_length,
                    ticks,
                    map,
                    stats,
                    needs_value,
                    needs_index
                )
            );
        }

#ifdef TATAMI_PYTHON_PARALLELIZE_UNKNOWN
        });
#endif

        return output;
    }

    template<bool oracle_>
    std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > populate_sparse(
        bool row,
        tatami::MaybeOracle<oracle_, Index_> ora,
        const tatami::Options& opt
    ) const {
        Index_ non_target_dim = secondary_dim(row);
        return populate_sparse_internal<oracle_, SparseFull>(
            row,
            non_target_dim,
            std::move(ora),
            opt,
            non_target_dim
        );
    }

    template<bool oracle_>
    std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > populate_sparse(
        bool row,
        tatami::MaybeOracle<oracle_, Index_> ora,
        Index_ block_start,
        Index_ block_length,
        const tatami::Options& opt
    ) const {
        return populate_sparse_internal<oracle_, SparseBlock>(
            row,
            block_length,
            std::move(ora),
            opt,
            block_start,
            block_length
        );
    }

    template<bool oracle_>
    std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > populate_sparse(
        bool row,
        tatami::MaybeOracle<oracle_, Index_> ora,
        tatami::VectorPtr<Index_> indices_ptr,
        const tatami::Options& opt
    ) const {
        Index_ nidx = indices_ptr->size();
        return populate_sparse_internal<oracle_, SparseIndexed>(
            row,
            nidx,
            std::move(ora),
            opt,
            std::move(indices_ptr)
        );
    }

public:
    std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(
        bool row,
        const tatami::Options& opt
    ) const {
        if (!my_sparse) {
            return std::make_unique<tatami::FullSparsifiedWrapper<false, Value_, Index_> >(
                dense(row, opt),
                secondary_dim(row),
                opt
            );
        } else {
            return populate_sparse<false>(
                row,
                false,
                opt
            );
        }
    }

    std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(
        bool row,
        Index_ block_start,
        Index_ block_length,
        const tatami::Options& opt
    ) const {
        if (!my_sparse) {
            return std::make_unique<tatami::BlockSparsifiedWrapper<false, Value_, Index_> >(
                dense(row, block_start, block_length, opt),
                block_start,
                block_length,
                opt
            );
        } else {
            return populate_sparse<false>(
                row,
                false,
                block_start,
                block_length,
                opt
            );
        }
    }

    std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(
        bool row,
        tatami::VectorPtr<Index_> indices_ptr,
        const tatami::Options& opt
    ) const {
        if (!my_sparse) {
            auto index_copy = indices_ptr;
            return std::make_unique<tatami::IndexSparsifiedWrapper<false, Value_, Index_> >(
                dense(row, std::move(indices_ptr), opt),
                std::move(index_copy),
                opt
            );
        } else {
            return populate_sparse<false>(
                row,
                false,
                std::move(indices_ptr),
                opt
            );
        }
    }

    /***********************
     *** Oracular sparse ***
     ***********************/
public:
    std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(
        bool row,
        std::shared_ptr<const tatami::Oracle<Index_> > ora,
        const tatami::Options& opt
    ) const {
        if (!my_sparse) {
            return std::make_unique<tatami::FullSparsifiedWrapper<true, Value_, Index_> >(
                dense(row, std::move(ora), opt),
                secondary_dim(row),
                opt
            );
        } else {
            return populate_sparse<true>(
                row,
                std::move(ora),
                opt
            );
        }
    }

    std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(
        bool row,
        std::shared_ptr<const tatami::Oracle<Index_> > ora,
        Index_ block_start,
        Index_ block_length,
        const tatami::Options& opt
    ) const {
        if (!my_sparse) {
            return std::make_unique<tatami::BlockSparsifiedWrapper<true, Value_, Index_> >(
                dense(row, std::move(ora), block_start, block_length, opt),
                block_start,
                block_length,
                opt
            );
        } else {
            return populate_sparse<true>(
                row,
                std::move(ora),
                block_start,
                block_length,
                opt
            );
        }
    }

    std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(
        bool row,
        std::shared_ptr<const tatami::Oracle<Index_> > ora,
        tatami::VectorPtr<Index_> indices_ptr,
        const tatami::Options& opt
    ) const {
        if (!my_sparse) {
            auto index_copy = indices_ptr;
            return std::make_unique<tatami::IndexSparsifiedWrapper<true, Value_, Index_> >(
                dense(row, std::move(ora), std::move(indices_ptr), opt),
                std::move(index_copy),
                opt
            );
        } else {
            return populate_sparse<true>(
                row,
                std::move(ora),
                std::move(indices_ptr),
                opt
            );
        }
    }
};
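
// A quick sketch of typical usage, assuming the GIL is held and 'seed' is a
// matrix-like Python object understood by delayedarray (e.g., a scipy.sparse
// matrix); names and values here are illustrative only:
//
//     pybind11::object seed = /* some Python matrix-like object */;
//     UnknownMatrixOptions opt;
//     opt.maximum_cache_size = 50000000; // 50 MB
//     UnknownMatrix<double, int> mat(seed, opt);
//
//     auto ext = mat.dense(/* row = */ true, tatami::Options());
//     std::vector<double> buffer(mat.ncol());
//     auto ptr = ext->fetch(0, buffer.data()); // contents of the first row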

}

#endif