tatami_r
R bindings to tatami matrices
Loading...
Searching...
No Matches
UnknownMatrix.hpp
1#ifndef TATAMI_R_UNKNOWNMATRIX_HPP
2#define TATAMI_R_UNKNOWNMATRIX_HPP
3
4#include "Rcpp.h"
5#include "tatami/tatami.hpp"
6
7#include "parallelize.hpp"
8#include "dense_extractor.hpp"
9#include "sparse_extractor.hpp"
10
11#include <vector>
12#include <memory>
13#include <string>
14#include <stdexcept>
15
16#include "parallelize.hpp"
17
18namespace tatami_r {
19
28 size_t maximum_cache_size = -1;
29
36};
37
50template<typename Value_, typename Index_, typename CachedValue_ = Value_, typename CachedIndex_ = Index_>
51class UnknownMatrix : public tatami::Matrix<Value_, Index_> {
52public:
59 UnknownMatrix(Rcpp::RObject seed, const UnknownMatrixOptions& opt) :
60 my_original_seed(seed),
61 my_delayed_env(Rcpp::Environment::namespace_env("DelayedArray")),
62 my_sparse_env(Rcpp::Environment::namespace_env("SparseArray")),
63 my_dense_extractor(my_delayed_env["extract_array"]),
64 my_sparse_extractor(my_sparse_env["extract_sparse_array"])
65 {
66 // We assume the constructor only occurs on the main thread, so we
67 // won't bother locking things up. I'm also not sure that the
68 // operations in the initialization list are thread-safe.
69
70 {
71 auto base = Rcpp::Environment::base_env();
72 Rcpp::Function fun = base["dim"];
73 Rcpp::RObject output = fun(seed);
74 if (output.sexp_type() != INTSXP) {
75 auto ctype = get_class_name(my_original_seed);
76 throw std::runtime_error("'dim(<" + ctype + ">)' should return an integer vector");
77 }
78 Rcpp::IntegerVector dims(output);
79 if (dims.size() != 2 || dims[0] < 0 || dims[1] < 0) {
80 auto ctype = get_class_name(my_original_seed);
81 throw std::runtime_error("'dim(<" + ctype + ">)' should contain two non-negative integers");
82 }
83
84 my_nrow = dims[0];
85 my_ncol = dims[1];
86 }
87
88 {
89 Rcpp::Function fun = my_delayed_env["is_sparse"];
90 Rcpp::LogicalVector is_sparse = fun(seed);
91 if (is_sparse.size() != 1) {
92 auto ctype = get_class_name(my_original_seed);
93 throw std::runtime_error("'is_sparse(<" + ctype + ">)' should return a logical vector of length 1");
94 }
95 my_sparse = (is_sparse[0] != 0);
96 }
97
98 {
99 my_row_chunk_map.resize(my_nrow);
100 my_col_chunk_map.resize(my_ncol);
101
102 Rcpp::Function fun = my_delayed_env["chunkGrid"];
103 Rcpp::RObject grid = fun(seed);
104
105 if (grid == R_NilValue) {
106 my_row_max_chunk_size = 1;
107 my_col_max_chunk_size = 1;
108 std::iota(my_row_chunk_map.begin(), my_row_chunk_map.end(), static_cast<Index_>(0));
109 std::iota(my_col_chunk_map.begin(), my_col_chunk_map.end(), static_cast<Index_>(0));
110 my_row_chunk_ticks.resize(my_nrow + 1);
111 std::iota(my_row_chunk_ticks.begin(), my_row_chunk_ticks.end(), static_cast<Index_>(0));
112 my_col_chunk_ticks.resize(my_ncol + 1);
113 std::iota(my_col_chunk_ticks.begin(), my_col_chunk_ticks.end(), static_cast<Index_>(0));
114
115 // Both dense and sparse inputs are implicitly column-major, so
116 // if there isn't chunking information to the contrary, we'll
117 // favor extraction of the columns.
118 my_prefer_rows = false;
119
120 } else {
121 auto grid_cls = get_class_name(grid);
122
123 if (grid_cls == "RegularArrayGrid") {
124 Rcpp::IntegerVector spacings(Rcpp::RObject(grid.slot("spacings")));
125 if (spacings.size() != 2) {
126 auto ctype = get_class_name(seed);
127 throw std::runtime_error("'chunkGrid(<" + ctype + ">)@spacings' should be an integer vector of length 2 with non-negative values");
128 }
129
130 auto populate = [](Index_ extent, Index_ spacing, std::vector<Index_>& map, std::vector<Index_>& ticks) {
131 if (spacing == 0) {
132 ticks.push_back(0);
133 } else {
134 ticks.reserve((extent / spacing) + (extent % spacing > 0) + 1);
135 Index_ start = 0;
136 ticks.push_back(start);
137 while (start != extent) {
138 auto to_fill = std::min(spacing, extent - start);
139 std::fill_n(map.begin() + start, to_fill, ticks.size() - 1);
140 start += to_fill;
141 ticks.push_back(start);
142 }
143 }
144 };
145
146 my_row_max_chunk_size = spacings[0];
147 populate(my_nrow, my_row_max_chunk_size, my_row_chunk_map, my_row_chunk_ticks);
148 my_col_max_chunk_size = spacings[1];
149 populate(my_ncol, my_col_max_chunk_size, my_col_chunk_map, my_col_chunk_ticks);
150
151 } else if (grid_cls == "ArbitraryArrayGrid") {
152 Rcpp::List ticks(Rcpp::RObject(grid.slot("tickmarks")));
153 if (ticks.size() != 2) {
154 auto ctype = get_class_name(seed);
155 throw std::runtime_error("'chunkGrid(<" + ctype + ">)@tickmarks' should return a list of length 2");
156 }
157
158 auto populate = [](Index_ extent, const Rcpp::IntegerVector& ticks, std::vector<Index_>& map, std::vector<Index_>& new_ticks, Index_& max_chunk_size) {
159 if (ticks.size() == 0 || ticks[ticks.size() - 1] != static_cast<int>(extent)) {
160 throw std::runtime_error("invalid ticks returned by 'chunkGrid'");
161 }
162 new_ticks.resize(ticks.size() + 1);
163 std::copy(ticks.begin(), ticks.end(), new_ticks.begin() + 1);
164
165 max_chunk_size = 0;
166 int start = 0;
167 map.resize(extent);
168 Index_ counter = 0;
169
170 for (auto t : ticks) {
171 if (t < start) {
172 throw std::runtime_error("invalid ticks returned by 'chunkGrid'");
173 }
174 Index_ to_fill = t - start;
175 if (to_fill > max_chunk_size) {
176 max_chunk_size = to_fill;
177 }
178 std::fill_n(map.begin() + start, to_fill, counter);
179 ++counter;
180 start = t;
181 }
182 };
183
184 Rcpp::IntegerVector first(ticks[0]);
185 populate(my_nrow, first, my_row_chunk_map, my_row_chunk_ticks, my_row_max_chunk_size);
186 Rcpp::IntegerVector second(ticks[1]);
187 populate(my_ncol, second, my_col_chunk_map, my_col_chunk_ticks, my_col_max_chunk_size);
188
189 } else {
190 auto ctype = get_class_name(seed);
191 throw std::runtime_error("instance of unknown class '" + grid_cls + "' returned by 'chunkGrid(<" + ctype + ">)");
192 }
193
194 // Choose the dimension that requires pulling out fewer chunks.
195 auto chunks_per_row = my_col_chunk_ticks.size() - 1;
196 auto chunks_per_col = my_row_chunk_ticks.size() - 1;
197 my_prefer_rows = chunks_per_row <= chunks_per_col;
198 }
199 }
200
201 my_cache_size_in_bytes = opt.maximum_cache_size;
202 my_require_minimum_cache = opt.require_minimum_cache;
203 if (my_cache_size_in_bytes == static_cast<size_t>(-1)) {
204 Rcpp::Function fun = my_delayed_env["getAutoBlockSize"];
205 Rcpp::NumericVector bsize = fun();
206 if (bsize.size() != 1 || bsize[0] < 0) {
207 throw std::runtime_error("'getAutoBlockSize()' should return a non-negative number of bytes");
208 }
209 my_cache_size_in_bytes = bsize[0];
210 }
211 }
212
219 UnknownMatrix(Rcpp::RObject seed) : UnknownMatrix(std::move(seed), UnknownMatrixOptions()) {}
220
221private:
222 Index_ my_nrow, my_ncol;
223 bool my_sparse, my_prefer_rows;
224
225 std::vector<Index_> my_row_chunk_map, my_col_chunk_map;
226 std::vector<Index_> my_row_chunk_ticks, my_col_chunk_ticks;
227
228 // To decide how many chunks to store in the cache, we pretend the largest
229 // chunk is a good representative. This is a bit suboptimal for irregular
230 // chunks but the LruSlabCache class doesn't have a good way of dealing
231 // with this right now. The fundamental problem is that variable slabs will
232 // either (i) all reach the maximum allocation eventually, if slabs are
233 // reused, or (ii) require lots of allocations, if slabs are not reused, or
234 // (iii) require manual defragmentation, if slabs are reused in a manner
235 // that avoids inflation to the maximum allocation.
236 Index_ my_row_max_chunk_size, my_col_max_chunk_size;
237
238 size_t my_cache_size_in_bytes;
239 bool my_require_minimum_cache;
240
241 Rcpp::RObject my_original_seed;
242 Rcpp::Environment my_delayed_env, my_sparse_env;
243 Rcpp::Function my_dense_extractor, my_sparse_extractor;
244
245public:
246 Index_ nrow() const {
247 return my_nrow;
248 }
249
250 Index_ ncol() const {
251 return my_ncol;
252 }
253
254 bool is_sparse() const {
255 return my_sparse;
256 }
257
258 double is_sparse_proportion() const {
259 return static_cast<double>(my_sparse);
260 }
261
262 bool prefer_rows() const {
263 return my_prefer_rows;
264 }
265
266 double prefer_rows_proportion() const {
267 return static_cast<double>(my_prefer_rows);
268 }
269
270 bool uses_oracle(bool) const {
271 return true;
272 }
273
274private:
275 Index_ max_primary_chunk_length(bool row) const {
276 return (row ? my_row_max_chunk_size : my_col_max_chunk_size);
277 }
278
279 Index_ secondary_dim(bool row) const {
280 return (row ? my_ncol : my_nrow);
281 }
282
283 const std::vector<Index_>& chunk_ticks(bool row) const {
284 if (row) {
285 return my_row_chunk_ticks;
286 } else {
287 return my_col_chunk_ticks;
288 }
289 }
290
291 const std::vector<Index_>& chunk_map(bool row) const {
292 if (row) {
293 return my_row_chunk_map;
294 } else {
295 return my_col_chunk_map;
296 }
297 }
298
299 /********************
300 *** Myopic dense ***
301 ********************/
302private:
303 template<
304 bool oracle_,
305 template <bool, bool, typename, typename, typename> class FromDense_,
306 template <bool, bool, typename, typename, typename, typename> class FromSparse_,
307 typename ... Args_
308 >
309 std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > populate_dense_internal(bool row, Index_ non_target_length, tatami::MaybeOracle<oracle_, Index_> oracle, Args_&& ... args) const {
310 std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > output;
311
312 Index_ max_target_chunk_length = max_primary_chunk_length(row);
313 tatami_chunked::SlabCacheStats stats(max_target_chunk_length, non_target_length, my_cache_size_in_bytes, sizeof(CachedValue_), my_require_minimum_cache);
314
315 const auto& map = chunk_map(row);
316 const auto& ticks = chunk_ticks(row);
317 bool solo = (stats.max_slabs_in_cache == 0);
318
319#ifdef TATAMI_R_PARALLELIZE_UNKNOWN
320 // This involves some Rcpp initializations, so we lock it just in case.
321 auto& mexec = executor();
322 mexec.run([&]() -> void {
323#endif
324
325 if (!my_sparse) {
326 if (solo) {
327 typedef FromDense_<true, oracle_, Value_, Index_, CachedValue_> ShortDense;
328 output.reset(new ShortDense(my_original_seed, my_dense_extractor, row, std::move(oracle), std::forward<Args_>(args)..., ticks, map, stats));
329 } else {
330 typedef FromDense_<false, oracle_, Value_, Index_, CachedValue_> ShortDense;
331 output.reset(new ShortDense(my_original_seed, my_dense_extractor, row, std::move(oracle), std::forward<Args_>(args)..., ticks, map, stats));
332 }
333 } else {
334 if (solo) {
335 typedef FromSparse_<true, oracle_, Value_, Index_, CachedValue_, CachedIndex_> ShortSparse;
336 output.reset(new ShortSparse(my_original_seed, my_sparse_extractor, row, std::move(oracle), std::forward<Args_>(args)..., max_target_chunk_length, ticks, map, stats));
337 } else {
338 typedef FromSparse_<false, oracle_, Value_, Index_, CachedValue_, CachedIndex_> ShortSparse;
339 output.reset(new ShortSparse(my_original_seed, my_sparse_extractor, row, std::move(oracle), std::forward<Args_>(args)..., max_target_chunk_length, ticks, map, stats));
340 }
341 }
342
343#ifdef TATAMI_R_PARALLELIZE_UNKNOWN
344 });
345#endif
346
347 return output;
348 }
349
350 template<bool oracle_>
351 std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > populate_dense(bool row, tatami::MaybeOracle<oracle_, Index_> ora, const tatami::Options&) const {
352 Index_ non_target_dim = secondary_dim(row);
353 return populate_dense_internal<oracle_, UnknownMatrix_internal::DenseFull, UnknownMatrix_internal::DensifiedSparseFull>(row, non_target_dim, std::move(ora), non_target_dim);
354 }
355
356 template<bool oracle_>
357 std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > populate_dense(bool row, tatami::MaybeOracle<oracle_, Index_> ora, Index_ block_start, Index_ block_length, const tatami::Options&) const {
358 return populate_dense_internal<oracle_, UnknownMatrix_internal::DenseBlock, UnknownMatrix_internal::DensifiedSparseBlock>(row, block_length, std::move(ora), block_start, block_length);
359 }
360
361 template<bool oracle_>
362 std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > populate_dense(bool row, tatami::MaybeOracle<oracle_, Index_> ora, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options&) const {
363 Index_ nidx = indices_ptr->size();
364 return populate_dense_internal<oracle_, UnknownMatrix_internal::DenseIndexed, UnknownMatrix_internal::DensifiedSparseIndexed>(row, nidx, std::move(ora), std::move(indices_ptr));
365 }
366
367public:
368 std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(bool row, const tatami::Options& opt) const {
369 return populate_dense<false>(row, false, opt);
370 }
371
372 std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(bool row, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
373 return populate_dense<false>(row, false, block_start, block_length, opt);
374 }
375
376 std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(bool row, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
377 return populate_dense<false>(row, false, std::move(indices_ptr), opt);
378 }
379
380 /**********************
381 *** Oracular dense ***
382 **********************/
383public:
384 std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(bool row, std::shared_ptr<const tatami::Oracle<Index_> > ora, const tatami::Options& opt) const {
385 return populate_dense<true>(row, std::move(ora), opt);
386 }
387
388 std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(bool row, std::shared_ptr<const tatami::Oracle<Index_> > ora, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
389 return populate_dense<true>(row, std::move(ora), block_start, block_length, opt);
390 }
391
392 std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(bool row, std::shared_ptr<const tatami::Oracle<Index_> > ora, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
393 return populate_dense<true>(row, std::move(ora), std::move(indices_ptr), opt);
394 }
395
396 /*********************
397 *** Myopic sparse ***
398 *********************/
399public:
400 template<
401 bool oracle_,
402 template<bool, bool, typename, typename, typename, typename> class FromSparse_,
403 typename ... Args_
404 >
405 std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > populate_sparse_internal(
406 bool row,
407 Index_ non_target_length,
409 const tatami::Options& opt,
410 Args_&& ... args)
411 const {
412 std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > output;
413
414 Index_ max_target_chunk_length = max_primary_chunk_length(row);
415 tatami_chunked::SlabCacheStats stats(
416 max_target_chunk_length,
417 non_target_length,
418 my_cache_size_in_bytes,
419 (opt.sparse_extract_index ? sizeof(CachedIndex_) : 0) + (opt.sparse_extract_value ? sizeof(CachedValue_) : 0),
420 my_require_minimum_cache
421 );
422
423 const auto& map = chunk_map(row);
424 const auto& ticks = chunk_ticks(row);
425 bool needs_value = opt.sparse_extract_value;
426 bool needs_index = opt.sparse_extract_index;
427 bool solo = stats.max_slabs_in_cache == 0;
428
429#ifdef TATAMI_R_PARALLELIZE_UNKNOWN
430 // This involves some Rcpp initializations, so we lock it just in case.
431 auto& mexec = executor();
432 mexec.run([&]() -> void {
433#endif
434
435 if (solo) {
436 typedef FromSparse_<true, oracle_, Value_, Index_, CachedValue_, CachedIndex_> ShortSparse;
437 output.reset(new ShortSparse(my_original_seed, my_sparse_extractor, row, std::move(oracle), std::forward<Args_>(args)..., max_target_chunk_length, ticks, map, stats, needs_value, needs_index));
438 } else {
439 typedef FromSparse_<false, oracle_, Value_, Index_, CachedValue_, CachedIndex_> ShortSparse;
440 output.reset(new ShortSparse(my_original_seed, my_sparse_extractor, row, std::move(oracle), std::forward<Args_>(args)..., max_target_chunk_length, ticks, map, stats, needs_value, needs_index));
441 }
442
443#ifdef TATAMI_R_PARALLELIZE_UNKNOWN
444 });
445#endif
446
447 return output;
448 }
449
450 template<bool oracle_>
451 std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > populate_sparse(bool row, tatami::MaybeOracle<oracle_, Index_> ora, const tatami::Options& opt) const {
452 Index_ non_target_dim = secondary_dim(row);
453 return populate_sparse_internal<oracle_, UnknownMatrix_internal::SparseFull>(row, non_target_dim, std::move(ora), opt, non_target_dim);
454 }
455
456 template<bool oracle_>
457 std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > populate_sparse(bool row, tatami::MaybeOracle<oracle_, Index_> ora, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
458 return populate_sparse_internal<oracle_, UnknownMatrix_internal::SparseBlock>(row, block_length, std::move(ora), opt, block_start, block_length);
459 }
460
461 template<bool oracle_>
462 std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > populate_sparse(bool row, tatami::MaybeOracle<oracle_, Index_> ora, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
463 Index_ nidx = indices_ptr->size();
464 return populate_sparse_internal<oracle_, UnknownMatrix_internal::SparseIndexed>(row, nidx, std::move(ora), opt, std::move(indices_ptr));
465 }
466
467public:
468 std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(bool row, const tatami::Options& opt) const {
469 if (!my_sparse) {
470 return std::make_unique<tatami::FullSparsifiedWrapper<false, Value_, Index_> >(dense(row, opt), secondary_dim(row), opt);
471 } else {
472 return populate_sparse<false>(row, false, opt);
473 }
474 }
475
476 std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(bool row, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
477 if (!my_sparse) {
478 return std::make_unique<tatami::BlockSparsifiedWrapper<false, Value_, Index_> >(dense(row, block_start, block_length, opt), block_start, block_length, opt);
479 } else {
480 return populate_sparse<false>(row, false, block_start, block_length, opt);
481 }
482 }
483
484 std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(bool row, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
485 if (!my_sparse) {
486 auto index_copy = indices_ptr;
487 return std::make_unique<tatami::IndexSparsifiedWrapper<false, Value_, Index_> >(dense(row, std::move(indices_ptr), opt), std::move(index_copy), opt);
488 } else {
489 return populate_sparse<false>(row, false, std::move(indices_ptr), opt);
490 }
491 }
492
493 /**********************
494 *** Oracular sparse ***
495 **********************/
496public:
497 std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(bool row, std::shared_ptr<const tatami::Oracle<Index_> > ora, const tatami::Options& opt) const {
498 if (!my_sparse) {
499 return std::make_unique<tatami::FullSparsifiedWrapper<true, Value_, Index_> >(dense(row, std::move(ora), opt), secondary_dim(row), opt);
500 } else {
501 return populate_sparse<true>(row, std::move(ora), opt);
502 }
503 }
504
505 std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(bool row, std::shared_ptr<const tatami::Oracle<Index_> > ora, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
506 if (!my_sparse) {
507 return std::make_unique<tatami::BlockSparsifiedWrapper<true, Value_, Index_> >(dense(row, std::move(ora), block_start, block_length, opt), block_start, block_length, opt);
508 } else {
509 return populate_sparse<true>(row, std::move(ora), block_start, block_length, opt);
510 }
511 }
512
513 std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(bool row, std::shared_ptr<const tatami::Oracle<Index_> > ora, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
514 if (!my_sparse) {
515 auto index_copy = indices_ptr;
516 return std::make_unique<tatami::IndexSparsifiedWrapper<true, Value_, Index_> >(dense(row, std::move(ora), std::move(indices_ptr), opt), std::move(index_copy), opt);
517 } else {
518 return populate_sparse<true>(row, std::move(ora), std::move(indices_ptr), opt);
519 }
520 }
521};
522
523}
524
525#endif
Unknown matrix-like object in R.
Definition UnknownMatrix.hpp:51
UnknownMatrix(Rcpp::RObject seed, const UnknownMatrixOptions &opt)
Definition UnknownMatrix.hpp:59
UnknownMatrix(Rcpp::RObject seed)
Definition UnknownMatrix.hpp:219
std::shared_ptr< const std::vector< Index_ > > VectorPtr
typename std::conditional< oracle_, std::shared_ptr< const Oracle< Index_ > >, bool >::type MaybeOracle
Safely parallelize for unknown matrices.
bool sparse_extract_index
bool sparse_extract_value
Options for data extraction from an UnknownMatrix.
Definition UnknownMatrix.hpp:23
bool require_minimum_cache
Definition UnknownMatrix.hpp:35
size_t maximum_cache_size
Definition UnknownMatrix.hpp:28