tatami_r
R bindings to tatami matrices
Loading...
Searching...
No Matches
UnknownMatrix.hpp
1#ifndef TATAMI_R_UNKNOWNMATRIX_HPP
2#define TATAMI_R_UNKNOWNMATRIX_HPP
3
4#include "Rcpp.h"
5#include "tatami/tatami.hpp"
6#include "dense_extractor.hpp"
7#include "sparse_extractor.hpp"
8
#include <algorithm>
#include <memory>
#include <numeric>
#include <stdexcept>
#include <string>
#include <vector>
13
14#include "parallelize.hpp"
15
16namespace tatami_r {
17
26 size_t maximum_cache_size = -1;
27
34};
35
48template<typename Value_, typename Index_, typename CachedValue_ = Value_, typename CachedIndex_ = Index_>
49class UnknownMatrix : public tatami::Matrix<Value_, Index_> {
50public:
57 UnknownMatrix(Rcpp::RObject seed, const UnknownMatrixOptions& opt) :
58 my_original_seed(seed),
59 my_delayed_env(Rcpp::Environment::namespace_env("DelayedArray")),
60 my_sparse_env(Rcpp::Environment::namespace_env("SparseArray")),
61 my_dense_extractor(my_delayed_env["extract_array"]),
62 my_sparse_extractor(my_sparse_env["extract_sparse_array"])
63 {
64 // We assume the constructor only occurs on the main thread, so we
65 // won't bother locking things up. I'm also not sure that the
66 // operations in the initialization list are thread-safe.
67
68 {
69 auto base = Rcpp::Environment::base_env();
70 Rcpp::Function fun = base["dim"];
71 Rcpp::RObject output = fun(seed);
72 if (output.sexp_type() != INTSXP) {
73 auto ctype = get_class_name(my_original_seed);
74 throw std::runtime_error("'dim(<" + ctype + ">)' should return an integer vector");
75 }
76 Rcpp::IntegerVector dims(output);
77 if (dims.size() != 2 || dims[0] < 0 || dims[1] < 0) {
78 auto ctype = get_class_name(my_original_seed);
79 throw std::runtime_error("'dim(<" + ctype + ">)' should contain two non-negative integers");
80 }
81
82 my_nrow = dims[0];
83 my_ncol = dims[1];
84 }
85
86 {
87 Rcpp::Function fun = my_delayed_env["is_sparse"];
88 Rcpp::LogicalVector is_sparse = fun(seed);
89 if (is_sparse.size() != 1) {
90 auto ctype = get_class_name(my_original_seed);
91 throw std::runtime_error("'is_sparse(<" + ctype + ">)' should return a logical vector of length 1");
92 }
93 my_sparse = (is_sparse[0] != 0);
94 }
95
96 {
97 my_row_chunk_map.resize(my_nrow);
98 my_col_chunk_map.resize(my_ncol);
99
100 Rcpp::Function fun = my_delayed_env["chunkGrid"];
101 Rcpp::RObject grid = fun(seed);
102
103 if (grid == R_NilValue) {
104 my_row_max_chunk_size = 1;
105 my_col_max_chunk_size = 1;
106 std::iota(my_row_chunk_map.begin(), my_row_chunk_map.end(), static_cast<Index_>(0));
107 std::iota(my_col_chunk_map.begin(), my_col_chunk_map.end(), static_cast<Index_>(0));
108 my_row_chunk_ticks.resize(my_nrow + 1);
109 std::iota(my_row_chunk_ticks.begin(), my_row_chunk_ticks.end(), static_cast<Index_>(0));
110 my_col_chunk_ticks.resize(my_ncol + 1);
111 std::iota(my_col_chunk_ticks.begin(), my_col_chunk_ticks.end(), static_cast<Index_>(0));
112
113 // Both dense and sparse inputs are implicitly column-major, so
114 // if there isn't chunking information to the contrary, we'll
115 // favor extraction of the columns.
116 my_prefer_rows = false;
117
118 } else {
119 auto grid_cls = get_class_name(grid);
120
121 if (grid_cls == "RegularArrayGrid") {
122 Rcpp::IntegerVector spacings(Rcpp::RObject(grid.slot("spacings")));
123 if (spacings.size() != 2) {
124 auto ctype = get_class_name(seed);
125 throw std::runtime_error("'chunkGrid(<" + ctype + ">)@spacings' should be an integer vector of length 2 with non-negative values");
126 }
127
128 auto populate = [](Index_ extent, Index_ spacing, std::vector<Index_>& map, std::vector<Index_>& ticks) {
129 if (spacing == 0) {
130 ticks.push_back(0);
131 } else {
132 ticks.reserve((extent / spacing) + (extent % spacing > 0) + 1);
133 Index_ start = 0;
134 ticks.push_back(start);
135 while (start != extent) {
136 auto to_fill = std::min(spacing, extent - start);
137 std::fill_n(map.begin() + start, to_fill, ticks.size() - 1);
138 start += to_fill;
139 ticks.push_back(start);
140 }
141 }
142 };
143
144 my_row_max_chunk_size = spacings[0];
145 populate(my_nrow, my_row_max_chunk_size, my_row_chunk_map, my_row_chunk_ticks);
146 my_col_max_chunk_size = spacings[1];
147 populate(my_ncol, my_col_max_chunk_size, my_col_chunk_map, my_col_chunk_ticks);
148
149 } else if (grid_cls == "ArbitraryArrayGrid") {
150 Rcpp::List ticks(Rcpp::RObject(grid.slot("tickmarks")));
151 if (ticks.size() != 2) {
152 auto ctype = get_class_name(seed);
153 throw std::runtime_error("'chunkGrid(<" + ctype + ">)@tickmarks' should return a list of length 2");
154 }
155
156 auto populate = [](Index_ extent, const Rcpp::IntegerVector& ticks, std::vector<Index_>& map, std::vector<Index_>& new_ticks, Index_& max_chunk_size) {
157 if (ticks.size() == 0 || ticks[ticks.size() - 1] != static_cast<int>(extent)) {
158 throw std::runtime_error("invalid ticks returned by 'chunkGrid'");
159 }
160 new_ticks.resize(ticks.size() + 1);
161 std::copy(ticks.begin(), ticks.end(), new_ticks.begin() + 1);
162
163 max_chunk_size = 0;
164 int start = 0;
165 map.resize(extent);
166 Index_ counter = 0;
167
168 for (auto t : ticks) {
169 if (t < start) {
170 throw std::runtime_error("invalid ticks returned by 'chunkGrid'");
171 }
172 Index_ to_fill = t - start;
173 if (to_fill > max_chunk_size) {
174 max_chunk_size = to_fill;
175 }
176 std::fill_n(map.begin() + start, to_fill, counter);
177 ++counter;
178 start = t;
179 }
180 };
181
182 Rcpp::IntegerVector first(ticks[0]);
183 populate(my_nrow, first, my_row_chunk_map, my_row_chunk_ticks, my_row_max_chunk_size);
184 Rcpp::IntegerVector second(ticks[1]);
185 populate(my_ncol, second, my_col_chunk_map, my_col_chunk_ticks, my_col_max_chunk_size);
186
187 } else {
188 auto ctype = get_class_name(seed);
189 throw std::runtime_error("instance of unknown class '" + grid_cls + "' returned by 'chunkGrid(<" + ctype + ">)");
190 }
191
192 // Choose the dimension that requires pulling out fewer chunks.
193 auto chunks_per_row = my_col_chunk_ticks.size() - 1;
194 auto chunks_per_col = my_row_chunk_ticks.size() - 1;
195 my_prefer_rows = chunks_per_row <= chunks_per_col;
196 }
197 }
198
199 my_cache_size_in_bytes = opt.maximum_cache_size;
200 my_require_minimum_cache = opt.require_minimum_cache;
201 if (my_cache_size_in_bytes == static_cast<size_t>(-1)) {
202 Rcpp::Function fun = my_delayed_env["getAutoBlockSize"];
203 Rcpp::NumericVector bsize = fun();
204 if (bsize.size() != 1 || bsize[0] < 0) {
205 throw std::runtime_error("'getAutoBlockSize()' should return a non-negative number of bytes");
206 }
207 my_cache_size_in_bytes = bsize[0];
208 }
209 }
210
217 UnknownMatrix(Rcpp::RObject seed) : UnknownMatrix(std::move(seed), UnknownMatrixOptions()) {}
218
private:
    // Dimensions as reported by 'dim()' on the seed.
    Index_ my_nrow, my_ncol;
    // Whether the seed supports sparse extraction, and whether row-wise
    // extraction needs no more chunks than column-wise extraction.
    bool my_sparse, my_prefer_rows;

    // For each row/column index, the index of the chunk containing it.
    std::vector<Index_> my_row_chunk_map, my_col_chunk_map;
    // Chunk boundaries along each dimension, starting at 0 and ending at the
    // full extent of that dimension.
    std::vector<Index_> my_row_chunk_ticks, my_col_chunk_ticks;

    // To decide how many chunks to store in the cache, we pretend the largest
    // chunk is a good representative. This is a bit suboptimal for irregular
    // chunks but the LruSlabCache class doesn't have a good way of dealing
    // with this right now. The fundamental problem is that variable slabs will
    // either (i) all reach the maximum allocation eventually, if slabs are
    // reused, or (ii) require lots of allocations, if slabs are not reused, or
    // (iii) require manual defragmentation, if slabs are reused in a manner
    // that avoids inflation to the maximum allocation.
    Index_ my_row_max_chunk_size, my_col_max_chunk_size;

    // Cache configuration, taken from UnknownMatrixOptions in the constructor
    // (possibly replaced by the R-side 'getAutoBlockSize()').
    size_t my_cache_size_in_bytes;
    bool my_require_minimum_cache;

    // R-side state: the seed itself, plus the environments and functions used
    // to pull data out of it.
    Rcpp::RObject my_original_seed;
    Rcpp::Environment my_delayed_env, my_sparse_env;
    Rcpp::Function my_dense_extractor, my_sparse_extractor;
242
public:
    Index_ nrow() const {
        return my_nrow;
    }

    Index_ ncol() const {
        return my_ncol;
    }

    // Whether the seed reported sparse support via 'is_sparse()' in R.
    bool is_sparse() const {
        return my_sparse;
    }

    // Either 0 or 1, as the entire matrix is either dense or sparse.
    double is_sparse_proportion() const {
        return static_cast<double>(my_sparse);
    }

    // True if row-wise extraction touches no more chunks than column-wise.
    bool prefer_rows() const {
        return my_prefer_rows;
    }

    // Either 0 or 1, as the preference applies to the entire matrix.
    double prefer_rows_proportion() const {
        return static_cast<double>(my_prefer_rows);
    }

    // An oracle is accepted for both row and column extraction.
    bool uses_oracle(bool) const {
        return true;
    }
271
272private:
273 Index_ max_primary_chunk_length(bool row) const {
274 return (row ? my_row_max_chunk_size : my_col_max_chunk_size);
275 }
276
277 Index_ secondary_dim(bool row) const {
278 return (row ? my_ncol : my_nrow);
279 }
280
281 const std::vector<Index_>& chunk_ticks(bool row) const {
282 if (row) {
283 return my_row_chunk_ticks;
284 } else {
285 return my_col_chunk_ticks;
286 }
287 }
288
289 const std::vector<Index_>& chunk_map(bool row) const {
290 if (row) {
291 return my_row_chunk_map;
292 } else {
293 return my_col_chunk_map;
294 }
295 }
296
    /********************
     *** Myopic dense ***
     ********************/
private:
    /*
     * Shared machinery for all dense extractor factories. Dispatches on (i)
     * whether the seed is sparse (densifying its output if so) and (ii)
     * whether the cache can hold at least one slab; 'solo' extractors bypass
     * caching entirely. The chosen extractor class comes from
     * dense_extractor.hpp / sparse_extractor.hpp via the template template
     * parameters, with 'args' forwarded to its constructor.
     */
    template<
        bool oracle_,
        template <bool, bool, typename, typename, typename> class FromDense_,
        template <bool, bool, typename, typename, typename, typename> class FromSparse_,
        typename ... Args_
    >
    std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > populate_dense_internal(bool row, Index_ non_target_length, tatami::MaybeOracle<oracle_, Index_> oracle, Args_&& ... args) const {
        std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > output;

        // Cache geometry: how many slabs of the largest chunk fit in the budget.
        Index_ max_target_chunk_length = max_primary_chunk_length(row);
        tatami_chunked::SlabCacheStats stats(max_target_chunk_length, non_target_length, my_cache_size_in_bytes, sizeof(CachedValue_), my_require_minimum_cache);

        const auto& map = chunk_map(row);
        const auto& ticks = chunk_ticks(row);
        bool solo = (stats.max_slabs_in_cache == 0);

#ifdef TATAMI_R_PARALLELIZE_UNKNOWN
        // This involves some Rcpp initializations, so we lock it just in case.
        auto& mexec = executor();
        mexec.run([&]() -> void {
#endif

        if (!my_sparse) {
            if (solo) {
                typedef FromDense_<true, oracle_, Value_, Index_, CachedValue_> ShortDense;
                output.reset(new ShortDense(my_original_seed, my_dense_extractor, row, std::move(oracle), std::forward<Args_>(args)..., ticks, map, stats));
            } else {
                typedef FromDense_<false, oracle_, Value_, Index_, CachedValue_> ShortDense;
                output.reset(new ShortDense(my_original_seed, my_dense_extractor, row, std::move(oracle), std::forward<Args_>(args)..., ticks, map, stats));
            }
        } else {
            if (solo) {
                typedef FromSparse_<true, oracle_, Value_, Index_, CachedValue_, CachedIndex_> ShortSparse;
                output.reset(new ShortSparse(my_original_seed, my_sparse_extractor, row, std::move(oracle), std::forward<Args_>(args)..., max_target_chunk_length, ticks, map, stats));
            } else {
                typedef FromSparse_<false, oracle_, Value_, Index_, CachedValue_, CachedIndex_> ShortSparse;
                output.reset(new ShortSparse(my_original_seed, my_sparse_extractor, row, std::move(oracle), std::forward<Args_>(args)..., max_target_chunk_length, ticks, map, stats));
            }
        }

#ifdef TATAMI_R_PARALLELIZE_UNKNOWN
        });
#endif

        return output;
    }
347
    // Dense extraction of full rows/columns; the secondary dimension serves as
    // both the non-target length and the extractor's own dimension argument.
    template<bool oracle_>
    std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > populate_dense(bool row, tatami::MaybeOracle<oracle_, Index_> ora, const tatami::Options&) const {
        Index_ non_target_dim = secondary_dim(row);
        return populate_dense_internal<oracle_, UnknownMatrix_internal::DenseFull, UnknownMatrix_internal::DensifiedSparseFull>(row, non_target_dim, std::move(ora), non_target_dim);
    }

    // Dense extraction of a contiguous block along the non-target dimension.
    template<bool oracle_>
    std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > populate_dense(bool row, tatami::MaybeOracle<oracle_, Index_> ora, Index_ block_start, Index_ block_length, const tatami::Options&) const {
        return populate_dense_internal<oracle_, UnknownMatrix_internal::DenseBlock, UnknownMatrix_internal::DensifiedSparseBlock>(row, block_length, std::move(ora), block_start, block_length);
    }

    // Dense extraction of an arbitrary indexed subset of the non-target dimension.
    template<bool oracle_>
    std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > populate_dense(bool row, tatami::MaybeOracle<oracle_, Index_> ora, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options&) const {
        Index_ nidx = indices_ptr->size();
        return populate_dense_internal<oracle_, UnknownMatrix_internal::DenseIndexed, UnknownMatrix_internal::DensifiedSparseIndexed>(row, nidx, std::move(ora), std::move(indices_ptr));
    }
364
public:
    // Myopic (oracle-free) dense extraction; 'false' is the no-oracle placeholder.
    std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(bool row, const tatami::Options& opt) const {
        return populate_dense<false>(row, false, opt);
    }

    std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(bool row, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
        return populate_dense<false>(row, false, block_start, block_length, opt);
    }

    std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(bool row, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
        return populate_dense<false>(row, false, std::move(indices_ptr), opt);
    }
377
    /**********************
     *** Oracular dense ***
     **********************/
public:
    // Oracle-aware dense extraction; the oracle predicts future requests.
    std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(bool row, std::shared_ptr<const tatami::Oracle<Index_> > ora, const tatami::Options& opt) const {
        return populate_dense<true>(row, std::move(ora), opt);
    }

    std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(bool row, std::shared_ptr<const tatami::Oracle<Index_> > ora, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
        return populate_dense<true>(row, std::move(ora), block_start, block_length, opt);
    }

    std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(bool row, std::shared_ptr<const tatami::Oracle<Index_> > ora, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
        return populate_dense<true>(row, std::move(ora), std::move(indices_ptr), opt);
    }
393
394 /*********************
395 *** Myopic sparse ***
396 *********************/
397public:
398 template<
399 bool oracle_,
400 template<bool, bool, typename, typename, typename, typename> class FromSparse_,
401 typename ... Args_
402 >
403 std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > populate_sparse_internal(
404 bool row,
405 Index_ non_target_length,
407 const tatami::Options& opt,
408 Args_&& ... args)
409 const {
410 std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > output;
411
412 Index_ max_target_chunk_length = max_primary_chunk_length(row);
413 tatami_chunked::SlabCacheStats stats(
414 max_target_chunk_length,
415 non_target_length,
416 my_cache_size_in_bytes,
417 (opt.sparse_extract_index ? sizeof(CachedIndex_) : 0) + (opt.sparse_extract_value ? sizeof(CachedValue_) : 0),
418 my_require_minimum_cache
419 );
420
421 const auto& map = chunk_map(row);
422 const auto& ticks = chunk_ticks(row);
423 bool needs_value = opt.sparse_extract_value;
424 bool needs_index = opt.sparse_extract_index;
425 bool solo = stats.max_slabs_in_cache == 0;
426
427#ifdef TATAMI_R_PARALLELIZE_UNKNOWN
428 // This involves some Rcpp initializations, so we lock it just in case.
429 auto& mexec = executor();
430 mexec.run([&]() -> void {
431#endif
432
433 if (solo) {
434 typedef FromSparse_<true, oracle_, Value_, Index_, CachedValue_, CachedIndex_> ShortSparse;
435 output.reset(new ShortSparse(my_original_seed, my_sparse_extractor, row, std::move(oracle), std::forward<Args_>(args)..., max_target_chunk_length, ticks, map, stats, needs_value, needs_index));
436 } else {
437 typedef FromSparse_<false, oracle_, Value_, Index_, CachedValue_, CachedIndex_> ShortSparse;
438 output.reset(new ShortSparse(my_original_seed, my_sparse_extractor, row, std::move(oracle), std::forward<Args_>(args)..., max_target_chunk_length, ticks, map, stats, needs_value, needs_index));
439 }
440
441#ifdef TATAMI_R_PARALLELIZE_UNKNOWN
442 });
443#endif
444
445 return output;
446 }
447
    // Sparse extraction of full rows/columns; the secondary dimension serves as
    // both the non-target length and the extractor's own dimension argument.
    template<bool oracle_>
    std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > populate_sparse(bool row, tatami::MaybeOracle<oracle_, Index_> ora, const tatami::Options& opt) const {
        Index_ non_target_dim = secondary_dim(row);
        return populate_sparse_internal<oracle_, UnknownMatrix_internal::SparseFull>(row, non_target_dim, std::move(ora), opt, non_target_dim);
    }

    // Sparse extraction of a contiguous block along the non-target dimension.
    template<bool oracle_>
    std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > populate_sparse(bool row, tatami::MaybeOracle<oracle_, Index_> ora, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
        return populate_sparse_internal<oracle_, UnknownMatrix_internal::SparseBlock>(row, block_length, std::move(ora), opt, block_start, block_length);
    }

    // Sparse extraction of an arbitrary indexed subset of the non-target dimension.
    template<bool oracle_>
    std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > populate_sparse(bool row, tatami::MaybeOracle<oracle_, Index_> ora, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
        Index_ nidx = indices_ptr->size();
        return populate_sparse_internal<oracle_, UnknownMatrix_internal::SparseIndexed>(row, nidx, std::move(ora), opt, std::move(indices_ptr));
    }
464
465public:
466 std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(bool row, const tatami::Options& opt) const {
467 if (!my_sparse) {
468 return std::make_unique<tatami::FullSparsifiedWrapper<false, Value_, Index_> >(dense(row, opt), secondary_dim(row), opt);
469 } else {
470 return populate_sparse<false>(row, false, opt);
471 }
472 }
473
474 std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(bool row, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
475 if (!my_sparse) {
476 return std::make_unique<tatami::BlockSparsifiedWrapper<false, Value_, Index_> >(dense(row, block_start, block_length, opt), block_start, block_length, opt);
477 } else {
478 return populate_sparse<false>(row, false, block_start, block_length, opt);
479 }
480 }
481
482 std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(bool row, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
483 if (!my_sparse) {
484 auto index_copy = indices_ptr;
485 return std::make_unique<tatami::IndexSparsifiedWrapper<false, Value_, Index_> >(dense(row, std::move(indices_ptr), opt), std::move(index_copy), opt);
486 } else {
487 return populate_sparse<false>(row, false, std::move(indices_ptr), opt);
488 }
489 }
490
491 /**********************
492 *** Oracular sparse ***
493 **********************/
494public:
495 std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(bool row, std::shared_ptr<const tatami::Oracle<Index_> > ora, const tatami::Options& opt) const {
496 if (!my_sparse) {
497 return std::make_unique<tatami::FullSparsifiedWrapper<true, Value_, Index_> >(dense(row, std::move(ora), opt), secondary_dim(row), opt);
498 } else {
499 return populate_sparse<true>(row, std::move(ora), opt);
500 }
501 }
502
503 std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(bool row, std::shared_ptr<const tatami::Oracle<Index_> > ora, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
504 if (!my_sparse) {
505 return std::make_unique<tatami::BlockSparsifiedWrapper<true, Value_, Index_> >(dense(row, std::move(ora), block_start, block_length, opt), block_start, block_length, opt);
506 } else {
507 return populate_sparse<true>(row, std::move(ora), block_start, block_length, opt);
508 }
509 }
510
511 std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(bool row, std::shared_ptr<const tatami::Oracle<Index_> > ora, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
512 if (!my_sparse) {
513 auto index_copy = indices_ptr;
514 return std::make_unique<tatami::IndexSparsifiedWrapper<true, Value_, Index_> >(dense(row, std::move(ora), std::move(indices_ptr), opt), std::move(index_copy), opt);
515 } else {
516 return populate_sparse<true>(row, std::move(ora), std::move(indices_ptr), opt);
517 }
518 }
519};
520
521}
522
523#endif
Unknown matrix-like object in R.
Definition UnknownMatrix.hpp:49
UnknownMatrix(Rcpp::RObject seed, const UnknownMatrixOptions &opt)
Definition UnknownMatrix.hpp:57
UnknownMatrix(Rcpp::RObject seed)
Definition UnknownMatrix.hpp:217
tatami bindings for arbitrary R matrices.
manticore::Executor & executor()
Definition parallelize.hpp:34
typename std::conditional< oracle_, std::shared_ptr< const Oracle< Index_ > >, bool >::type MaybeOracle
std::shared_ptr< const std::vector< Index_ > > VectorPtr
Safely parallelize for unknown matrices.
bool sparse_extract_index
bool sparse_extract_value
Options for data extraction from an UnknownMatrix.
Definition UnknownMatrix.hpp:21
bool require_minimum_cache
Definition UnknownMatrix.hpp:33
size_t maximum_cache_size
Definition UnknownMatrix.hpp:26