tatami_hdf5
tatami bindings for HDF5-backed matrices
Loading...
Searching...
No Matches
CompressedSparseMatrix.hpp
Go to the documentation of this file.
1#ifndef TATAMI_HDF5_SPARSE_MATRIX_HPP
2#define TATAMI_HDF5_SPARSE_MATRIX_HPP
3
4#include "sparse_primary.hpp"
5#include "sparse_secondary.hpp"
6#include "serialize.hpp"
7#include "utils.hpp"
8
9#include <string>
10#include <vector>
11#include <algorithm>
12#include <cstddef>
13
14#include "H5Cpp.h"
15#include "tatami/tatami.hpp"
16#include "sanisizer/sanisizer.hpp"
17
24namespace tatami_hdf5 {
25
39 std::size_t maximum_cache_size = sanisizer::cap<std::size_t>(100000000);
40};
41
71template<typename Value_, typename Index_, typename CachedValue_ = Value_, typename CachedIndex_ = Index_>
72class CompressedSparseMatrix final : public tatami::Matrix<Value_, Index_> {
73 Index_ my_nrow, my_ncol;
74 std::string my_file_name, my_value_name, my_index_name;
75 std::vector<hsize_t> pointers;
76 bool my_csr;
77
78 std::size_t my_slab_cache_size; // our own cache of slabs.
79 Index_ my_max_non_zeros;
80 CompressedSparseMatrix_internal::ChunkCacheSizes my_chunk_cache_sizes; // HDF5's cache of uncompressed chunks.
81
82public:
97 Index_ nrow,
98 Index_ ncol,
99 std::string file_name,
100 std::string value_name,
101 std::string index_name,
102 std::string pointer_name,
103 bool csr,
104 const CompressedSparseMatrixOptions& options
105 ) :
106 my_nrow(nrow),
107 my_ncol(ncol),
108 my_file_name(std::move(file_name)),
109 my_value_name(std::move(value_name)),
110 my_index_name(std::move(index_name)),
111 my_csr(csr),
112 my_slab_cache_size(options.maximum_cache_size)
113 {
114 // Here, the 'primary' dimension refers to the dimension by which the non-zero elements are grouped.
115 // The secondary dimension is, well, the other dimension.
116 Index_ primary_dim = my_csr ? my_nrow : my_ncol;
117 Index_ secondary_dim = my_csr ? my_ncol : my_nrow;
118
119 auto dim_as_str = [](bool row) -> std::string {
120 if (row) {
121 return "rows";
122 } else {
123 return "columns";
124 }
125 };
126
127 serialize([&]() -> void {
128 H5::H5File file_handle(my_file_name, H5F_ACC_RDONLY);
129 auto dhandle = open_and_check_dataset<false>(file_handle, my_value_name);
130 hsize_t nonzeros = get_array_dimensions<1>(dhandle, "value_name")[0];
131
132 auto ihandle = open_and_check_dataset<true>(file_handle, my_index_name);
133 if (get_array_dimensions<1>(ihandle, "index_name")[0] != nonzeros) {
134 throw std::runtime_error("number of non-zero elements is not consistent between 'value_name' and 'index_name'");
135 }
136
137 auto phandle = open_and_check_dataset<true>(file_handle, pointer_name);
138 auto ptr_size = get_array_dimensions<1>(phandle, "pointer_name")[0];
139 if (ptr_size == 0 || !sanisizer::is_equal(ptr_size - 1, primary_dim)) {
140 throw std::runtime_error("'pointer_name' dataset should have length equal to the number of " + dim_as_str(my_csr) + " plus 1");
141 }
142
143 auto dparms = dhandle.getCreatePlist();
144 if (dparms.getLayout() == H5D_CHUNKED) {
145 hsize_t dchunk_length;
146 dparms.getChunk(1, &dchunk_length);
147 my_chunk_cache_sizes.value = CompressedSparseMatrix_internal::compute_chunk_cache_size(nonzeros, dchunk_length, dhandle.getDataType().getSize());
148 }
149
150 auto iparms = ihandle.getCreatePlist();
151 if (iparms.getLayout() == H5D_CHUNKED) {
152 hsize_t ichunk_length;
153 iparms.getChunk(1, &ichunk_length);
154 my_chunk_cache_sizes.index = CompressedSparseMatrix_internal::compute_chunk_cache_size(nonzeros, ichunk_length, ihandle.getDataType().getSize());
155 }
156
157 // Checking the contents of the index pointers.
158 pointers.resize(sanisizer::cast<decltype(pointers.size())>(ptr_size));
159 phandle.read(pointers.data(), H5::PredType::NATIVE_HSIZE);
160 if (pointers[0] != 0) {
161 throw std::runtime_error("first index pointer should be zero");
162 }
163 if (pointers.back() != nonzeros) {
164 throw std::runtime_error("last index pointer should be equal to the number of non-zero elements");
165 }
166 });
167
168 my_max_non_zeros = 0;
169 for (Index_ i = 0; i < primary_dim; ++i) {
170 if (pointers[i+1] < pointers[i]) {
171 throw std::runtime_error("pointers should be ordered");
172 }
173 auto diff = pointers[i+1] - pointers[i];
174 if (sanisizer::is_greater_than(diff, secondary_dim)) {
175 throw std::runtime_error("differences between pointers should be no greater than the number of " + dim_as_str(!my_csr));
176 }
177 if (sanisizer::is_greater_than(diff, my_max_non_zeros)) {
178 my_max_non_zeros = diff; // cast is safe, because we know that it's less than the secondary_dim.
179 }
180 }
181 }
182
193 CompressedSparseMatrix(Index_ nrow, Index_ ncol, std::string file_name, std::string value_name, std::string index_name, std::string pointer_name, bool csr) :
194 CompressedSparseMatrix(nrow, ncol, std::move(file_name), std::move(value_name), std::move(index_name), std::move(pointer_name), csr, CompressedSparseMatrixOptions()) {}
195
196public:
197 Index_ nrow() const {
198 return my_nrow;
199 }
200
201 Index_ ncol() const {
202 return my_ncol;
203 }
204
205 bool is_sparse() const {
206 return true;
207 }
208
209 double is_sparse_proportion() const {
210 return 1;
211 }
212
213 bool prefer_rows() const {
214 return my_csr;
215 }
216
217 double prefer_rows_proportion() const {
218 return static_cast<double>(my_csr);
219 }
220
221 bool uses_oracle(bool) const {
222 return true;
223 }
224
225 using tatami::Matrix<Value_, Index_>::dense;
226
227 using tatami::Matrix<Value_, Index_>::sparse;
228
229 /**************************************
230 ************ Myopic dense ************
231 **************************************/
232private:
233 CompressedSparseMatrix_internal::MatrixDetails<Index_> details() const {
234 return CompressedSparseMatrix_internal::MatrixDetails<Index_>(
235 my_file_name,
236 my_value_name,
237 my_index_name,
238 (my_csr ? my_nrow : my_ncol),
239 (my_csr ? my_ncol : my_nrow),
240 pointers,
241 my_slab_cache_size,
242 my_max_non_zeros,
243 my_chunk_cache_sizes
244 );
245 }
246
247 template<bool oracle_>
248 std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > populate_dense(bool row, tatami::MaybeOracle<oracle_, Index_> oracle, const tatami::Options&) const {
249 if (row == my_csr) {
250 return std::make_unique<CompressedSparseMatrix_internal::PrimaryFullDense<oracle_, Value_, Index_, CachedValue_, CachedIndex_> >(
251 details(), std::move(oracle)
252 );
253 } else {
254 return std::make_unique<CompressedSparseMatrix_internal::SecondaryFullDense<oracle_, Value_, Index_, CachedValue_> >(
255 details(), std::move(oracle)
256 );
257 }
258 }
259
260 template<bool oracle_>
261 std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > populate_dense(bool row, tatami::MaybeOracle<oracle_, Index_> oracle, Index_ block_start, Index_ block_length, const tatami::Options&) const {
262 if (row == my_csr) {
263 return std::make_unique<CompressedSparseMatrix_internal::PrimaryBlockDense<oracle_, Value_, Index_, CachedValue_, CachedIndex_> >(
264 details(), std::move(oracle), block_start, block_length
265 );
266 } else {
267 return std::make_unique<CompressedSparseMatrix_internal::SecondaryBlockDense<oracle_, Value_, Index_, CachedValue_> >(
268 details(), std::move(oracle), block_start, block_length
269 );
270 }
271 }
272
273 template<bool oracle_>
274 std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > populate_dense(bool row, tatami::MaybeOracle<oracle_, Index_> oracle, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options&) const {
275 if (row == my_csr) {
276 return std::make_unique<CompressedSparseMatrix_internal::PrimaryIndexDense<oracle_, Value_, Index_, CachedValue_, CachedIndex_> >(
277 details(), std::move(oracle), std::move(indices_ptr)
278 );
279 } else {
280 return std::make_unique<CompressedSparseMatrix_internal::SecondaryIndexDense<oracle_, Value_, Index_, CachedValue_> >(
281 details(), std::move(oracle), std::move(indices_ptr)
282 );
283 }
284 }
285
286public:
287 std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(bool row, const tatami::Options& opt) const {
288 return populate_dense<false>(row, false, opt);
289 }
290
291 std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(bool row, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
292 return populate_dense<false>(row, false, block_start, block_length, opt);
293 }
294
295 std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(bool row, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
296 return populate_dense<false>(row, false, std::move(indices_ptr), opt);
297 }
298
299 /***************************************
300 ************ Myopic sparse ************
301 ***************************************/
302private:
303 template<bool oracle_>
304 std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > populate_sparse(bool row, tatami::MaybeOracle<oracle_, Index_> oracle, const tatami::Options& opt) const {
305 if (row == my_csr) {
306 return std::make_unique<CompressedSparseMatrix_internal::PrimaryFullSparse<oracle_, Value_, Index_, CachedValue_, CachedIndex_> >(
307 details(), std::move(oracle), opt.sparse_extract_value, opt.sparse_extract_index
308 );
309 } else {
310 return std::make_unique<CompressedSparseMatrix_internal::SecondaryFullSparse<oracle_, Value_, Index_, CachedValue_> >(
311 details(), std::move(oracle), opt.sparse_extract_value, opt.sparse_extract_index
312 );
313 }
314 }
315
316 template<bool oracle_>
317 std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > populate_sparse(bool row, tatami::MaybeOracle<oracle_, Index_> oracle, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
318 if (row == my_csr) {
319 return std::make_unique<CompressedSparseMatrix_internal::PrimaryBlockSparse<oracle_, Value_, Index_, CachedValue_, CachedIndex_> >(
320 details(), std::move(oracle), block_start, block_length, opt.sparse_extract_value, opt.sparse_extract_index
321 );
322 } else {
323 return std::make_unique<CompressedSparseMatrix_internal::SecondaryBlockSparse<oracle_, Value_, Index_, CachedValue_> >(
324 details(), std::move(oracle), block_start, block_length, opt.sparse_extract_value, opt.sparse_extract_index
325 );
326 }
327 }
328
329 template<bool oracle_>
330 std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > populate_sparse(bool row, tatami::MaybeOracle<oracle_, Index_> oracle, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
331 if (row == my_csr) {
332 return std::make_unique<CompressedSparseMatrix_internal::PrimaryIndexSparse<oracle_, Value_, Index_, CachedValue_, CachedIndex_> >(
333 details(), std::move(oracle), std::move(indices_ptr), opt.sparse_extract_value, opt.sparse_extract_index
334 );
335 } else {
336 return std::make_unique<CompressedSparseMatrix_internal::SecondaryIndexSparse<oracle_, Value_, Index_, CachedValue_> >(
337 details(), std::move(oracle), std::move(indices_ptr), opt.sparse_extract_value, opt.sparse_extract_index
338 );
339 }
340 }
341
342public:
343 std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(bool row, const tatami::Options& opt) const {
344 return populate_sparse<false>(row, false, opt);
345 }
346
347 std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(bool row, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
348 return populate_sparse<false>(row, false, block_start, block_length, opt);
349 }
350
351 std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(bool row, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
352 return populate_sparse<false>(row, false, std::move(indices_ptr), opt);
353 }
354
355 /****************************************
356 ************ Oracular dense ************
357 ****************************************/
358public:
359 std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(bool row, std::shared_ptr<const tatami::Oracle<Index_> > oracle, const tatami::Options& opt) const {
360 return populate_dense<true>(row, std::move(oracle), opt);
361 }
362
363 std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(bool row, std::shared_ptr<const tatami::Oracle<Index_> > oracle, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
364 return populate_dense<true>(row, std::move(oracle), block_start, block_length, opt);
365 }
366
367 std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(bool row, std::shared_ptr<const tatami::Oracle<Index_> > oracle, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
368 return populate_dense<true>(row, std::move(oracle), std::move(indices_ptr), opt);
369 }
370
371 /*****************************************
372 ************ Oracular sparse ************
373 *****************************************/
374public:
375 std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(bool row, std::shared_ptr<const tatami::Oracle<Index_> > oracle, const tatami::Options& opt) const {
376 return populate_sparse<true>(row, std::move(oracle), opt);
377 }
378
379 std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(bool row, std::shared_ptr<const tatami::Oracle<Index_> > oracle, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
380 return populate_sparse<true>(row, std::move(oracle), block_start, block_length, opt);
381 }
382
383 std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(bool row, std::shared_ptr<const tatami::Oracle<Index_> > oracle, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
384 return populate_sparse<true>(row, std::move(oracle), std::move(indices_ptr), opt);
385 }
386};
387
388}
389
390#endif
Compressed sparse matrix in an HDF5 file.
Definition CompressedSparseMatrix.hpp:72
CompressedSparseMatrix(Index_ nrow, Index_ ncol, std::string file_name, std::string value_name, std::string index_name, std::string pointer_name, bool csr)
Definition CompressedSparseMatrix.hpp:193
CompressedSparseMatrix(Index_ nrow, Index_ ncol, std::string file_name, std::string value_name, std::string index_name, std::string pointer_name, bool csr, const CompressedSparseMatrixOptions &options)
Definition CompressedSparseMatrix.hpp:96
Representations for matrix data in HDF5 files.
Definition CompressedSparseMatrix.hpp:24
void serialize(Function_ f)
Definition serialize.hpp:53
std::shared_ptr< const std::vector< Index_ > > VectorPtr
typename std::conditional< oracle_, std::shared_ptr< const Oracle< Index_ > >, bool >::type MaybeOracle
Default locking for serial access.
bool sparse_extract_index
bool sparse_extract_value
Options for HDF5 extraction.
Definition CompressedSparseMatrix.hpp:29
std::size_t maximum_cache_size
Definition CompressedSparseMatrix.hpp:39
Utilities for HDF5 extraction.