tatami_hdf5
tatami bindings for HDF5-backed matrices
Loading...
Searching...
No Matches
CompressedSparseMatrix.hpp
Go to the documentation of this file.
1#ifndef TATAMI_HDF5_SPARSE_MATRIX_HPP
2#define TATAMI_HDF5_SPARSE_MATRIX_HPP
3
4#include "H5Cpp.h"
5
6#include <string>
7#include <vector>
8#include <algorithm>
9
10#include "tatami/tatami.hpp"
11
12#include "sparse_primary.hpp"
13#include "sparse_secondary.hpp"
14#include "serialize.hpp"
15#include "utils.hpp"
16
23namespace tatami_hdf5 {
24
38 size_t maximum_cache_size = 100000000;
39};
40
70template<typename Value_, typename Index_, typename CachedValue_ = Value_, typename CachedIndex_ = Index_>
71class CompressedSparseMatrix : public tatami::Matrix<Value_, Index_> {
72 Index_ my_nrow, my_ncol;
73 std::string my_file_name, my_value_name, my_index_name;
74 std::vector<hsize_t> pointers;
75 bool my_csr;
76
77 // We distinguish between our own cache of slabs versus HDF5's cache of uncompressed chunks.
78 size_t my_slab_cache_size;
79 size_t my_max_non_zeros;
80 size_t my_chunk_cache_size;
81
82public:
96 CompressedSparseMatrix(Index_ nrow, Index_ ncol, std::string file_name, std::string value_name, std::string index_name, std::string pointer_name, bool csr, const CompressedSparseMatrixOptions& options) :
97 my_nrow(nrow),
98 my_ncol(ncol),
99 my_file_name(std::move(file_name)),
100 my_value_name(std::move(value_name)),
101 my_index_name(std::move(index_name)),
102 my_csr(csr),
103 my_slab_cache_size(options.maximum_cache_size)
104 {
105 serialize([&]() -> void {
106 H5::H5File file_handle(my_file_name, H5F_ACC_RDONLY);
107 auto dhandle = open_and_check_dataset<false>(file_handle, my_value_name);
108 hsize_t nonzeros = get_array_dimensions<1>(dhandle, "value_name")[0];
109
110 auto ihandle = open_and_check_dataset<true>(file_handle, my_index_name);
111 if (get_array_dimensions<1>(ihandle, "index_name")[0] != nonzeros) {
112 throw std::runtime_error("number of non-zero elements is not consistent between 'value_name' and 'index_name'");
113 }
114
115 auto phandle = open_and_check_dataset<true>(file_handle, pointer_name);
116 size_t ptr_size = get_array_dimensions<1>(phandle, "pointer_name")[0];
117 size_t dim_p1 = static_cast<size_t>(my_csr ? my_nrow : my_ncol) + 1;
118 if (ptr_size != dim_p1) {
119 throw std::runtime_error("'pointer_name' dataset should have length equal to the number of " + (my_csr ? std::string("rows") : std::string("columns")) + " plus 1");
120 }
121
122 // We aim to store two chunks in HDF5's chunk cache; one
123 // overlapping the start of the primary dimension element's range,
124 // and one overlapping the end, so that we don't re-read the
125 // content for the new primary dimension element. To simplify
126 // matters, we just read the chunk sizes (in bytes) for both
127 // datasets and use the larger chunk size for both datasets.
128 // Hopefully the chunks are not too big...
129 hsize_t dchunk_length = 0;
130 size_t dchunk_element_size = 0;
131 auto dparms = dhandle.getCreatePlist();
132 if (dparms.getLayout() == H5D_CHUNKED) {
133 dparms.getChunk(1, &dchunk_length);
134 dchunk_element_size = dhandle.getDataType().getSize();
135 }
136
137 hsize_t ichunk_length = 0;
138 size_t ichunk_element_size = 0;
139 auto iparms = ihandle.getCreatePlist();
140 if (iparms.getLayout() == H5D_CHUNKED) {
141 iparms.getChunk(1, &ichunk_length);
142 ichunk_element_size = ihandle.getDataType().getSize();
143 }
144
145 auto non_overflow_double_min = [nonzeros](hsize_t chunk_length) -> size_t {
146 // Basically computes std::min(chunk_length * 2, nonzeros) without
147 // overflowing hsize_t, for a potentially silly choice of hsize_t...
148 if (chunk_length < nonzeros) {
149 return nonzeros;
150 } else {
151 return chunk_length + std::min(chunk_length, nonzeros - chunk_length);
152 }
153 };
154
155 my_chunk_cache_size = std::max(
156 non_overflow_double_min(ichunk_length) * ichunk_element_size,
157 non_overflow_double_min(dchunk_length) * dchunk_element_size
158 );
159
160 // Checking the contents of the index pointers.
161 pointers.resize(dim_p1);
162 phandle.read(pointers.data(), H5::PredType::NATIVE_HSIZE);
163 if (pointers[0] != 0) {
164 throw std::runtime_error("first index pointer should be zero");
165 }
166 if (pointers.back() != nonzeros) {
167 throw std::runtime_error("last index pointer should be equal to the number of non-zero elements");
168 }
169 });
170
171 my_max_non_zeros = 0;
172 for (size_t i = 1; i < pointers.size(); ++i) {
173 hsize_t diff = pointers[i] - pointers[i-1];
174 if (diff > my_max_non_zeros) {
175 my_max_non_zeros = diff;
176 }
177 }
178 }
179
190 CompressedSparseMatrix(Index_ ncsr, Index_ ncol, std::string file_name, std::string value_name, std::string index_name, std::string pointer_name, bool csr) :
191 CompressedSparseMatrix(ncsr, ncol, std::move(file_name), std::move(value_name), std::move(index_name), std::move(pointer_name), csr, CompressedSparseMatrixOptions()) {}
192
193public:
194 Index_ nrow() const {
195 return my_nrow;
196 }
197
198 Index_ ncol() const {
199 return my_ncol;
200 }
201
202 bool is_sparse() const {
203 return true;
204 }
205
206 double is_sparse_proportion() const {
207 return 1;
208 }
209
210 bool prefer_rows() const {
211 return my_csr;
212 }
213
214 double prefer_rows_proportion() const {
215 return static_cast<double>(my_csr);
216 }
217
218 bool uses_oracle(bool) const {
219 return true;
220 }
221
222 using tatami::Matrix<Value_, Index_>::dense;
223
224 using tatami::Matrix<Value_, Index_>::sparse;
225
226 /**************************************
227 ************ Myopic dense ************
228 **************************************/
229private:
230 CompressedSparseMatrix_internal::MatrixDetails<Index_> details() const {
231 return CompressedSparseMatrix_internal::MatrixDetails<Index_>(
232 my_file_name,
233 my_value_name,
234 my_index_name,
235 (my_csr ? my_nrow : my_ncol),
236 (my_csr ? my_ncol : my_nrow),
237 pointers,
238 my_slab_cache_size,
239 my_max_non_zeros,
240 my_chunk_cache_size
241 );
242 }
243
244 template<bool oracle_>
245 std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > populate_dense(bool row, tatami::MaybeOracle<oracle_, Index_> oracle, const tatami::Options&) const {
246 if (row == my_csr) {
247 return std::make_unique<CompressedSparseMatrix_internal::PrimaryFullDense<oracle_, Value_, Index_, CachedValue_, CachedIndex_> >(
248 details(), std::move(oracle)
249 );
250 } else {
251 return std::make_unique<CompressedSparseMatrix_internal::SecondaryFullDense<oracle_, Value_, Index_, CachedValue_> >(
252 details(), std::move(oracle)
253 );
254 }
255 }
256
257 template<bool oracle_>
258 std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > populate_dense(bool row, tatami::MaybeOracle<oracle_, Index_> oracle, Index_ block_start, Index_ block_length, const tatami::Options&) const {
259 if (row == my_csr) {
260 return std::make_unique<CompressedSparseMatrix_internal::PrimaryBlockDense<oracle_, Value_, Index_, CachedValue_, CachedIndex_> >(
261 details(), std::move(oracle), block_start, block_length
262 );
263 } else {
264 return std::make_unique<CompressedSparseMatrix_internal::SecondaryBlockDense<oracle_, Value_, Index_, CachedValue_> >(
265 details(), std::move(oracle), block_start, block_length
266 );
267 }
268 }
269
270 template<bool oracle_>
271 std::unique_ptr<tatami::DenseExtractor<oracle_, Value_, Index_> > populate_dense(bool row, tatami::MaybeOracle<oracle_, Index_> oracle, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options&) const {
272 if (row == my_csr) {
273 return std::make_unique<CompressedSparseMatrix_internal::PrimaryIndexDense<oracle_, Value_, Index_, CachedValue_, CachedIndex_> >(
274 details(), std::move(oracle), std::move(indices_ptr)
275 );
276 } else {
277 return std::make_unique<CompressedSparseMatrix_internal::SecondaryIndexDense<oracle_, Value_, Index_, CachedValue_> >(
278 details(), std::move(oracle), std::move(indices_ptr)
279 );
280 }
281 }
282
283public:
284 std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(bool row, const tatami::Options& opt) const {
285 return populate_dense<false>(row, false, opt);
286 }
287
288 std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(bool row, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
289 return populate_dense<false>(row, false, block_start, block_length, opt);
290 }
291
292 std::unique_ptr<tatami::MyopicDenseExtractor<Value_, Index_> > dense(bool row, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
293 return populate_dense<false>(row, false, std::move(indices_ptr), opt);
294 }
295
296 /***************************************
297 ************ Myopic sparse ************
298 ***************************************/
299private:
300 template<bool oracle_>
301 std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > populate_sparse(bool row, tatami::MaybeOracle<oracle_, Index_> oracle, const tatami::Options& opt) const {
302 if (row == my_csr) {
303 return std::make_unique<CompressedSparseMatrix_internal::PrimaryFullSparse<oracle_, Value_, Index_, CachedValue_, CachedIndex_> >(
304 details(), std::move(oracle), opt.sparse_extract_value, opt.sparse_extract_index
305 );
306 } else {
307 return std::make_unique<CompressedSparseMatrix_internal::SecondaryFullSparse<oracle_, Value_, Index_, CachedValue_> >(
308 details(), std::move(oracle), opt.sparse_extract_value, opt.sparse_extract_index
309 );
310 }
311 }
312
313 template<bool oracle_>
314 std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > populate_sparse(bool row, tatami::MaybeOracle<oracle_, Index_> oracle, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
315 if (row == my_csr) {
316 return std::make_unique<CompressedSparseMatrix_internal::PrimaryBlockSparse<oracle_, Value_, Index_, CachedValue_, CachedIndex_> >(
317 details(), std::move(oracle), block_start, block_length, opt.sparse_extract_value, opt.sparse_extract_index
318 );
319 } else {
320 return std::make_unique<CompressedSparseMatrix_internal::SecondaryBlockSparse<oracle_, Value_, Index_, CachedValue_> >(
321 details(), std::move(oracle), block_start, block_length, opt.sparse_extract_value, opt.sparse_extract_index
322 );
323 }
324 }
325
326 template<bool oracle_>
327 std::unique_ptr<tatami::SparseExtractor<oracle_, Value_, Index_> > populate_sparse(bool row, tatami::MaybeOracle<oracle_, Index_> oracle, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
328 if (row == my_csr) {
329 return std::make_unique<CompressedSparseMatrix_internal::PrimaryIndexSparse<oracle_, Value_, Index_, CachedValue_, CachedIndex_> >(
330 details(), std::move(oracle), std::move(indices_ptr), opt.sparse_extract_value, opt.sparse_extract_index
331 );
332 } else {
333 return std::make_unique<CompressedSparseMatrix_internal::SecondaryIndexSparse<oracle_, Value_, Index_, CachedValue_> >(
334 details(), std::move(oracle), std::move(indices_ptr), opt.sparse_extract_value, opt.sparse_extract_index
335 );
336 }
337 }
338
339public:
340 std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(bool row, const tatami::Options& opt) const {
341 return populate_sparse<false>(row, false, opt);
342 }
343
344 std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(bool row, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
345 return populate_sparse<false>(row, false, block_start, block_length, opt);
346 }
347
348 std::unique_ptr<tatami::MyopicSparseExtractor<Value_, Index_> > sparse(bool row, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
349 return populate_sparse<false>(row, false, std::move(indices_ptr), opt);
350 }
351
352 /****************************************
353 ************ Oracular dense ************
354 ****************************************/
355public:
356 std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(bool row, std::shared_ptr<const tatami::Oracle<Index_> > oracle, const tatami::Options& opt) const {
357 return populate_dense<true>(row, std::move(oracle), opt);
358 }
359
360 std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(bool row, std::shared_ptr<const tatami::Oracle<Index_> > oracle, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
361 return populate_dense<true>(row, std::move(oracle), block_start, block_length, opt);
362 }
363
364 std::unique_ptr<tatami::OracularDenseExtractor<Value_, Index_> > dense(bool row, std::shared_ptr<const tatami::Oracle<Index_> > oracle, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
365 return populate_dense<true>(row, std::move(oracle), std::move(indices_ptr), opt);
366 }
367
368 /*****************************************
369 ************ Oracular sparse ************
370 *****************************************/
371public:
372 std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(bool row, std::shared_ptr<const tatami::Oracle<Index_> > oracle, const tatami::Options& opt) const {
373 return populate_sparse<true>(row, std::move(oracle), opt);
374 }
375
376 std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(bool row, std::shared_ptr<const tatami::Oracle<Index_> > oracle, Index_ block_start, Index_ block_length, const tatami::Options& opt) const {
377 return populate_sparse<true>(row, std::move(oracle), block_start, block_length, opt);
378 }
379
380 std::unique_ptr<tatami::OracularSparseExtractor<Value_, Index_> > sparse(bool row, std::shared_ptr<const tatami::Oracle<Index_> > oracle, tatami::VectorPtr<Index_> indices_ptr, const tatami::Options& opt) const {
381 return populate_sparse<true>(row, std::move(oracle), std::move(indices_ptr), opt);
382 }
383};
384
385}
386
387#endif
Compressed sparse matrix in a HDF5 file.
Definition CompressedSparseMatrix.hpp:71
CompressedSparseMatrix(Index_ ncsr, Index_ ncol, std::string file_name, std::string value_name, std::string index_name, std::string pointer_name, bool csr)
Definition CompressedSparseMatrix.hpp:190
CompressedSparseMatrix(Index_ nrow, Index_ ncol, std::string file_name, std::string value_name, std::string index_name, std::string pointer_name, bool csr, const CompressedSparseMatrixOptions &options)
Definition CompressedSparseMatrix.hpp:96
Representations for matrix data in HDF5 files.
Definition CompressedSparseMatrix.hpp:23
void serialize(Function_ f)
Definition serialize.hpp:45
typename std::conditional< oracle_, std::shared_ptr< const Oracle< Index_ > >, bool >::type MaybeOracle
std::shared_ptr< const std::vector< Index_ > > VectorPtr
Default locking for serial access.
bool sparse_extract_index
bool sparse_extract_value
Options for HDF5 extraction.
Definition CompressedSparseMatrix.hpp:28
size_t maximum_cache_size
Definition CompressedSparseMatrix.hpp:38
Utilities for HDF5 extraction.