1#ifndef TATAMI_WRITE_SPARSE_MATRIX_TO_HDF5_HPP
2#define TATAMI_WRITE_SPARSE_MATRIX_TO_HDF5_HPP
106inline H5::DataSet create_1d_compressed_hdf5_dataset(H5::Group& location,
WriteStorageType type,
const std::string& name, hsize_t length,
int deflate_level, hsize_t chunk) {
107 H5::DataSpace dspace(1, &length);
108 H5::DSetCreatPropList plist;
110 if (deflate_level >= 0 && length) {
111 plist.setDeflate(deflate_level);
112 if (chunk > length) {
113 plist.setChunk(1, &length);
115 plist.setChunk(1, &chunk);
119 const auto dtype = choose_pred_type(type);
120 return location.createDataSet(name, *dtype, dspace, plist);
123template<
typename Type_>
124bool does_non_negative_integer_fit(
const WriteStorageType type,
const Type_ x) {
125 static_assert(std::is_integral<Type_>::value);
129 case WriteStorageType::INT8:
130 okay = fits_upper_limit<std::int8_t>(x);
132 case WriteStorageType::UINT8:
133 okay = fits_upper_limit<std::uint8_t>(x);
135 case WriteStorageType::INT16:
136 okay = fits_upper_limit<std::int16_t >(x);
138 case WriteStorageType::UINT16:
139 okay = fits_upper_limit<std::uint16_t>(x);
141 case WriteStorageType::INT32:
142 okay = fits_upper_limit<std::int32_t >(x);
144 case WriteStorageType::UINT32:
145 okay = fits_upper_limit<std::uint32_t>(x);
147 case WriteStorageType::INT64:
148 okay = fits_upper_limit<std::int64_t >(x);
150 case WriteStorageType::UINT64:
151 okay = fits_upper_limit<std::uint64_t>(x);
160template<
typename Index_>
161WriteStorageType choose_index_type(
const std::optional<WriteStorageType>& index_type, Index_ upper_index) {
162 static_assert(std::is_integral<Index_>::value);
164 if (!index_type.has_value()) {
165 if (fits_upper_limit<std::uint8_t>(upper_index)) {
166 return WriteStorageType::UINT8;
167 }
else if (fits_upper_limit<std::uint16_t>(upper_index)) {
168 return WriteStorageType::UINT16;
169 }
else if (fits_upper_limit<std::uint32_t>(upper_index)) {
170 return WriteStorageType::UINT32;
171 }
else if (fits_upper_limit<std::uint64_t>(upper_index)) {
172 return WriteStorageType::UINT64;
174 throw std::runtime_error(
"no type can store the largest index");
177 const auto itype = *index_type;
178 if (!does_non_negative_integer_fit(itype, upper_index)) {
179 throw std::runtime_error(
"specified type cannot store the largest index");
185inline WriteStorageType choose_ptr_type(
const std::optional<WriteStorageType>& ptr_type, hsize_t nnzero) {
186 if (!ptr_type.has_value()) {
187 if (fits_upper_limit<std::uint32_t>(nnzero)) {
188 return WriteStorageType::UINT32;
189 }
else if (fits_upper_limit<std::uint64_t>(nnzero)) {
190 return WriteStorageType::UINT64;
193 throw std::runtime_error(
"no type can store the number of non-zero elements");
196 const auto ptype = *ptr_type;
197 if (!does_non_negative_integer_fit(ptype, nnzero)) {
198 throw std::runtime_error(
"specified type cannot store the number of non-zero elements");
204template<
typename Value_,
typename Index_>
205struct WriteSparseHdf5Statistics {
206 Value_ lower_data = 0;
207 Value_ upper_data = 0;
208 Index_ upper_index = 0;
209 hsize_t non_zeros = 0;
210 bool has_decimal =
false;
211 bool has_nonfinite =
false;
213 void add_value(Value_ val) {
214 if constexpr(!std::is_integral<Value_>::value) {
215 if (std::trunc(val) != val) {
218 if (!std::isfinite(val)) {
219 has_nonfinite =
true;
223 if (val < lower_data) {
225 }
else if (val > upper_data) {
230 void add_index(Index_ idx) {
231 if (idx > upper_index) {
237template<
typename Value_,
typename Index_>
240 output.non_zeros = sanisizer::sum<hsize_t>(output.non_zeros, extracted.
number);
242 for (Index_ i = 0; i < extracted.
number; ++i) {
243 output.add_value(extracted.
value[i]);
246 for (Index_ i = 0; i < extracted.
number; ++i) {
247 output.add_index(extracted.
index[i]);
251template<
typename Value_,
typename Index_>
252void update_hdf5_stats(
const Value_* extracted, Index_ n, WriteSparseHdf5Statistics<Value_, Index_>& output) {
253 Index_ local_nonzero = 0;
254 for (Index_ i = 0; i < n; ++i) {
255 auto val = extracted[i];
260 output.add_value(val);
265 output.non_zeros = sanisizer::sum<hsize_t>(output.non_zeros, local_nonzero);
268template<
typename Value_,
typename Index_>
270 const auto NR = mat.
nrow(), NC = mat.
ncol();
272 WriteSparseHdf5Statistics<Value_, Index_> output;
273 auto collected = sanisizer::create<std::vector<WriteSparseHdf5Statistics<Value_, Index_> > >(nthreads - 1);
279 WriteSparseHdf5Statistics<Value_, Index_> current_output;
282 std::vector<Value_> xbuffer(NC);
283 std::vector<Index_> ibuffer(NC);
284 for (Index_ r = start, end = start + len; r < end; ++r) {
285 auto extracted = wrk->fetch(r, xbuffer.data(), ibuffer.data());
286 update_hdf5_stats(extracted, current_output);
290 (t ? collected[t - 1] : output) = std::move(current_output);
295 WriteSparseHdf5Statistics<Value_, Index_> current_output;
298 std::vector<Value_> xbuffer(NR);
299 std::vector<Index_> ibuffer(NR);
300 for (Index_ c = start, end = start + len; c < end; ++c) {
301 auto extracted = wrk->fetch(c, xbuffer.data(), ibuffer.data());
302 update_hdf5_stats(extracted, current_output);
306 (t ? collected[t - 1] : output) = std::move(current_output);
313 WriteSparseHdf5Statistics<Value_, Index_> current_output;
316 std::vector<Value_> xbuffer(NC);
317 for (Index_ r = start, end = start + len; r < end; ++r) {
318 auto extracted = wrk->fetch(r, xbuffer.data());
319 update_hdf5_stats(extracted, NC, current_output);
323 (t ? collected[t - 1] : output) = std::move(current_output);
328 WriteSparseHdf5Statistics<Value_, Index_> current_output;
331 std::vector<Value_> xbuffer(NR);
332 for (Index_ c = start, end = start + len; c < end; ++c) {
333 auto extracted = wrk->fetch(c, xbuffer.data());
334 update_hdf5_stats(extracted, NR, current_output);
338 (t ? collected[t - 1] : output) = std::move(current_output);
343 for (
int i = 1; i < num_used; ++i) {
344 auto& current = collected[i - 1];
345 output.lower_data = std::min(output.lower_data, current.lower_data);
346 output.upper_data = std::max(output.upper_data, current.upper_data);
347 output.upper_index = std::max(output.upper_index, current.upper_index);
348 output.non_zeros = sanisizer::sum<hsize_t>(output.non_zeros, current.non_zeros);
349 output.has_decimal = output.has_decimal || current.has_decimal;
350 output.has_nonfinite = output.has_nonfinite || current.has_nonfinite;
356template<
typename Value_,
typename Index_>
357void write_compressed_sparse_matrix_two_pass(
361 const std::string& data_name,
362 const std::string& index_name,
363 const std::string& ptr_name,
364 const WriteCompressedSparseMatrixOptions& params
366 auto stats = write_sparse_hdf5_statistics(mat, params.num_threads);
367 const auto data_type = choose_data_type(params.data_type, stats.lower_data, stats.upper_data, stats.has_decimal, params.force_integer, stats.has_nonfinite);
368 const auto index_type = choose_index_type(params.index_type, stats.upper_index);
371 const auto non_zeros = stats.non_zeros;
372 H5::DataSet data_ds = create_1d_compressed_hdf5_dataset(location, data_type, data_name, non_zeros, params.deflate_level, params.chunk_size);
373 H5::DataSet index_ds = create_1d_compressed_hdf5_dataset(location, index_type, index_name, non_zeros, params.deflate_level, params.chunk_size);
375 H5::DataSpace inspace(1, &non_zeros);
376 H5::DataSpace outspace(1, &non_zeros);
377 const auto& dstype = define_mem_type<Value_>();
378 const auto& ixtype = define_mem_type<Index_>();
380 const Index_ NR = mat.
nrow(), NC = mat.
ncol();
381 std::vector<hsize_t> ptrs;
383 auto fill_datasets = [&](
const Value_* vptr,
const Index_* iptr, hsize_t count) ->
void {
385 inspace.setExtentSimple(1, &count);
386 outspace.selectHyperslab(H5S_SELECT_SET, &count, &offset);
387 data_ds.write(vptr, dstype, inspace, outspace);
388 index_ds.write(iptr, ixtype, inspace, outspace);
394 if (layout == WriteStorageLayout::ROW) {
395 ptrs.resize(sanisizer::sum<
decltype(ptrs.size())>(NR, 1));
400 for (Index_ r = 0; r < NR; ++r) {
401 auto extracted = wrk->fetch(r, xbuffer.data(), ibuffer.data());
402 fill_datasets(extracted.value, extracted.index, extracted.number);
403 ptrs[r + 1] = offset;
407 ptrs.resize(sanisizer::sum<
decltype(ptrs.size())>(NC, 1));
412 for (Index_ c = 0; c < NC; ++c) {
413 auto extracted = wrk->fetch(c, xbuffer.data(), ibuffer.data());
414 fill_datasets(extracted.value, extracted.index, extracted.number);
415 ptrs[c + 1] = offset;
420 std::vector<Value_> sparse_xbuffer;
421 std::vector<Index_> sparse_ibuffer;
422 auto fill_datasets_from_dense = [&](
const Value_* extracted, Index_ n) ->
void {
423 sparse_xbuffer.clear();
424 sparse_ibuffer.clear();
425 for (Index_ i = 0; i < n; ++i) {
427 sparse_xbuffer.push_back(extracted[i]);
428 sparse_ibuffer.push_back(i);
432 hsize_t count = sparse_xbuffer.size();
433 fill_datasets(sparse_xbuffer.data(), sparse_ibuffer.data(), count);
436 if (layout == WriteStorageLayout::ROW) {
437 ptrs.resize(sanisizer::sum<
decltype(ptrs.size())>(NR, 1));
440 for (Index_ r = 0; r < NR; ++r) {
441 auto extracted = wrk->fetch(r, dbuffer.data());
442 fill_datasets_from_dense(extracted, NC);
443 ptrs[r + 1] = offset;
447 ptrs.resize(sanisizer::sum<
decltype(ptrs.size())>(NC, 1));
450 for (Index_ c = 0; c < NC; ++c) {
451 auto extracted = wrk->fetch(c, dbuffer.data());
452 fill_datasets_from_dense(extracted, NR);
453 ptrs[c + 1] = offset;
459 auto ptr_len = sanisizer::cast<hsize_t>(ptrs.size());
460 H5::DataSet ptr_ds = create_1d_compressed_hdf5_dataset(
462 choose_ptr_type(params.ptr_type, ptrs.back()),
465 params.deflate_level,
468 H5::DataSpace ptr_space(1, &ptr_len);
469 ptr_ds.write(ptrs.data(), H5::PredType::NATIVE_HSIZE, ptr_space);
474inline H5::DataSet create_1d_compressed_hdf5_dataset(H5::Group& location,
WriteStorageType type,
const std::string& name,
int deflate_level, hsize_t chunk) {
475 const hsize_t length = 0;
476 constexpr auto copy = H5S_UNLIMITED;
477 H5::DataSpace dspace(1, &length, ©);
478 H5::DSetCreatPropList plist;
479 plist.setDeflate(deflate_level);
480 plist.setChunk(1, &chunk);
481 const auto dtype = choose_pred_type(type);
482 return location.createDataSet(name, *dtype, dspace, plist);
485template<
typename Value_,
typename Index_>
486void write_compressed_sparse_matrix_one_pass(
490 const std::string& data_name,
491 const std::string& index_name,
492 const std::string& ptr_name,
493 const WriteCompressedSparseMatrixOptions& params
495 const auto requested_dtype = *(params.data_type);
496 const auto requested_itype = *(params.index_type);
497 H5::DataSet data_ds = create_1d_compressed_hdf5_dataset(location, requested_dtype, data_name, params.deflate_level, params.chunk_size);
498 H5::DataSet index_ds = create_1d_compressed_hdf5_dataset(location, requested_itype, index_name, params.deflate_level, params.chunk_size);
501 H5::DataSpace outspace;
502 const auto& dstype = define_mem_type<Value_>();
503 const auto& ixtype = define_mem_type<Index_>();
505 const Index_ NR = mat.
nrow(), NC = mat.
ncol();
506 std::vector<hsize_t> ptrs;
508 auto fill_datasets = [&](
const Value_* vptr,
const Index_* iptr, hsize_t count, H5::DataSpace& inspace) ->
void {
511 const hsize_t new_size = sanisizer::sum<hsize_t>(offset, count);
512 data_ds.extend(&new_size);
513 index_ds.extend(&new_size);
515 constexpr hsize_t zero = 0;
516 inspace.selectHyperslab(H5S_SELECT_SET, &count, &zero);
517 outspace.setExtentSimple(1, &new_size);
518 outspace.selectHyperslab(H5S_SELECT_SET, &count, &offset);
520 data_ds.write(vptr, dstype, inspace, outspace);
521 index_ds.write(iptr, ixtype, inspace, outspace);
527 auto fill_datasets_from_sparse = [&](
const Value_* vptr,
const Index_* iptr, Index_ n, H5::DataSpace& inspace) ->
void {
528 for (Index_ i = 0; i < n; ++i) {
529 check_data_value_fit(requested_dtype, vptr[i]);
530 does_non_negative_integer_fit(requested_itype, iptr[i]);
533 const auto count = sanisizer::cast<hsize_t>(n);
534 fill_datasets(vptr, iptr, count, inspace);
537 if (layout == WriteStorageLayout::ROW) {
538 ptrs.resize(sanisizer::sum<
decltype(ptrs.size())>(NR, 1));
541 const hsize_t extent = NC;
542 H5::DataSpace inspace(1, &extent);
545 for (Index_ r = 0; r < NR; ++r) {
546 auto extracted = wrk->fetch(r, xbuffer.data(), ibuffer.data());
547 fill_datasets_from_sparse(extracted.value, extracted.index, extracted.number, inspace);
548 ptrs[r + 1] = offset;
552 ptrs.resize(sanisizer::sum<
decltype(ptrs.size())>(NC, 1));
555 const hsize_t extent = NR;
556 H5::DataSpace inspace(1, &extent);
559 for (Index_ c = 0; c < NC; ++c) {
560 auto extracted = wrk->fetch(c, xbuffer.data(), ibuffer.data());
561 fill_datasets_from_sparse(extracted.value, extracted.index, extracted.number, inspace);
562 ptrs[c + 1] = offset;
567 std::vector<Value_> sparse_xbuffer;
568 std::vector<Index_> sparse_ibuffer;
569 auto fill_datasets_from_dense = [&](
const Value_* extracted, Index_ n, H5::DataSpace& inspace) ->
void {
570 sparse_xbuffer.clear();
571 sparse_ibuffer.clear();
572 for (Index_ i = 0; i < n; ++i) {
574 check_data_value_fit(requested_dtype, extracted[i]);
575 sparse_xbuffer.push_back(extracted[i]);
576 does_non_negative_integer_fit(requested_itype, i);
577 sparse_ibuffer.push_back(i);
581 const auto count = sanisizer::cast<hsize_t>(sparse_xbuffer.size());
582 fill_datasets(sparse_xbuffer.data(), sparse_ibuffer.data(), count, inspace);
585 if (layout == WriteStorageLayout::ROW) {
586 ptrs.resize(sanisizer::sum<
decltype(ptrs.size())>(NR, 1));
588 const hsize_t extent = NC;
589 H5::DataSpace inspace(1, &extent);
592 for (Index_ r = 0; r < NR; ++r) {
593 auto extracted = wrk->fetch(r, dbuffer.data());
594 fill_datasets_from_dense(extracted, NC, inspace);
595 ptrs[r + 1] = offset;
599 ptrs.resize(sanisizer::sum<
decltype(ptrs.size())>(NC, 1));
601 const hsize_t extent = NR;
602 H5::DataSpace inspace(1, &extent);
605 for (Index_ c = 0; c < NC; ++c) {
606 auto extracted = wrk->fetch(c, dbuffer.data());
607 fill_datasets_from_dense(extracted, NR, inspace);
608 ptrs[c + 1] = offset;
614 auto ptr_len = sanisizer::cast<hsize_t>(ptrs.size());
615 H5::DataSet ptr_ds = create_1d_compressed_hdf5_dataset(
617 choose_ptr_type(params.ptr_type, ptrs.back()),
620 params.deflate_level,
623 H5::DataSpace ptr_space(1, &ptr_len);
624 ptr_ds.write(ptrs.data(), H5::PredType::NATIVE_HSIZE, ptr_space);
643template<
typename Value_,
typename Index_>
651 layout = WriteStorageLayout::ROW;
653 layout = WriteStorageLayout::COLUMN;
658 std::string data_name;
665 std::string index_name;
669 index_name =
"indices";
672 std::string ptr_name;
681 write_compressed_sparse_matrix_two_pass(mat, location, layout, data_name, index_name, ptr_name, params);
683 write_compressed_sparse_matrix_one_pass(mat, location, layout, data_name, index_name, ptr_name, params);
696template<
typename Value_,
typename Index_>
706template<
typename Value_,
typename Index_>
711template<
typename Value_,
typename Index_>
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
Representations for matrix data in HDF5 files.
Definition CompressedSparseMatrix.hpp:24
WriteStorageLayout
Definition utils.hpp:26
WriteStorageType
Definition utils.hpp:31
void write_compressed_sparse_matrix(const tatami::Matrix< Value_, Index_ > &mat, H5::Group &location, const WriteCompressedSparseMatrixOptions ¶ms)
Definition write_compressed_sparse_matrix.hpp:644
int parallelize(Function_ fun, const Index_ tasks, const int workers)
Container_ create_container_of_Index_size(const Index_ x, Args_ &&... args)
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, const bool row, const Index_ iter_start, const Index_ iter_length, Args_ &&... args)
Parameters for write_compressed_sparse_matrix().
Definition write_compressed_sparse_matrix.hpp:26
std::optional< WriteStorageLayout > columnar
Definition write_compressed_sparse_matrix.hpp:49
bool two_pass
Definition write_compressed_sparse_matrix.hpp:94
bool force_integer
Definition write_compressed_sparse_matrix.hpp:63
int deflate_level
Definition write_compressed_sparse_matrix.hpp:82
std::optional< WriteStorageType > ptr_type
Definition write_compressed_sparse_matrix.hpp:75
hsize_t chunk_size
Definition write_compressed_sparse_matrix.hpp:87
std::optional< std::string > data_name
Definition write_compressed_sparse_matrix.hpp:31
std::optional< std::string > ptr_name
Definition write_compressed_sparse_matrix.hpp:43
std::optional< std::string > index_name
Definition write_compressed_sparse_matrix.hpp:37
std::optional< WriteStorageType > index_type
Definition write_compressed_sparse_matrix.hpp:69
std::optional< WriteStorageType > data_type
Definition write_compressed_sparse_matrix.hpp:55
int num_threads
Definition write_compressed_sparse_matrix.hpp:100
Utilities for HDF5 extraction.