tatami_hdf5
tatami bindings for HDF5-backed matrices
Loading...
Searching...
No Matches
sparse_utils.hpp
1#ifndef TATAMI_HDF5_SPARSE_UTILS_HPP
2#define TATAMI_HDF5_SPARSE_UTILS_HPP
3
4#include "serialize.hpp"
5
6#include <algorithm>
7#include <utility>
8#include <cstddef>
9#include <string>
10#include <vector>
11
12#include "H5Cpp.h"
13#include "sanisizer/sanisizer.hpp"
14#include "tatami/tatami.hpp"
15
16namespace tatami_hdf5 {
17
18namespace CompressedSparseMatrix_internal {
19
// Compute the number of bytes needed to cache one non-zero element,
// counting only the components (value and/or index) that are actually cached.
template<typename CachedValue_, typename CachedIndex_>
std::size_t size_of_cached_element(bool needs_cached_value, bool needs_cached_index) {
    std::size_t per_element = 0;
    if (needs_cached_value) {
        per_element += sizeof(CachedValue_);
    }
    if (needs_cached_index) {
        per_element += sizeof(CachedIndex_);
    }
    return per_element;
}
24
// Bundle of all HDF5 handles for one open matrix.
// These are kept together behind a single pointer so that their construction
// and destruction can be funneled through serialize() (see initialize()/destroy() below).
struct Components {
    H5::H5File file; // handle to the open HDF5 file (opened read-only in initialize()).
    H5::DataSet data_dataset; // dataset holding the non-zero values.
    H5::DataSet index_dataset; // dataset holding the secondary-dimension indices.
    H5::DataSpace dataspace; // file-side dataspace, obtained from 'data_dataset'.
    H5::DataSpace memspace; // in-memory dataspace; presumably reconfigured by callers before each read — not set in this file.
};
32
// Requested HDF5 chunk cache sizes (in bytes, as passed to setChunkCache())
// for the value and index datasets, respectively.
struct ChunkCacheSizes {
    ChunkCacheSizes() = default;
    ChunkCacheSizes(hsize_t value, hsize_t index) : value(value), index(index) {}
    hsize_t value = 0; // chunk cache size for the value dataset.
    hsize_t index = 0; // chunk cache size for the index dataset.
};
39
40// Consider the case where we're iterating across primary dimension elements and extracting its contents from file.
41// At each iteration, we will have a partially-read chunk that spans the ending values of the latest primary dimension element.
42// (Unless we're going backwards, in which case the partially-read chunk would span the starting values.)
43// We want to cache this chunk so that its contents can be fully read upon scanning the next primary dimension element.
44//
45// In practice, we want the cache to be large enough to hold three chunks simultaneously;
46// one for the partially read chunk at the start of a primary dimension element, one for the partially read chunk at the end,
47// and another just in case HDF5 needs to cache the middle chunks before copying them to the user buffer.
48// This arrangement ensures that we can hold both partially read chunks while storing and evicting the fully read chunks,
49// no matter what order the HDF5 library uses to read chunks to satisfy our request.
50//
51// In any case, we also ensure that the HDF5 chunk cache is at least 1 MB (the default).
52// This allows us to be at least as good as the default in cases where reads are non-contiguous and we've got partially read chunks everywhere.
53inline hsize_t compute_chunk_cache_size(hsize_t nonzeros, hsize_t chunk_length, std::size_t element_size) {
54 if (chunk_length == 0) {
55 return 0;
56 }
57 hsize_t num_chunks = std::min(nonzeros / chunk_length + (nonzeros % chunk_length > 0), static_cast<hsize_t>(3));
58 hsize_t cache_size = sanisizer::product<hsize_t>(num_chunks, chunk_length);
59 return std::max(sanisizer::product<hsize_t>(cache_size, element_size), sanisizer::cap<hsize_t>(1000000));
60}
61
62// In all cases, we know that max_non_zeros can be safely casted between hsize_t and Index_,
63// because the value is derived from differences between hsize_t pointers.
64// We also know that it can be safely cast to std::size_t, as max_non_zeros is no greater than the dimension extents,
65// and we know that the dimension extents must be representable as a std::size_t as per the tatami contract.
66
// Everything needed to locate and read one compressed sparse matrix from file.
// NOTE: the string and vector members are held by const reference, so the objects
// supplied by the caller must outlive this MatrixDetails instance.
template<typename Index_>
struct MatrixDetails {
    // All constructor arguments are stored directly in the identically-named members; see those for details.
    MatrixDetails(
        const std::string& file_name,
        const std::string& value_name,
        const std::string& index_name,
        Index_ primary_dim,
        Index_ secondary_dim,
        const std::vector<hsize_t>& pointers,
        std::size_t slab_cache_size,
        Index_ max_non_zeros,
        ChunkCacheSizes chunk_cache_sizes
    ) :
        file_name(file_name),
        value_name(value_name),
        index_name(index_name),
        primary_dim(primary_dim),
        secondary_dim(secondary_dim),
        pointers(pointers),
        slab_cache_size(slab_cache_size),
        max_non_zeros(max_non_zeros),
        chunk_cache_sizes(std::move(chunk_cache_sizes))
    {}

    const std::string& file_name; // path to the HDF5 file.
    const std::string& value_name; // name of the dataset holding the non-zero values.
    const std::string& index_name; // name of the dataset holding the secondary-dimension indices.

    Index_ primary_dim; // extent of the primary dimension.
    Index_ secondary_dim; // extent of the secondary dimension.
    const std::vector<hsize_t>& pointers; // compressed sparse pointers; adjacent differences give the non-zero count of each primary element.

    std::size_t slab_cache_size; // size for our own cache of slabs.
    Index_ max_non_zeros; // largest non-zero count across primary elements, derived from 'pointers' (see comments above).
    ChunkCacheSizes chunk_cache_sizes; // size for HDF5's cache of uncompressed chunks.
};
103
// All HDF5-related members are stored in a separate pointer so we can serialize construction and destruction.
// This populates 'h5comp' with open handles for the file and both datasets described by 'details',
// applying the requested chunk cache sizes to each dataset.
template<typename Index_>
void initialize(const MatrixDetails<Index_>& details, std::unique_ptr<Components>& h5comp) {
    serialize([&]() -> void {
        h5comp.reset(new Components);

        // Build a dataset access property list carrying the requested chunk cache size;
        // slot count and eviction policy are left at their HDF5 defaults.
        auto create_dapl = [&](hsize_t cache_size) -> H5::DSetAccPropList {
            // passing an ID is the only way to get the constructor to make a copy, not a reference (who knows why???).
            H5::DSetAccPropList dapl(H5::DSetAccPropList::DEFAULT.getId());
            dapl.setChunkCache(H5D_CHUNK_CACHE_NSLOTS_DEFAULT, cache_size, H5D_CHUNK_CACHE_W0_DEFAULT);
            return dapl;
        };

        h5comp->file.openFile(details.file_name, H5F_ACC_RDONLY);
        h5comp->data_dataset = h5comp->file.openDataSet(details.value_name, create_dapl(details.chunk_cache_sizes.value));
        h5comp->index_dataset = h5comp->file.openDataSet(details.index_name, create_dapl(details.chunk_cache_sizes.index));
        h5comp->dataspace = h5comp->data_dataset.getSpace();
        // Note: 'memspace' is left default-constructed here; it is configured elsewhere.
    });
}
123
124inline void destroy(std::unique_ptr<Components>& h5comp) {
125 serialize([&]() -> void {
126 h5comp.reset();
127 });
128}
129
130// If we're only interested in the block [X, Y), we don't need to extract the entire set of indices for a primary element.
131// We only need to extract the first Y elements, as all subsequent elements must have indices greater than or equal to Y.
132// Similarly, we only need to extract the last 'dim - X' elements, as all preceding elements must have indices less than X.
133// This allows us to narrow the range of indices to be extracted.
134template<typename Pointer_, typename Index_>
135std::pair<Pointer_, Pointer_> narrow_primary_extraction_range(
136 const Pointer_ ptr_start,
137 const Pointer_ ptr_end,
138 const Index_ block_start,
139 const Index_ block_end,
140 const Index_ secondary_dim
141) {
142 const Pointer_ num_nonzeros = ptr_end - ptr_start;
143 const Pointer_ new_ptr_end = ptr_start + sanisizer::min(block_end, num_nonzeros);
144 const Pointer_ new_ptr_start = ptr_end - sanisizer::min(secondary_dim - block_start, num_nonzeros);
145
146 // It is guaranteed that new_ptr_start <= new_ptr_end.
147 // This is because the difference boils down to:
148 //
149 // -num_nonzeros + num_nonzeros + num_nonzeros, OR
150 // -num_nonzeros + block_end + num_nonzeros, OR
151 // -num_nonzeros + num_nonzeros + secondary_dim - block_start, OR
152 // -num_nonzeros + block_end + secondary_dim - block_start
153 //
154 // All of which are guaranteed to be non-negative.
155 return std::make_pair(new_ptr_start, new_ptr_end);
156}
157
// Shrink the iterator range [indices_start, indices_end) of sorted indices so that it only
// covers values in [smallest, largest_plus_one). 'extent' is the full extent of the secondary
// dimension; when the requested bounds already cover everything, no search is performed.
template<class IndexIt_, typename Index_>
void refine_primary_limits(IndexIt_& indices_start, IndexIt_& indices_end, Index_ extent, Index_ smallest, Index_ largest_plus_one) {
    // Cast both sides to Index_ inside the comparator so that comparisons remain
    // signedness-safe even when the iterator's value type differs from Index_.
    const auto cmp = [](Index_ left, Index_ right) -> bool { return left < right; };

    if (smallest != 0) {
        indices_start = std::lower_bound(indices_start, indices_end, smallest, cmp);
    }

    if (largest_plus_one != extent) {
        // Search from the (possibly updated) start so the result stays within bounds.
        indices_end = std::lower_bound(indices_start, indices_end, largest_plus_one, cmp);
    }
}
169
170template<typename Slab_, typename Value_, typename Index_>
171tatami::SparseRange<Value_, Index_> slab_to_sparse(const Slab_& slab, Value_* value_buffer, Index_* index_buffer) {
172 tatami::SparseRange<Value_, Index_> output(slab.number);
173 if (slab.value) {
174 std::copy_n(slab.value, slab.number, value_buffer);
175 output.value = value_buffer;
176 }
177 if (slab.index) {
178 std::copy_n(slab.index, slab.number, index_buffer);
179 output.index = index_buffer;
180 }
181 return output;
182}
183
// Expand a cached slab into a dense buffer of length 'extract_length':
// zero everything first, then scatter each cached value into the slot
// given by its cached index. Returns 'buffer' for caller convenience.
template<typename Slab_, typename Value_, typename Index_>
Value_* slab_to_dense(const Slab_& slab, Value_* buffer, Index_ extract_length) {
    std::fill(buffer, buffer + extract_length, 0);
    Index_ at = 0;
    while (at < slab.number) {
        buffer[slab.index[at]] = slab.value[at];
        ++at;
    }
    return buffer;
}
192
193}
194
195}
196
197#endif
Representations for matrix data in HDF5 files.
Definition CompressedSparseMatrix.hpp:24
void serialize(Function_ f)
Definition serialize.hpp:53
Default locking for serial access.