tatami_hdf5
tatami bindings for HDF5-backed matrices
Loading...
Searching...
No Matches
utils.hpp
Go to the documentation of this file.
1#ifndef TATAMI_HDF5_UTILS_HPP
2#define TATAMI_HDF5_UTILS_HPP
3
4#include "H5Cpp.h"
5
6#include "tatami/tatami.hpp"
7
8#include <cstdint>
9#include <array>
10#include <string>
11#include <type_traits>
12#include <stdexcept>
13#include <cassert>
14#include <cmath>
15
21namespace tatami_hdf5 {
22
26enum class WriteStorageLayout { COLUMN, ROW };
27
31enum class WriteStorageType { INT8, UINT8, INT16, UINT16, INT32, UINT32, INT64, UINT64, FLOAT, DOUBLE };
32
36template<typename Left_, typename Right_>
37bool is_less_than_or_equal(Left_ l, Right_ r) {
38 static_assert(std::is_integral<Left_>::value);
39 static_assert(std::is_integral<Right_>::value);
40
41 constexpr bool lsigned = std::is_signed<Left_>::value;
42 constexpr bool rsigned = std::is_signed<Right_>::value;
43 if constexpr(lsigned == rsigned) {
44 return l <= r;
45 } else if constexpr(lsigned) {
46 return l <= 0 || static_cast<typename std::make_unsigned<Left_>::type>(l) <= r;
47 } else {
48 return r >= 0 && l <= static_cast<typename std::make_unsigned<Right_>::type>(r);
49 }
50}
51
52template<typename Native_, typename Max_>
53bool fits_upper_limit(Max_ max) {
54 static_assert(std::is_integral<Native_>::value);
55
56 if constexpr(std::is_integral<Max_>::value) { // Native_ is already integral, so no need to check that.
57 constexpr auto native_max = std::numeric_limits<Native_>::max();
58 return is_less_than_or_equal(max, native_max);
59 } else {
60 // We don't compare values directly as the Native_-to-float conversion might not be exact;
61 // if native_max gets rounded up during the conversion, we might end up with a situation where 'native_max < max <= FLOAT(native_max)'.
62 // This would result in undefined behavior when casting values equal to 'max' to Native_.
63 //
64 // So instead, we compare the number of bits in Native_ with that required to store our (truncated) 'max'.
65 // We ignore negative or zero values of 'max' as required_bits_for_float() expects positive values.
66 // (Non-positive values would always be less than any 'native_max', so we can always return true in such cases.)
67 constexpr auto digits = std::numeric_limits<Native_>::digits;
68 assert(max == std::trunc(max));
69 return (max <= 0 || sanisizer::required_bits_for_float(max) <= digits);
70 }
71}
72
73template<typename Native_, typename Min_>
74bool fits_lower_limit(Min_ min) {
75 static_assert(std::is_integral<Native_>::value);
76
77 if constexpr(std::is_integral<Min_>::value) {
78 constexpr auto native_min = std::numeric_limits<Native_>::min();
79 return is_less_than_or_equal(native_min, min);
80 } else {
81 assert(min == std::trunc(min));
82 if constexpr(std::is_unsigned<Native_>::value) {
83 return min >= 0;
84 } else {
85 // Pretty much the same logic as fits_upper_limit() but we reverse the sign.
86 // We add 1 before reversing to account for the sign bit, i.e., -128 becomes 127 for 7 bits.
87 constexpr auto digits = std::numeric_limits<Native_>::digits;
88 return (min >= -1 || sanisizer::required_bits_for_float(-(min + 1)) <= digits);
89 }
90 }
91}
92
93template<typename Value_>
94void check_integer_range_fit(const WriteStorageType data_type, Value_ lower_data, Value_ upper_data) {
95 bool okay = false;
96
97 switch (data_type) {
98 case WriteStorageType::INT8:
99 okay = fits_lower_limit<std::int8_t >(lower_data) && fits_upper_limit<std::int8_t >(upper_data);
100 break;
101 case WriteStorageType::UINT8:
102 okay = fits_lower_limit<std::uint8_t >(lower_data) && fits_upper_limit<std::uint8_t >(upper_data);
103 break;
104 case WriteStorageType::INT16:
105 okay = fits_lower_limit<std::int16_t >(lower_data) && fits_upper_limit<std::int16_t >(upper_data);
106 break;
107 case WriteStorageType::UINT16:
108 okay = fits_lower_limit<std::uint16_t>(lower_data) && fits_upper_limit<std::uint16_t>(upper_data);
109 break;
110 case WriteStorageType::INT32:
111 okay = fits_lower_limit<std::int32_t >(lower_data) && fits_upper_limit<std::int32_t >(upper_data);
112 break;
113 case WriteStorageType::UINT32:
114 okay = fits_lower_limit<std::uint32_t>(lower_data) && fits_upper_limit<std::uint32_t>(upper_data);
115 break;
116 case WriteStorageType::INT64:
117 okay = fits_lower_limit<std::int64_t >(lower_data) && fits_upper_limit<std::int64_t >(upper_data);
118 break;
119 case WriteStorageType::UINT64:
120 okay = fits_lower_limit<std::uint64_t>(lower_data) && fits_upper_limit<std::uint64_t>(upper_data);
121 break;
122 default:
123 ; // we should never get to this point as everything should already be truncated.
124 }
125
126 if (!okay) {
127 throw std::runtime_error("no integer type can store the matrix values");
128 }
129}
130
131template<typename Value_>
132void check_data_value_fit(const WriteStorageType data_type, Value_ val) {
133 if (data_type != WriteStorageType::DOUBLE && data_type != WriteStorageType::FLOAT) {
134 if constexpr(std::is_floating_point<Value_>::value) {
135 val = std::trunc(val);
136 if (!std::isfinite(val)) {
137 throw std::runtime_error("cannot store non-finite floating-point values as integers");
138 }
139 }
140 check_integer_range_fit(data_type, val, val);
141 }
142}
143
144template<typename Value_>
145WriteStorageType choose_data_type(
146 const std::optional<WriteStorageType>& data_type,
147 Value_ lower_data,
148 Value_ upper_data,
149 bool has_decimal,
150 bool force_integer,
151 bool has_nonfinite
152) {
153 if (!data_type.has_value()) {
154 if ((has_decimal && !force_integer) || has_nonfinite) {
155 if constexpr(std::is_same<Value_, float>::value) {
156 return WriteStorageType::FLOAT;
157 } else {
158 return WriteStorageType::DOUBLE;
159 }
160 }
161
162 if constexpr(std::is_floating_point<Value_>::value) {
163 lower_data = std::trunc(lower_data);
164 upper_data = std::trunc(upper_data);
165 }
166
167 if (lower_data < 0) {
168 if (fits_lower_limit<std::int8_t>(lower_data) && fits_upper_limit<std::int8_t>(upper_data)) {
169 return WriteStorageType::INT8;
170 } else if (fits_lower_limit<std::int16_t>(lower_data) && fits_upper_limit<std::int16_t>(upper_data)) {
171 return WriteStorageType::INT16;
172 } else if (fits_lower_limit<std::int32_t>(lower_data) && fits_upper_limit<std::int32_t>(upper_data)) {
173 return WriteStorageType::INT32;
174 } else if (fits_lower_limit<std::int64_t>(lower_data) && fits_upper_limit<std::int64_t>(upper_data)) {
175 return WriteStorageType::INT64;
176 }
177
178 } else {
179 if (fits_upper_limit<std::uint8_t>(upper_data)) {
180 return WriteStorageType::UINT8;
181 } else if (fits_upper_limit<std::uint16_t>(upper_data)) {
182 return WriteStorageType::UINT16;
183 } else if (fits_upper_limit<std::uint32_t>(upper_data)) {
184 return WriteStorageType::UINT32;
185 } else if (fits_upper_limit<std::uint64_t>(upper_data)) {
186 return WriteStorageType::UINT64;
187 }
188 }
189
190 throw std::runtime_error("no type can store the matrix values");
191 }
192
193 const auto dtype = *data_type;
194 if (data_type != WriteStorageType::DOUBLE && data_type != WriteStorageType::FLOAT) {
195 if constexpr(std::is_floating_point<Value_>::value) {
196 lower_data = std::trunc(lower_data);
197 upper_data = std::trunc(upper_data);
198 if (has_nonfinite) {
199 throw std::runtime_error("cannot store non-finite floating-point values as integers");
200 }
201 }
202 check_integer_range_fit(dtype, lower_data, upper_data);
203 }
204
205 return dtype;
206}
207
208inline const H5::PredType* choose_pred_type(WriteStorageType type) {
209 const H5::PredType* dtype = NULL;
210
211 switch (type) {
212 case WriteStorageType::INT8:
213 dtype = &(H5::PredType::NATIVE_INT8);
214 break;
215 case WriteStorageType::UINT8:
216 dtype = &(H5::PredType::NATIVE_UINT8);
217 break;
218 case WriteStorageType::INT16:
219 dtype = &(H5::PredType::NATIVE_INT16);
220 break;
221 case WriteStorageType::UINT16:
222 dtype = &(H5::PredType::NATIVE_UINT16);
223 break;
224 case WriteStorageType::INT32:
225 dtype = &(H5::PredType::NATIVE_INT32);
226 break;
227 case WriteStorageType::UINT32:
228 dtype = &(H5::PredType::NATIVE_UINT32);
229 break;
230 case WriteStorageType::INT64:
231 dtype = &(H5::PredType::NATIVE_INT64);
232 break;
233 case WriteStorageType::UINT64:
234 dtype = &(H5::PredType::NATIVE_UINT64);
235 break;
236 case WriteStorageType::FLOAT:
237 dtype = &(H5::PredType::NATIVE_FLOAT);
238 break;
239 case WriteStorageType::DOUBLE:
240 dtype = &(H5::PredType::NATIVE_DOUBLE);
241 break;
242 default:
243 throw std::runtime_error("automatic HDF5 output type must be resolved before creating a HDF5 dataset");
244 }
245
246 return dtype;
247}
248
249template<typename T>
250const H5::PredType& define_mem_type() {
251 if constexpr(std::is_same<int, T>::value) {
252 return H5::PredType::NATIVE_INT;
253 } else if (std::is_same<unsigned int, T>::value) {
254 return H5::PredType::NATIVE_UINT;
255 } else if (std::is_same<long, T>::value) {
256 return H5::PredType::NATIVE_LONG;
257 } else if (std::is_same<unsigned long, T>::value) {
258 return H5::PredType::NATIVE_ULONG;
259 } else if (std::is_same<long long, T>::value) {
260 return H5::PredType::NATIVE_LLONG;
261 } else if (std::is_same<unsigned long long, T>::value) {
262 return H5::PredType::NATIVE_ULLONG;
263 } else if (std::is_same<short, T>::value) {
264 return H5::PredType::NATIVE_SHORT;
265 } else if (std::is_same<unsigned short, T>::value) {
266 return H5::PredType::NATIVE_USHORT;
267 } else if (std::is_same<char, T>::value) {
268 return H5::PredType::NATIVE_CHAR;
269 } else if (std::is_same<unsigned char, T>::value) {
270 return H5::PredType::NATIVE_UCHAR;
271 } else if (std::is_same<double, T>::value) {
272 return H5::PredType::NATIVE_DOUBLE;
273 } else if (std::is_same<float, T>::value) {
274 return H5::PredType::NATIVE_FLOAT;
275 } else if (std::is_same<long double, T>::value) {
276 return H5::PredType::NATIVE_LDOUBLE;
277 } else if (std::is_same<uint8_t, T>::value) {
278 return H5::PredType::NATIVE_UINT8;
279 } else if (std::is_same<int8_t, T>::value) {
280 return H5::PredType::NATIVE_INT8;
281 } else if (std::is_same<uint16_t, T>::value) {
282 return H5::PredType::NATIVE_UINT16;
283 } else if (std::is_same<int16_t, T>::value) {
284 return H5::PredType::NATIVE_INT16;
285 } else if (std::is_same<uint32_t, T>::value) {
286 return H5::PredType::NATIVE_UINT32;
287 } else if (std::is_same<int32_t, T>::value) {
288 return H5::PredType::NATIVE_INT32;
289 } else if (std::is_same<uint64_t, T>::value) {
290 return H5::PredType::NATIVE_UINT64;
291 } else if (std::is_same<int64_t, T>::value) {
292 return H5::PredType::NATIVE_INT64;
293 }
294 static_assert("unsupported HDF5 type for template parameter 'T'");
295}
296
297template<bool integer_only, class GroupLike>
298H5::DataSet open_and_check_dataset(const GroupLike& handle, const std::string& name) {
299 // Avoid throwing H5 exceptions.
300 if (!H5Lexists(handle.getId(), name.c_str(), H5P_DEFAULT) || handle.childObjType(name) != H5O_TYPE_DATASET) {
301 throw std::runtime_error("no child dataset named '" + name + "'");
302 }
303
304 auto dhandle = handle.openDataSet(name);
305 auto type = dhandle.getTypeClass();
306 if constexpr(integer_only) {
307 if (type != H5T_INTEGER) {
308 throw std::runtime_error(std::string("expected integer values in the '") + name + "' dataset");
309 }
310 } else {
311 if (type != H5T_INTEGER && type != H5T_FLOAT) {
312 throw std::runtime_error(std::string("expected numeric values in the '") + name + "' dataset");
313 }
314 }
315
316 return dhandle;
317}
318
319template<int N>
320std::array<hsize_t, N> get_array_dimensions(const H5::DataSet& handle, const std::string& name) {
321 auto space = handle.getSpace();
322
323 int ndim = space.getSimpleExtentNdims();
324 if (ndim != N) {
325 throw std::runtime_error(std::string("'") + name + "' should be a " + std::to_string(N) + "-dimensional array");
326 }
327
328 std::array<hsize_t, N> dims_out;
329 space.getSimpleExtentDims(dims_out.data(), NULL);
330 return dims_out;
331}
336}
337
338#endif
Representations for matrix data in HDF5 files.
Definition CompressedSparseMatrix.hpp:24
WriteStorageLayout
Definition utils.hpp:26
WriteStorageType
Definition utils.hpp:31