tatami_layered
Create layered sparse matrices for tatami
Loading...
Searching...
No Matches
read_layered_sparse_from_matrix_market.hpp
Go to the documentation of this file.
1#ifndef TATAMI_LAYERED_READ_LAYERED_SPARSE_FROM_MATRIX_MARKET_HPP
2#define TATAMI_LAYERED_READ_LAYERED_SPARSE_FROM_MATRIX_MARKET_HPP
3
4#include <vector>
5#include <algorithm>
6#include <cstddef>
7
8#include "byteme/byteme.hpp"
9#include "eminem/eminem.hpp"
10#include "tatami/tatami.hpp"
11#include "sanisizer/sanisizer.hpp"
12
13#include "utils.hpp"
14
20namespace tatami_layered {
21
// Builds a layered sparse matrix from Matrix Market content, reading the input twice.
//
// create:      callable invoked once per pass; each call must yield a fresh reader
//              positioned at the start of the same content.
// chunk_size:  number of columns grouped into one chunk (columns are mapped to
//              chunks via integer division by this value, so it must be non-zero).
// num_threads: copied into eminem::ParserOptions::num_threads.
//
// Returns the result of consolidate_matrices() applied to the per-width stores.
// Throws std::runtime_error in the first pass if the banner's field is not
// INTEGER, DOUBLE or REAL.
25template<typename Value_, typename Index_, typename ColumnIndex_, class Creator_>
26std::shared_ptr<tatami::Matrix<Value_, Index_> > read_layered_sparse_from_matrix_market(Creator_ create, const Index_ chunk_size, const int num_threads) {
27 Index_ NR, NC, nchunks, leftovers;
28
// One vector of holders per stored value width (8, 16 and 32 bits).
29 std::vector<Holder< std::uint8_t, Index_, ColumnIndex_> > store8;
30 std::vector<Holder<std::uint16_t, Index_, ColumnIndex_> > store16;
31 std::vector<Holder<std::uint32_t, Index_, ColumnIndex_> > store32;
32
// Per-chunk bookkeeping, filled in by allocate_rows() at the end of the first pass.
33 std::vector<std::vector<Index_> > identities8, identities16, identities32;
34 std::vector<std::vector<Index_> > assigned_position;
35 std::vector<std::vector<Category> > assigned_category;
36
37 eminem::ParserOptions eopt;
38 eopt.num_threads = num_threads;
39
40 // First pass, scanning for the max and number.
41 {
42 auto reader = create();
43 byteme::PerByteSerial<char, byteme::Reader*> pb(&reader);
44 eminem::Parser<I<decltype(&pb)>, Index_> parser(&pb, eopt);
45
46 parser.scan_preamble();
47 NR = parser.get_nrows();
48 NC = parser.get_ncols();
// leftovers is the width of a trailing partial chunk (zero when NC is a multiple
// of chunk_size); nchunks is the rounded-up chunk count, forced to at least one.
49 leftovers = NC % chunk_size;
50 nchunks = sanisizer::max(1, NC / chunk_size + (leftovers != 0));
51
55 tatami::resize_container_to_Index_size(identities8, nchunks);
56 tatami::resize_container_to_Index_size(identities16, nchunks);
57 tatami::resize_container_to_Index_size(identities32, nchunks);
58 tatami::resize_container_to_Index_size(assigned_position, nchunks);
59 tatami::resize_container_to_Index_size(assigned_category, nchunks);
60
// NOTE(review): max_per_chunk and num_per_chunk are used below, but neither their
// declarations nor the bodies of the next two loops are visible in this listing.
// Presumably each is sized to nchunks with NR entries per chunk - confirm against the real header.
62 for (auto& x : max_per_chunk) {
64 }
65
67 for (auto& x : num_per_chunk) {
69 }
70
// Per-entry callback: r and c arrive one-based and are shifted down by one here.
// Records the largest category and the entry count for each (chunk, row) pair.
71 auto handler = [&](const Index_ r, const Index_ c, const Category cat) -> void {
72 const auto chunk = (c - 1) / chunk_size;
73 auto& maxcat = max_per_chunk[chunk][r - 1];
74 maxcat = std::max(maxcat, cat);
75 ++num_per_chunk[chunk][r - 1];
76 };
77
// Dispatch on the banner's declared field type; anything non-numeric is rejected.
78 const auto& banner = parser.get_banner();
79 if (banner.field == eminem::Field::INTEGER) {
80 parser.template scan_integer<std::uint32_t>([&](const Index_ r, const Index_ c, const std::uint32_t val) -> void {
81 handler(r, c, categorize(val));
82 });
83 } else if (banner.field == eminem::Field::DOUBLE || banner.field == eminem::Field::REAL) {
84 parser.scan_real([&](const Index_ r, const Index_ c, const double val) -> void {
85 handler(r, c, categorize(val));
86 });
87 } else {
88 throw std::runtime_error("expected a numeric field in the Matrix Market file");
89 }
90
// Hands the collected maxima/counts to allocate_rows(), which receives the identity
// vectors, the three stores and the assignment tables as further arguments.
91 allocate_rows(
92 max_per_chunk,
93 num_per_chunk,
94 identities8,
95 identities16,
96 identities32,
97 store8,
98 store16,
99 store32,
100 assigned_category,
101 assigned_position
102 );
103 }
104
105 // Now allocating.
106 {
// Running write cursor for every (chunk, row), seeded from get_sparse_ptr() and
// advanced by one each time a value is written for that row.
107 std::vector<std::vector<std::size_t> > output_positions(nchunks);
108 for (I<decltype(nchunks)> chunk = 0; chunk < nchunks; ++chunk) {
109 tatami::resize_container_to_Index_size(output_positions[chunk], NR);
110 for (I<decltype(NR)> r = 0; r < NR; ++r) {
111 output_positions[chunk][r] = get_sparse_ptr(store8, store16, store32, assigned_category, assigned_position, chunk, r);
112 }
113 }
114
// Second pass over the same content through a fresh reader and parser.
115 auto reader = create();
116 byteme::PerByteSerial<char, byteme::Reader*> pb(&reader);
117 eminem::Parser<I<decltype(&pb)>, Index_> parser(&pb, eopt);
118
// r and c are taken by value so they can be shifted to zero-based in place;
// offset is the column's position inside its chunk.
119 auto handler = [&](Index_ r, Index_ c, const auto val) -> void {
120 --c;
121 const Index_ chunk = c / chunk_size;
122 const Index_ offset = c % chunk_size;
123 --r;
124 fill_sparse_value(store8, store16, store32, assigned_category[chunk][r], chunk, offset, val, output_positions[chunk][r]++);
125 };
126
127 parser.scan_preamble();
128 const auto& banner = parser.get_banner();
// NOTE(review): unlike the first pass there is no else-branch here, so an
// unsupported field would be skipped silently. That relies on create() returning
// the same content the first pass already validated.
129 if (banner.field == eminem::Field::INTEGER) {
130 parser.template scan_integer<std::uint32_t>([&](const Index_ r, const Index_ c, const std::uint32_t val) -> void {
131 handler(r, c, val);
132 });
133 } else if (banner.field == eminem::Field::DOUBLE || banner.field == eminem::Field::REAL) {
134 parser.scan_real([&](const Index_ r, const Index_ c, const double val) -> void {
135 handler(r, c, val);
136 });
137 }
138
139 // Checking that the column indices are sorted properly.
// For every row range [ptr[r - 1], ptr[r]) of every holder, the (index, value)
// pairs are re-sorted only if the indices are out of order. One scratch buffer
// is reused across all rows.
140 auto sorter = [&](auto& store) -> void {
141 std::vector<std::pair<I<decltype(store[0].index[0])>, I<decltype(store[0].value[0])>> > buffer;
142 buffer.reserve(chunk_size);
143
144 for (auto& st : store) {
145 const auto num_ptr = st.ptr.size();
146 for (I<decltype(num_ptr)> r = 1; r < num_ptr; ++r) {
147 const auto start = st.ptr[r - 1], end = st.ptr[r];
148
149 if (!std::is_sorted(st.index.begin() + start, st.index.begin() + end)) {
150 buffer.clear();
151 for (auto i = start; i < end; ++i) {
152 buffer.emplace_back(st.index[i], st.value[i]);
153 }
154
// Pairs compare by index first, so this orders the row by index.
155 std::sort(buffer.begin(), buffer.end());
156 auto bIt = buffer.begin();
157 for (auto i = start; i < end; ++i, ++bIt) {
158 st.index[i] = bIt->first;
159 st.value[i] = bIt->second;
160 }
161 }
162 }
163 }
164 };
165
166 sorter(store8);
167 sorter(store16);
168 sorter(store32);
169 }
170
// The stores are moved into consolidate_matrices(); the identity vectors are passed as-is.
171 return consolidate_matrices<Value_, Index_>(
172 identities8,
173 identities16,
174 identities32,
175 std::move(store8),
176 std::move(store16),
177 std::move(store32),
178 NR,
179 chunk_size,
180 leftovers
181 );
182}
// NOTE(review): the line opening this aggregate is not visible in this listing;
// presumably it is `struct ReadLayeredSparseFromMatrixMarketOptions {`, the type
// the reader functions in this file take as `options` - confirm.

// Column chunk size; the readers pass it through check_chunk_size() before use.
194 std::size_t chunk_size = sanisizer::cap<std::size_t>(65536);
195
// Copied into the buffer_size field of the byteme reader options by the file and
// compressed-buffer readers in this file.
199 std::size_t buffer_size = sanisizer::cap<std::size_t>(65536);
200
// Forwarded by the readers as the num_threads argument of the core reader.
204 int num_threads = 1;
205};
206
221template<typename Value_ = double, typename Index_ = int, typename ColumnIndex_ = std::uint16_t>
222std::shared_ptr<tatami::Matrix<Value_, Index_> > read_layered_sparse_from_matrix_market_text_file(const char* filepath, const ReadLayeredSparseFromMatrixMarketOptions& options) {
223 return read_layered_sparse_from_matrix_market<Value_, Index_, ColumnIndex_>(
224 [&]() -> auto {
225 return byteme::RawFileReader(filepath, [&]{
226 byteme::RawFileReaderOptions opt;
227 opt.buffer_size = options.buffer_size;
228 return opt;
229 }());
230 },
231 check_chunk_size<Index_, ColumnIndex_>(options.chunk_size),
232 options.num_threads
233 );
234}
235
239// Back-compatibility.
// Legacy overload kept for back-compatibility: takes the chunk and buffer sizes as
// separate arguments and packs them into a ReadLayeredSparseFromMatrixMarketOptions.
// chunk_size is an Index_ that is implicitly converted on assignment to opt.chunk_size.
240template<typename Value_ = double, typename Index_ = int, typename ColumnIndex_ = std::uint16_t>
241std::shared_ptr<tatami::Matrix<Value_, Index_> > read_layered_sparse_from_matrix_market_text_file(const char* filepath, Index_ chunk_size = 65536, std::size_t buffer_size = 65536) {
// NOTE(review): the line that opens the returned call and the immediately-invoked
// lambda is missing from this listing; presumably it forwards filepath to the
// options-based overload of the same name - confirm.
243 ReadLayeredSparseFromMatrixMarketOptions opt;
244 opt.chunk_size = chunk_size;
245 opt.buffer_size = buffer_size;
246 return opt;
247 }());
248}
253#if __has_include("zlib.h")
254
269template<typename Value_ = double, typename Index_ = int, typename ColumnIndex_ = std::uint16_t>
270std::shared_ptr<tatami::Matrix<Value_, Index_> > read_layered_sparse_from_matrix_market_some_file(const char* filepath, const ReadLayeredSparseFromMatrixMarketOptions& options) {
271 return read_layered_sparse_from_matrix_market<Value_, Index_, ColumnIndex_>(
272 [&]() -> auto {
273 return byteme::SomeFileReader(filepath, [&]{
274 byteme::SomeFileReaderOptions opt;
275 opt.buffer_size = options.buffer_size;
276 return opt;
277 }());
278 },
279 check_chunk_size<Index_, ColumnIndex_>(options.chunk_size),
280 options.num_threads
281 );
282}
283
298template<typename Value_ = double, typename Index_ = int, typename ColumnIndex_ = std::uint16_t>
299std::shared_ptr<tatami::Matrix<Value_, Index_> > read_layered_sparse_from_matrix_market_gzip_file(const char* filepath, const ReadLayeredSparseFromMatrixMarketOptions& options) {
300 return read_layered_sparse_from_matrix_market<Value_, Index_, ColumnIndex_>(
301 [&]() -> auto {
302 return byteme::GzipFileReader(filepath, [&]{
303 byteme::GzipFileReaderOptions opt;
304 opt.buffer_size = options.buffer_size;
305 return opt;
306 }());
307 },
308 check_chunk_size<Index_, ColumnIndex_>(options.chunk_size),
309 options.num_threads
310 );
311}
312
316// Back-compatibility.
317template<typename Value_ = double, typename Index_ = int, typename ColumnIndex_ = std::uint16_t>
318std::shared_ptr<tatami::Matrix<Value_, Index_> > read_layered_sparse_from_matrix_market_some_file(const char* filepath, Index_ chunk_size = 65536, std::size_t buffer_size = 65536) {
320 ReadLayeredSparseFromMatrixMarketOptions opt;
321 opt.chunk_size = chunk_size;
322 opt.buffer_size = buffer_size;
323 return opt;
324 }());
325}
326
// Legacy overload: takes the chunk and buffer sizes as separate arguments and packs
// them into a ReadLayeredSparseFromMatrixMarketOptions. chunk_size is an Index_ that
// is implicitly converted on assignment to opt.chunk_size.
327template<typename Value_ = double, typename Index_ = int, typename ColumnIndex_ = std::uint16_t>
328std::shared_ptr<tatami::Matrix<Value_, Index_> > read_layered_sparse_from_matrix_market_gzip_file(const char* filepath, Index_ chunk_size = 65536, std::size_t buffer_size = 65536) {
// NOTE(review): the line that opens the returned call and the immediately-invoked
// lambda is missing from this listing; presumably it forwards filepath to the
// options-based overload of the same name - confirm.
330 ReadLayeredSparseFromMatrixMarketOptions opt;
331 opt.chunk_size = chunk_size;
332 opt.buffer_size = buffer_size;
333 return opt;
334 }());
335}
341#endif
342
// Reads a layered sparse matrix from an in-memory Matrix Market buffer via
// byteme::RawBufferReader. The chunk size is validated by check_chunk_size() and
// options.num_threads is forwarded to the core reader.
358template<typename Value_ = double, typename Index_ = int, typename ColumnIndex_ = std::uint16_t>
359std::shared_ptr<tatami::Matrix<Value_, Index_> > read_layered_sparse_from_matrix_market_text_buffer(
360 const unsigned char* contents,
361 std::size_t length,
// NOTE(review): the final parameter line is missing from this listing; the body
// uses `options`, presumably a const ReadLayeredSparseFromMatrixMarketOptions& - confirm.
363{
364 return read_layered_sparse_from_matrix_market<Value_, Index_, ColumnIndex_>(
// Reader factory; contents and length are captured by reference. No reader options
// are built here, so options.buffer_size is not used by this function.
365 [&]() -> auto {
366 return byteme::RawBufferReader(contents, length);
367 },
368 check_chunk_size<Index_, ColumnIndex_>(options.chunk_size),
369 options.num_threads
370 );
371}
372
376// Back-compatibility.
// Legacy overload: takes the chunk size as a separate argument and packs it into a
// ReadLayeredSparseFromMatrixMarketOptions, leaving the other fields at their defaults.
// chunk_size is an Index_ that is implicitly converted on assignment to opt.chunk_size.
377template<typename Value_ = double, typename Index_ = int, typename ColumnIndex_ = std::uint16_t>
378std::shared_ptr<tatami::Matrix<Value_, Index_> > read_layered_sparse_from_matrix_market_text_buffer(const unsigned char* contents, std::size_t length, Index_ chunk_size = 65536) {
// NOTE(review): the line that opens the returned call and the immediately-invoked
// lambda is missing from this listing; presumably it forwards contents and length
// to the options-based overload of the same name - confirm.
380 ReadLayeredSparseFromMatrixMarketOptions opt;
381 opt.chunk_size = chunk_size;
382 return opt;
383 }());
384}
390#if __has_include("zlib.h")
391
// Reads a layered sparse matrix from an in-memory buffer via byteme::SomeBufferReader.
// The chunk size is validated by check_chunk_size() and options.num_threads is
// forwarded to the core reader.
407template<typename Value_ = double, typename Index_ = int, typename ColumnIndex_ = std::uint16_t>
408std::shared_ptr<tatami::Matrix<Value_, Index_> > read_layered_sparse_from_matrix_market_some_buffer(
409 const unsigned char* contents,
410 std::size_t length,
// NOTE(review): the final parameter line is missing from this listing; the body
// uses `options`, presumably a const ReadLayeredSparseFromMatrixMarketOptions& - confirm.
412{
413 return read_layered_sparse_from_matrix_market<Value_, Index_, ColumnIndex_>(
// Reader factory; the inner immediately-invoked lambda builds the reader options
// with buffer_size taken from options.
414 [&]() -> auto {
415 return byteme::SomeBufferReader(contents, length, [&]{
416 byteme::SomeBufferReaderOptions opt;
417 opt.buffer_size = options.buffer_size;
418 return opt;
419 }());
420 },
421 check_chunk_size<Index_, ColumnIndex_>(options.chunk_size),
422 options.num_threads
423 );
424}
425
// Reads a layered sparse matrix from an in-memory buffer via byteme::ZlibBufferReader.
// The chunk size is validated by check_chunk_size() and options.num_threads is
// forwarded to the core reader.
441template<typename Value_ = double, typename Index_ = int, typename ColumnIndex_ = std::uint16_t>
442std::shared_ptr<tatami::Matrix<Value_, Index_> > read_layered_sparse_from_matrix_market_zlib_buffer(
443 const unsigned char* contents,
444 std::size_t length,
// NOTE(review): the final parameter line is missing from this listing; the body
// uses `options`, presumably a const ReadLayeredSparseFromMatrixMarketOptions& - confirm.
446{
447 return read_layered_sparse_from_matrix_market<Value_, Index_, ColumnIndex_>(
// Reader factory; the inner immediately-invoked lambda builds the reader options
// with buffer_size taken from options.
448 [&]() -> auto {
449 return byteme::ZlibBufferReader(contents, length, [&]{
450 byteme::ZlibBufferReaderOptions opt;
451 opt.buffer_size = options.buffer_size;
452 return opt;
453 }());
454 },
455 check_chunk_size<Index_, ColumnIndex_>(options.chunk_size),
456 options.num_threads
457 );
458}
459
463// Back-compatibility.
// Legacy overload: takes the chunk and buffer sizes as separate arguments and packs
// them into a ReadLayeredSparseFromMatrixMarketOptions. chunk_size is an Index_ that
// is implicitly converted on assignment to opt.chunk_size.
464template<typename Value_ = double, typename Index_ = int, typename ColumnIndex_ = std::uint16_t>
465std::shared_ptr<tatami::Matrix<Value_, Index_> > read_layered_sparse_from_matrix_market_some_buffer(
466 const unsigned char* contents,
467 std::size_t length,
468 Index_ chunk_size = 65536,
469 std::size_t buffer_size = 65536)
470{
// NOTE(review): the line that opens the returned call and the immediately-invoked
// lambda is missing from this listing; presumably it forwards contents and length
// to the options-based overload of the same name - confirm.
472 ReadLayeredSparseFromMatrixMarketOptions opt;
473 opt.chunk_size = chunk_size;
474 opt.buffer_size = buffer_size;
475 return opt;
476 }());
477}
478
// Legacy overload: takes the chunk and buffer sizes as separate arguments and packs
// them into a ReadLayeredSparseFromMatrixMarketOptions. chunk_size is an Index_ that
// is implicitly converted on assignment to opt.chunk_size.
479template<typename Value_ = double, typename Index_ = int, typename ColumnIndex_ = std::uint16_t>
480std::shared_ptr<tatami::Matrix<Value_, Index_> > read_layered_sparse_from_matrix_market_zlib_buffer(
481 const unsigned char* contents,
482 std::size_t length,
483 Index_ chunk_size = 65536,
484 std::size_t buffer_size = 65536)
485{
// NOTE(review): the line that opens the returned call and the immediately-invoked
// lambda is missing from this listing; presumably it forwards contents and length
// to the options-based overload of the same name - confirm.
487 ReadLayeredSparseFromMatrixMarketOptions opt;
488 opt.chunk_size = chunk_size;
489 opt.buffer_size = buffer_size;
490 return opt;
491 }());
492}
498#endif
499
500}
501
502#endif
Create layered sparse matrices for tatami.
Definition convert_to_layered_sparse.hpp:20
std::shared_ptr< tatami::Matrix< Value_, Index_ > > read_layered_sparse_from_matrix_market_some_buffer(const unsigned char *contents, std::size_t length, const ReadLayeredSparseFromMatrixMarketOptions &options)
Definition read_layered_sparse_from_matrix_market.hpp:408
std::shared_ptr< tatami::Matrix< Value_, Index_ > > read_layered_sparse_from_matrix_market_some_file(const char *filepath, const ReadLayeredSparseFromMatrixMarketOptions &options)
Definition read_layered_sparse_from_matrix_market.hpp:270
std::shared_ptr< tatami::Matrix< Value_, Index_ > > read_layered_sparse_from_matrix_market_gzip_file(const char *filepath, const ReadLayeredSparseFromMatrixMarketOptions &options)
Definition read_layered_sparse_from_matrix_market.hpp:299
std::shared_ptr< tatami::Matrix< Value_, Index_ > > read_layered_sparse_from_matrix_market_text_buffer(const unsigned char *contents, std::size_t length, const ReadLayeredSparseFromMatrixMarketOptions &options)
Definition read_layered_sparse_from_matrix_market.hpp:359
std::shared_ptr< tatami::Matrix< Value_, Index_ > > read_layered_sparse_from_matrix_market_zlib_buffer(const unsigned char *contents, std::size_t length, const ReadLayeredSparseFromMatrixMarketOptions &options)
Definition read_layered_sparse_from_matrix_market.hpp:442
std::shared_ptr< tatami::Matrix< Value_, Index_ > > read_layered_sparse_from_matrix_market_text_file(const char *filepath, const ReadLayeredSparseFromMatrixMarketOptions &options)
Definition read_layered_sparse_from_matrix_market.hpp:222
void resize_container_to_Index_size(Container_ &container, const Index_ x, Args_ &&... args)
Container_ create_container_of_Index_size(const Index_ x, Args_ &&... args)
Definition read_layered_sparse_from_matrix_market.hpp:190
std::size_t buffer_size
Definition read_layered_sparse_from_matrix_market.hpp:199
std::size_t chunk_size
Definition read_layered_sparse_from_matrix_market.hpp:194
int num_threads
Definition read_layered_sparse_from_matrix_market.hpp:204