tatami_stats
Matrix statistics for tatami
Loading...
Searching...
No Matches
grouped_variances.hpp
Go to the documentation of this file.
1#ifndef TATAMI_STATS_GROUPED_VARIANCES_HPP
2#define TATAMI_STATS_GROUPED_VARIANCES_HPP
3
4#include "utils.hpp"
5#include "variances.hpp"
6
7#include <vector>
8#include <algorithm>
9#include <cstddef>
10
11#include "tatami/tatami.hpp"
12#include "sanisizer/sanisizer.hpp"
13
20namespace tatami_stats {
21
26namespace grouped_variances {
27
31struct Options {
36 bool skip_nan = false;
37
42 int num_threads = 1;
43};
44
48namespace internal {
49
50template<typename Index_, typename Output_>
51void finish_means(std::size_t num_groups, const Index_* group_size, Output_* output_means) {
52 for (I<decltype(num_groups)> b = 0; b < num_groups; ++b) {
53 if (group_size[b]) {
54 output_means[b] /= group_size[b];
55 } else {
56 output_means[b] = std::numeric_limits<Output_>::quiet_NaN();
57 }
58 }
59}
60
61template<typename Index_, typename Output_>
62void finish_variances(std::size_t num_groups, const Index_* group_size, Output_* output_variances) {
63 for (I<decltype(num_groups)> b = 0; b < num_groups; ++b) {
64 if (group_size[b] > 1) {
65 output_variances[b] /= group_size[b] - 1;
66 } else {
67 output_variances[b] = std::numeric_limits<Output_>::quiet_NaN();
68 }
69 }
70}
71
72}
106template<typename Value_, typename Index_, typename Group_, typename Output_>
108 const Value_* ptr,
109 Index_ num,
110 const Group_* group,
111 std::size_t num_groups,
112 const Index_* group_size,
113 Output_* output_means,
114 Output_* output_variances,
115 bool skip_nan,
116 Index_* valid_group_size)
117{
118 std::fill_n(output_means, num_groups, 0);
119 std::fill_n(output_variances, num_groups, 0);
120
121 ::tatami_stats::internal::nanable_ifelse<Value_>(
122 skip_nan,
123 [&]() -> void {
124 std::fill_n(valid_group_size, num_groups, 0);
125
126 for (Index_ j = 0; j < num; ++j) {
127 auto x = ptr[j];
128 if (!std::isnan(x)) {
129 auto b = group[j];
130 output_means[b] += x;
131 ++valid_group_size[b];
132 }
133 }
134 internal::finish_means(num_groups, valid_group_size, output_means);
135
136 for (Index_ j = 0; j < num; ++j) {
137 auto x = ptr[j];
138 if (!std::isnan(x)) {
139 auto b = group[j];
140 auto delta = x - output_means[b];
141 output_variances[b] += delta * delta;
142 }
143 }
144 internal::finish_variances(num_groups, valid_group_size, output_variances);
145 },
146 [&]() -> void {
147 for (Index_ j = 0; j < num; ++j) {
148 output_means[group[j]] += ptr[j];
149 }
150 internal::finish_means(num_groups, group_size, output_means);
151
152 for (Index_ j = 0; j < num; ++j) {
153 auto b = group[j];
154 auto delta = ptr[j] - output_means[b];
155 output_variances[b] += delta * delta;
156 }
157 internal::finish_variances(num_groups, group_size, output_variances);
158 }
159 );
160}
161
195template<typename Value_, typename Index_, typename Group_, typename Output_>
197 const Value_* value,
198 const Index_* index,
199 Index_ num_nonzero,
200 const Group_* group,
201 std::size_t num_groups,
202 const Index_* group_size,
203 Output_* output_means,
204 Output_* output_variances,
205 Index_* output_nonzero,
206 bool skip_nan,
207 Index_* valid_group_size)
208{
209 std::fill_n(output_means, num_groups, 0);
210 std::fill_n(output_nonzero, num_groups, 0);
211 std::fill_n(output_variances, num_groups, 0);
212
213 ::tatami_stats::internal::nanable_ifelse<Value_>(
214 skip_nan,
215 [&]() -> void {
216 std::copy_n(group_size, num_groups, valid_group_size);
217
218 for (Index_ j = 0; j < num_nonzero; ++j) {
219 auto x = value[j];
220 auto b = group[index[j]];
221 if (!std::isnan(x)) {
222 output_means[b] += x;
223 ++(output_nonzero[b]);
224 } else {
225 --(valid_group_size[b]);
226 }
227 }
228 internal::finish_means(num_groups, valid_group_size, output_means);
229
230 for (Index_ j = 0; j < num_nonzero; ++j) {
231 auto x = value[j];
232 if (!std::isnan(x)) {
233 auto b = group[index[j]];
234 auto delta = x - output_means[b];
235 output_variances[b] += delta * delta;
236 }
237 }
238 for (I<decltype(num_groups)> b = 0; b < num_groups; ++b) {
239 output_variances[b] += output_means[b] * output_means[b] * (valid_group_size[b] - output_nonzero[b]);
240 }
241 internal::finish_variances(num_groups, valid_group_size, output_variances);
242 },
243 [&]() -> void {
244 for (Index_ j = 0; j < num_nonzero; ++j) {
245 auto b = group[index[j]];
246 output_means[b] += value[j];
247 ++output_nonzero[b];
248 }
249 internal::finish_means(num_groups, group_size, output_means);
250
251 for (Index_ j = 0; j < num_nonzero; ++j) {
252 auto b = group[index[j]];
253 auto delta = value[j] - output_means[b];
254 output_variances[b] += delta * delta;
255 }
256 for (I<decltype(num_groups)> b = 0; b < num_groups; ++b) {
257 output_variances[b] += output_means[b] * output_means[b] * (group_size[b] - output_nonzero[b]);
258 }
259 internal::finish_variances(num_groups, group_size, output_variances);
260 }
261 );
262}
263
286template<typename Value_, typename Index_, typename Group_, typename Output_>
287void apply(bool row, const tatami::Matrix<Value_, Index_>& mat, const Group_* group, std::size_t num_groups, const Index_* group_size, Output_** output, const Options& sopt) {
288 const Index_ dim = (row ? mat.nrow() : mat.ncol());
289 const Index_ otherdim = (row ? mat.ncol() : mat.nrow());
290
291 if (mat.sparse()) {
292 if (mat.prefer_rows() == row) {
293 tatami::parallelize([&](int, Index_ start, Index_ len) -> void {
294 auto ext = tatami::consecutive_extractor<true>(mat, row, start, len);
297
298 auto tmp_means = sanisizer::create<std::vector<Output_> >(num_groups);
299 auto output_variances = sanisizer::create<std::vector<Output_> >(num_groups);
300 auto tmp_nonzero = sanisizer::create<std::vector<Index_> >(num_groups);
301 auto valid_group_size = sanisizer::create<std::vector<Index_> >(sopt.skip_nan ? num_groups : 0);
302
303 for (Index_ i = 0; i < len; ++i) {
304 auto range = ext->fetch(xbuffer.data(), ibuffer.data());
305 direct(
306 range.value,
307 range.index,
308 range.number,
309 group,
310 num_groups,
311 group_size,
312 tmp_means.data(),
313 output_variances.data(),
314 tmp_nonzero.data(),
315 sopt.skip_nan,
316 valid_group_size.data()
317 );
318
319 for (I<decltype(num_groups)> g = 0; g < num_groups; ++g) {
320 output[g][i + start] = output_variances[g];
321 }
322 }
323 }, dim, sopt.num_threads);
324
325 } else {
326 // Order within each observed vector doesn't affect numerical
327 // precision of the outcome, as addition order for each objective
328 // vector is already well-defined for a running calculation.
329 tatami::Options opt;
330 opt.sparse_ordered_index = false;
331
332 tatami::parallelize([&](int thread, Index_ start, Index_ len) -> void {
333 std::vector<variances::RunningSparse<Output_, Value_, Index_> > runners;
334 runners.reserve(num_groups);
335 std::vector<LocalOutputBuffer<Output_> > local_var_output;
336 local_var_output.reserve(num_groups);
337 std::vector<std::vector<Output_> > local_mean_output;
338 local_mean_output.reserve(num_groups);
339
340 for (I<decltype(num_groups)> g = 0; g < num_groups; ++g) {
341 local_var_output.emplace_back(thread, start, len, output[g]);
342 local_mean_output.emplace_back(len);
343 runners.emplace_back(len, local_mean_output.back().data(), local_var_output.back().data(), sopt.skip_nan, start);
344 }
345
346 auto ext = tatami::consecutive_extractor<true>(mat, !row, static_cast<Index_>(0), otherdim, start, len, opt);
349
350 for (Index_ i = 0; i < otherdim; ++i) {
351 auto range = ext->fetch(xbuffer.data(), ibuffer.data());
352 runners[group[i]].add(range.value, range.index, range.number);
353 }
354
355 for (I<decltype(num_groups)> g = 0; g < num_groups; ++g) {
356 runners[g].finish();
357 local_var_output[g].transfer();
358 }
359 }, dim, sopt.num_threads);
360 }
361
362 } else {
363 if (mat.prefer_rows() == row) {
364 tatami::parallelize([&](int, Index_ start, Index_ len) -> void {
365 auto ext = tatami::consecutive_extractor<false>(mat, row, start, len);
367
368 auto tmp_means = sanisizer::create<std::vector<Output_> >(num_groups);
369 auto output_variances = sanisizer::create<std::vector<Output_> >(num_groups);
370 auto valid_group_size = sanisizer::create<std::vector<Index_> >(sopt.skip_nan ? num_groups : 0);
371
372 for (Index_ i = 0; i < len; ++i) {
373 auto ptr = ext->fetch(xbuffer.data());
374 direct(
375 ptr,
376 otherdim,
377 group,
378 num_groups,
379 group_size,
380 tmp_means.data(),
381 output_variances.data(),
382 sopt.skip_nan,
383 valid_group_size.data()
384 );
385
386 for (I<decltype(num_groups)> g = 0; g < num_groups; ++g) {
387 output[g][i + start] = output_variances[g];
388 }
389 }
390 }, dim, sopt.num_threads);
391
392 } else {
393 tatami::parallelize([&](int thread, Index_ start, Index_ len) -> void {
394 std::vector<variances::RunningDense<Output_, Value_, Index_> > runners;
395 runners.reserve(num_groups);
396 std::vector<LocalOutputBuffer<Output_> > local_var_output;
397 local_var_output.reserve(num_groups);
398 std::vector<std::vector<Output_> > local_mean_output;
399 local_mean_output.reserve(num_groups);
400
401 for (I<decltype(num_groups)> g = 0; g < num_groups; ++g) {
402 local_var_output.emplace_back(thread, start, len, output[g]);
403 local_mean_output.emplace_back(len);
404 runners.emplace_back(len, local_mean_output.back().data(), local_var_output.back().data(), sopt.skip_nan);
405 }
406
408 auto ext = tatami::consecutive_extractor<false>(mat, !row, static_cast<Index_>(0), otherdim, start, len);
409
410 for (Index_ i = 0; i < otherdim; ++i) {
411 auto ptr = ext->fetch(xbuffer.data());
412 runners[group[i]].add(ptr);
413 }
414
415 for (I<decltype(num_groups)> g = 0; g < num_groups; ++g) {
416 runners[g].finish();
417 local_var_output[g].transfer();
418 }
419 }, dim, sopt.num_threads);
420 }
421 }
422}
423
427// Back-compatibility.
428template<typename Value_, typename Index_, typename Group_, typename Output_>
429void apply(bool row, const tatami::Matrix<Value_, Index_>* p, const Group_* group, std::size_t num_groups, const Index_* group_size, Output_** output, const Options& sopt) {
430 apply(row, *p, group, num_groups, group_size, output, sopt);
431}
454template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
455std::vector<std::vector<Output_> > by_row(const tatami::Matrix<Value_, Index_>& mat, const Group_* group, const Options& sopt) {
456 auto mydim = mat.nrow();
457 auto group_size = tabulate_groups(group, mat.ncol());
458 auto ngroup = group_size.size();
459
460 auto output = sanisizer::create<std::vector<std::vector<Output_> > >(ngroup);
461 std::vector<Output_*> ptrs;
462 ptrs.reserve(output.size());
463 for (auto& o : output) {
464 o.resize(mydim);
465 ptrs.push_back(o.data());
466 }
467
468 apply(true, mat, group, ngroup, group_size.data(), ptrs.data(), sopt);
469 return output;
470}
471
475// Back-compatibility.
476template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
477std::vector<std::vector<Output_> > by_row(const tatami::Matrix<Value_, Index_>* p, const Group_* group, const Options& sopt) {
478 return by_row<Output_>(*p, group, sopt);
479}
480
481template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
482std::vector<std::vector<Output_> > by_row(const tatami::Matrix<Value_, Index_>& mat, const Group_* group) {
483 return by_row<Output_>(mat, group, Options());
484}
485
486template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
487std::vector<std::vector<Output_> > by_row(const tatami::Matrix<Value_, Index_>* p, const Group_* group) {
488 return by_row<Output_>(*p, group);
489}
512template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
513std::vector<std::vector<Output_> > by_column(const tatami::Matrix<Value_, Index_>& mat, const Group_* group, const Options& sopt) {
514 auto mydim = mat.ncol();
515 auto group_size = tabulate_groups(group, mat.nrow());
516 auto ngroup = group_size.size();
517
518 auto output = sanisizer::create<std::vector<std::vector<Output_> > >(ngroup);
519 std::vector<Output_*> ptrs;
520 ptrs.reserve(output.size());
521 for (auto& o : output) {
522 o.resize(mydim);
523 ptrs.push_back(o.data());
524 }
525
526 apply(false, mat, group, ngroup, group_size.data(), ptrs.data(), sopt);
527 return output;
528}
529
533// Back-compatibility.
534template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
535std::vector<std::vector<Output_> > by_column(const tatami::Matrix<Value_, Index_>* p, const Group_* group, const Options& sopt) {
536 return by_column<Output_>(*p, group, sopt);
537}
538
539template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
540std::vector<std::vector<Output_> > by_column(const tatami::Matrix<Value_, Index_>& mat, const Group_* group) {
541 return by_column<Output_>(mat, group, Options());
542}
543
544template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
545std::vector<std::vector<Output_> > by_column(const tatami::Matrix<Value_, Index_>* p, const Group_* group) {
546 return by_column<Output_>(*p, group);
547}
552}
553
554}
555
556#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
void direct(const Value_ *ptr, Index_ num, const Group_ *group, std::size_t num_groups, const Index_ *group_size, Output_ *output_means, Output_ *output_variances, bool skip_nan, Index_ *valid_group_size)
Definition grouped_variances.hpp:107
void apply(bool row, const tatami::Matrix< Value_, Index_ > &mat, const Group_ *group, std::size_t num_groups, const Index_ *group_size, Output_ **output, const Options &sopt)
Definition grouped_variances.hpp:287
std::vector< std::vector< Output_ > > by_column(const tatami::Matrix< Value_, Index_ > &mat, const Group_ *group, const Options &sopt)
Definition grouped_variances.hpp:513
std::vector< std::vector< Output_ > > by_row(const tatami::Matrix< Value_, Index_ > &mat, const Group_ *group, const Options &sopt)
Definition grouped_variances.hpp:455
Functions to compute statistics from a tatami::Matrix.
Definition counts.hpp:18
std::vector< Size_ > tabulate_groups(const Group_ *group, Size_ n)
Definition utils.hpp:64
void parallelize(Function_ fun, const Index_ tasks, const int threads)
Container_ create_container_of_Index_size(const Index_ x, Args_ &&... args)
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, const bool row, const Index_ iter_start, const Index_ iter_length, Args_ &&... args)
bool sparse_ordered_index
Grouped summation options.
Definition grouped_variances.hpp:31
bool skip_nan
Definition grouped_variances.hpp:36
int num_threads
Definition grouped_variances.hpp:42
Utilities for computing matrix statistics.
Compute row and column variances from a tatami::Matrix.