tatami_stats
Matrix statistics for tatami
Loading...
Searching...
No Matches
variances.hpp
Go to the documentation of this file.
1#ifndef TATAMI_STATS_VARS_HPP
2#define TATAMI_STATS_VARS_HPP
3
4#include "utils.hpp"
5
#include <vector>
#include <cmath>
#include <numeric>
#include <limits>
#include <algorithm>
#include <cstddef>
#include <type_traits>
#include <utility>
12
13#include "tatami/tatami.hpp"
14
21namespace tatami_stats {
22
27namespace variances {
28
/**
 * @brief Variance calculation options.
 */
struct Options {
    /**
     * Whether to skip NaN values when computing means and variances.
     */
    bool skip_nan = false;

    /**
     * Number of threads to use for the computation.
     */
    int num_threads = 1;
};
45
49namespace internal {
50
// One step of Welford's online algorithm: fold 'value' into the running
// 'mean' and running sum of squared deviations 'sumsq', where 'count' is the
// number of observations seen so far, including this one.
template<typename Output_ = double, typename Value_, typename Index_ >
void add_welford(Output_& mean, Output_& sumsq, Value_ value, Index_ count) {
    const Output_ dev_old = value - mean; // deviation from the previous mean.
    mean += dev_old / count;
    sumsq += dev_old * (value - mean); // deviation from old times deviation from new.
}
57
// Finalization step for sparse running statistics: fold the structural zeros
// into a mean/sumsq pair that was accumulated over the non-zero values only.
// 'num_nonzero' of the 'num_all' observations contributed to 'mean'/'sumsq'.
template<typename Output_ = double, typename Index_ >
void add_welford_zeros(Output_& mean, Output_& sumsq, Index_ num_nonzero, Index_ num_all) {
    const Output_ frac = static_cast<Output_>(num_nonzero) / static_cast<Output_>(num_all);
    // Each zero deviates from the final mean by (mean * frac), but this
    // factored form matches the standard mean-shift correction exactly.
    sumsq += mean * mean * frac * (num_all - num_nonzero);
    mean *= frac;
}
64
// Zero-cost stand-in for std::vector when per-element NaN tracking is
// compile-time disabled; avoids problems from interactions between
// constexpr/lambda/std::conditional.
template<typename Index_>
struct MockVector {
    MockVector(std::size_t) {}
    Index_ out = 0; // single dummy slot, aliased by every index.
    std::size_t size() { return 0; }
    Index_& operator[](std::size_t) { return out; }
};
73
74}
/**
 * Compute the mean and variance from an array of values with a direct
 * two-pass algorithm. Structural zeros that are not explicitly stored are
 * accounted for via the difference between `num_all` and `num_nonzero`.
 *
 * @tparam Output_ Type of the output data.
 * @tparam Value_ Type of the input data.
 * @tparam Index_ Integer type of the row/column indices.
 *
 * @param[in] value Pointer to an array of length `num_nonzero`, containing the
 * explicitly stored values.
 * @param num_nonzero Number of explicitly stored values.
 * @param num_all Total number of values, including structural zeros.
 * @param skip_nan Whether to skip NaN values.
 *
 * @return Pair containing the mean and sample variance; either may be NaN if
 * there are not enough usable observations.
 */
template<typename Output_ = double, typename Value_, typename Index_ >
std::pair<Output_, Output_> direct(const Value_* value, Index_ num_nonzero, Index_ num_all, bool skip_nan) {
    Output_ mean = 0;
    Index_ lost = 0; // number of NaNs encountered; only incremented when skipping.

    // First pass: accumulate the sum. nanable_ifelse() runs the first lambda
    // when NaN handling is requested (and Value_ can hold NaNs), otherwise
    // the second, cheaper lambda.
    ::tatami_stats::internal::nanable_ifelse<Value_>(
        skip_nan,
        [&]() -> void {
            auto copy = value;
            for (Index_ i = 0; i < num_nonzero; ++i, ++copy) {
                auto val = *copy;
                if (std::isnan(val)) {
                    ++lost;
                } else {
                    mean += val;
                }
            }
        },
        [&]() -> void {
            auto copy = value;
            for (Index_ i = 0; i < num_nonzero; ++i, ++copy) {
                mean += *copy;
            }
        }
    );

    // Structural zeros add nothing to the sum but do count as observations.
    auto count = num_all - lost;
    mean /= count; // floating-point division: yields NaN when count == 0.

    // Second pass: accumulate squared deviations from the mean.
    Output_ var = 0;
    ::tatami_stats::internal::nanable_ifelse<Value_>(
        skip_nan,
        [&]() -> void {
            for (Index_ i = 0; i < num_nonzero; ++i) {
                auto val = value[i];
                if (!std::isnan(val)) {
                    auto delta = static_cast<Output_>(val) - mean;
                    var += delta * delta;
                }
            }
        },
        [&]() -> void {
            for (Index_ i = 0; i < num_nonzero; ++i) {
                auto delta = static_cast<Output_>(value[i]) - mean;
                var += delta * delta;
            }
        }
    );

    // Each structural zero contributes (0 - mean)^2 = mean^2.
    if (num_nonzero < num_all) {
        var += static_cast<Output_>(num_all - num_nonzero) * mean * mean;
    }

    if (count == 0) {
        // No usable observations: both statistics are undefined.
        return std::make_pair(std::numeric_limits<Output_>::quiet_NaN(), std::numeric_limits<Output_>::quiet_NaN());
    } else if (count == 1) {
        // Sample variance requires at least two observations.
        return std::make_pair(mean, std::numeric_limits<Output_>::quiet_NaN());
    } else {
        return std::make_pair(mean, var / (count - 1)); // (n - 1) denominator.
    }
}
158
/**
 * Overload of `direct()` for dense arrays, where every value is explicitly
 * stored.
 *
 * @param[in] ptr Pointer to an array of length `num`.
 * @param num Length of the array.
 * @param skip_nan Whether to skip NaN values.
 *
 * @return Pair containing the mean and sample variance.
 */
template<typename Output_ = double, typename Value_, typename Index_ >
std::pair<Output_, Output_> direct(const Value_* ptr, Index_ num, bool skip_nan) {
    // Dense data is the sparse case where all 'num' values are "non-zero".
    return direct<Output_>(ptr, num, num, skip_nan);
}
179
194template<typename Output_, typename Value_, typename Index_>
196public:
205 RunningDense(Index_ num, Output_* mean, Output_* variance, bool skip_nan) :
206 my_num(num),
207 my_mean(mean),
208 my_variance(variance),
209 my_skip_nan(skip_nan),
210 my_ok_count(skip_nan ? tatami::can_cast_Index_to_container_size<decltype(my_ok_count)>(num) : static_cast<Index_>(0))
211 {}
212
217 void add(const Value_* ptr) {
218 ::tatami_stats::internal::nanable_ifelse<Value_>(
219 my_skip_nan,
220 [&]() -> void {
221 for (Index_ i = 0; i < my_num; ++i, ++ptr) {
222 auto val = *ptr;
223 if (!std::isnan(val)) {
224 internal::add_welford(my_mean[i], my_variance[i], val, ++(my_ok_count[i]));
225 }
226 }
227 },
228 [&]() -> void {
229 ++my_count;
230 for (Index_ i = 0; i < my_num; ++i, ++ptr) {
231 internal::add_welford(my_mean[i], my_variance[i], *ptr, my_count);
232 }
233 }
234 );
235 }
236
240 void finish() {
241 ::tatami_stats::internal::nanable_ifelse<Value_>(
242 my_skip_nan,
243 [&]() -> void {
244 for (Index_ i = 0; i < my_num; ++i) {
245 auto ct = my_ok_count[i];
246 if (ct < 2) {
247 my_variance[i] = std::numeric_limits<Output_>::quiet_NaN();
248 if (ct == 0) {
249 my_mean[i] = std::numeric_limits<Output_>::quiet_NaN();
250 }
251 } else {
252 my_variance[i] /= ct - 1;
253 }
254 }
255 },
256 [&]() -> void {
257 if (my_count < 2) {
258 std::fill_n(my_variance, my_num, std::numeric_limits<Output_>::quiet_NaN());
259 if (my_count == 0) {
260 std::fill_n(my_mean, my_num, std::numeric_limits<Output_>::quiet_NaN());
261 }
262 } else {
263 for (Index_ i = 0; i < my_num; ++i) {
264 my_variance[i] /= my_count - 1;
265 }
266 }
267 }
268 );
269 }
270
271private:
272 Index_ my_num;
273 Output_* my_mean;
274 Output_* my_variance;
275 bool my_skip_nan;
276 Index_ my_count = 0;
277 typename std::conditional<std::numeric_limits<Value_>::has_quiet_NaN, std::vector<Index_>, internal::MockVector<Index_> >::type my_ok_count;
278};
279
290template<typename Output_, typename Value_, typename Index_>
292public:
304 RunningSparse(Index_ num, Output_* mean, Output_* variance, bool skip_nan, Index_ subtract = 0) :
305 my_num(num),
306 my_mean(mean),
307 my_variance(variance),
308 my_nonzero(tatami::can_cast_Index_to_container_size<decltype(my_nonzero)>(num)),
309 my_skip_nan(skip_nan),
310 my_subtract(subtract),
311 my_nan(skip_nan ? tatami::can_cast_Index_to_container_size<decltype(my_nan)>(num) : static_cast<Index_>(0))
312 {}
313
320 void add(const Value_* value, const Index_* index, Index_ number) {
321 ++my_count;
322
323 ::tatami_stats::internal::nanable_ifelse<Value_>(
324 my_skip_nan,
325 [&]() -> void {
326 for (Index_ i = 0; i < number; ++i) {
327 auto val = value[i];
328 auto ri = index[i] - my_subtract;
329 if (std::isnan(val)) {
330 ++my_nan[ri];
331 } else {
332 internal::add_welford(my_mean[ri], my_variance[ri], val, ++(my_nonzero[ri]));
333 }
334 }
335 },
336 [&]() -> void {
337 for (Index_ i = 0; i < number; ++i) {
338 auto ri = index[i] - my_subtract;
339 internal::add_welford(my_mean[ri], my_variance[ri], value[i], ++(my_nonzero[ri]));
340 }
341 }
342 );
343 }
344
348 void finish() {
349 ::tatami_stats::internal::nanable_ifelse<Value_>(
350 my_skip_nan,
351 [&]() -> void {
352 for (Index_ i = 0; i < my_num; ++i) {
353 auto& curM = my_mean[i];
354 auto& curV = my_variance[i];
355 Index_ ct = my_count - my_nan[i];
356
357 if (ct < 2) {
358 curV = std::numeric_limits<Output_>::quiet_NaN();
359 if (ct == 0) {
360 curM = std::numeric_limits<Output_>::quiet_NaN();
361 }
362 } else {
363 internal::add_welford_zeros(curM, curV, my_nonzero[i], ct);
364 curV /= ct - 1;
365 }
366 }
367 },
368 [&]() -> void {
369 if (my_count < 2) {
370 std::fill_n(my_variance, my_num, std::numeric_limits<Output_>::quiet_NaN());
371 if (my_count == 0) {
372 std::fill_n(my_mean, my_num, std::numeric_limits<Output_>::quiet_NaN());
373 }
374 } else {
375 for (Index_ i = 0; i < my_num; ++i) {
376 auto& var = my_variance[i];
377 internal::add_welford_zeros(my_mean[i], var, my_nonzero[i], my_count);
378 var /= my_count - 1;
379 }
380 }
381 }
382 );
383 }
384
385private:
386 Index_ my_num;
387 Output_* my_mean;
388 Output_* my_variance;
389 std::vector<Index_> my_nonzero;
390 bool my_skip_nan;
391 Index_ my_subtract;
392 Index_ my_count = 0;
393 typename std::conditional<std::numeric_limits<Value_>::has_quiet_NaN, std::vector<Index_>, internal::MockVector<Index_> >::type my_nan;
394};
395
412template<typename Value_, typename Index_, typename Output_>
413void apply(bool row, const tatami::Matrix<Value_, Index_>& mat, Output_* output, const Options& vopt) {
414 auto dim = (row ? mat.nrow() : mat.ncol());
415 auto otherdim = (row ? mat.ncol() : mat.nrow());
416 const bool direct = mat.prefer_rows() == row;
417
418 if (mat.sparse()) {
419 if (direct) {
420 tatami::Options opt;
421 opt.sparse_extract_index = false;
422 tatami::parallelize([&](int, Index_ s, Index_ l) -> void {
423 auto ext = tatami::consecutive_extractor<true>(mat, row, s, l);
425 for (Index_ x = 0; x < l; ++x) {
426 auto out = ext->fetch(vbuffer.data(), NULL);
427 output[x + s] = variances::direct<Output_>(out.value, out.number, otherdim, vopt.skip_nan).second;
428 }
429 }, dim, vopt.num_threads);
430
431 } else {
432 tatami::parallelize([&](int thread, Index_ s, Index_ l) -> void {
433 auto ext = tatami::consecutive_extractor<true>(mat, !row, static_cast<Index_>(0), otherdim, s, l);
436
438 LocalOutputBuffer<Output_> local_output(thread, s, l, output);
439 variances::RunningSparse<Output_, Value_, Index_> runner(l, running_means.data(), local_output.data(), vopt.skip_nan, s);
440
441 for (Index_ x = 0; x < otherdim; ++x) {
442 auto out = ext->fetch(vbuffer.data(), ibuffer.data());
443 runner.add(out.value, out.index, out.number);
444 }
445 runner.finish();
446
447 local_output.transfer();
448 }, dim, vopt.num_threads);
449 }
450
451 } else {
452 if (direct) {
453 tatami::parallelize([&](int, Index_ s, Index_ l) -> void {
454 auto ext = tatami::consecutive_extractor<false>(mat, row, s, l);
456 for (Index_ x = 0; x < l; ++x) {
457 auto out = ext->fetch(buffer.data());
458 output[x + s] = variances::direct<Output_>(out, otherdim, vopt.skip_nan).second;
459 }
460 }, dim, vopt.num_threads);
461
462 } else {
463 tatami::parallelize([&](int thread, Index_ s, Index_ l) -> void {
464 auto ext = tatami::consecutive_extractor<false>(mat, !row, static_cast<Index_>(0), otherdim, s, l);
466
468 LocalOutputBuffer<Output_> local_output(thread, s, l, output);
469 variances::RunningDense<Output_, Value_, Index_> runner(l, running_means.data(), local_output.data(), vopt.skip_nan);
470
471 for (Index_ x = 0; x < otherdim; ++x) {
472 runner.add(ext->fetch(buffer.data()));
473 }
474 runner.finish();
475
476 local_output.transfer();
477 }, dim, vopt.num_threads);
478 }
479 }
480}
481
// Back-compatibility: pointer-based overload that forwards to the
// reference-based apply().
template<typename Value_, typename Index_, typename Output_>
void apply(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output, const Options& vopt) {
    apply(row, *p, output, vopt);
}
506template<typename Output_ = double, typename Value_, typename Index_>
507std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>& mat, const Options& vopt) {
509 apply(false, mat, output.data(), vopt);
510 return output;
511}
512
// Back-compatibility: pointer-based overload that forwards to the
// reference-based by_column().
template<typename Output_ = double, typename Value_, typename Index_>
std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p, const Options& vopt) {
    return by_column<Output_>(*p, vopt);
}
521
// Convenience overload with default options.
template<typename Output_ = double, typename Value_, typename Index_>
std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>& mat) {
    return by_column<Output_>(mat, {});
}
526
// Back-compatibility: pointer-based overload with default options.
template<typename Output_ = double, typename Value_, typename Index_>
std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p) {
    return by_column<Output_>(*p);
}
547template<typename Output_ = double, typename Value_, typename Index_>
548std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>& mat, const Options& vopt) {
550 apply(true, mat, output.data(), vopt);
551 return output;
552}
553
// Back-compatibility: pointer-based overload that forwards to the
// reference-based by_row().
template<typename Output_ = double, typename Value_, typename Index_>
std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p, const Options& vopt) {
    return by_row<Output_>(*p, vopt);
}
562
// Convenience overload with default options.
template<typename Output_ = double, typename Value_, typename Index_>
std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>& mat) {
    return by_row<Output_>(mat, {});
}
567
// Back-compatibility: pointer-based overload with default options.
template<typename Output_ = double, typename Value_, typename Index_>
std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p) {
    return by_row<Output_>(*p);
}
576}
577
578}
579
580#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
Local output buffer for running calculations.
Definition utils.hpp:82
Running variances from dense data.
Definition variances.hpp:195
void add(const Value_ *ptr)
Definition variances.hpp:217
void finish()
Definition variances.hpp:240
RunningDense(Index_ num, Output_ *mean, Output_ *variance, bool skip_nan)
Definition variances.hpp:205
Running variances from sparse data.
Definition variances.hpp:291
void finish()
Definition variances.hpp:348
RunningSparse(Index_ num, Output_ *mean, Output_ *variance, bool skip_nan, Index_ subtract=0)
Definition variances.hpp:304
void add(const Value_ *value, const Index_ *index, Index_ number)
Definition variances.hpp:320
std::vector< Output_ > by_column(const tatami::Matrix< Value_, Index_ > &mat, const Options &vopt)
Definition variances.hpp:507
std::pair< Output_, Output_ > direct(const Value_ *value, Index_ num_nonzero, Index_ num_all, bool skip_nan)
Definition variances.hpp:98
void apply(bool row, const tatami::Matrix< Value_, Index_ > &mat, Output_ *output, const Options &vopt)
Definition variances.hpp:413
std::vector< Output_ > by_row(const tatami::Matrix< Value_, Index_ > &mat, const Options &vopt)
Definition variances.hpp:548
Functions to compute statistics from a tatami::Matrix.
Definition counts.hpp:18
void parallelize(Function_ fun, Index_ tasks, int threads)
Container_ create_container_of_Index_size(Index_ x, Args_ &&... args)
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, bool row, Index_ iter_start, Index_ iter_length, Args_ &&... args)
bool sparse_extract_index
Variance calculation options.
Definition variances.hpp:32
int num_threads
Definition variances.hpp:43
bool skip_nan
Definition variances.hpp:37
Utilities for computing matrix statistics.