tatami_stats
Matrix statistics for tatami
Loading...
Searching...
No Matches
variances.hpp
Go to the documentation of this file.
1#ifndef TATAMI_STATS_VARS_HPP
2#define TATAMI_STATS_VARS_HPP
3
#include "tatami/tatami.hpp"
#include "utils.hpp"

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>
#include <numeric>
#include <type_traits>
#include <utility>
#include <vector>
13
20namespace tatami_stats {
21
26namespace variances {
27
/**
 * @brief Variance calculation options.
 */
struct Options {
    /**
     * Whether to ignore NaN values when computing the variances.
     */
    bool skip_nan = false;

    /**
     * Number of threads to use during the computation.
     */
    int num_threads = 1;
};
44
48namespace internal {
49
50template<typename Output_ = double, typename Value_, typename Index_ >
51void add_welford(Output_& mean, Output_& sumsq, Value_ value, Index_ count) {
52 Output_ delta = value - mean;
53 mean += delta / count;
54 sumsq += delta * (value - mean);
55}
56
/**
 * Fold a block of structural zeros into running Welford statistics that were
 * accumulated from the non-zero values only.
 *
 * @param[in,out] mean Running mean over the `num_nonzero` stored values;
 * rescaled to the mean over all `num_all` values.
 * @param[in,out] sumsq Running sum of squared deviations; adjusted for the
 * `num_all - num_nonzero` zeros.
 * @param num_nonzero Number of explicitly stored (non-zero) values.
 * @param num_all Total number of values, including structural zeros.
 */
template<typename Output_ = double, typename Index_ >
void add_welford_zeros(Output_& mean, Output_& sumsq, Index_ num_nonzero, Index_ num_all) {
    const Output_ frac = static_cast<Output_>(num_nonzero) / static_cast<Output_>(num_all);
    sumsq += mean * mean * frac * (num_all - num_nonzero);
    mean *= frac;
}
63
// Avoid problems from interactions between constexpr/lambda/std::conditional.
template<typename Index_>
struct MockVector {
    // Stand-in for std::vector<Index_> when per-element counts are not needed
    // (i.e., Value_ has no quiet NaN): ignores the size argument and exposes a
    // single dummy slot so the indexing syntax still compiles.
    MockVector(std::size_t) {}
    Index_& operator[](std::size_t) { return out; }
    Index_ out = 0;
};
71
72}
95template<typename Output_ = double, typename Value_, typename Index_ >
96std::pair<Output_, Output_> direct(const Value_* value, Index_ num_nonzero, Index_ num_all, bool skip_nan) {
97 Output_ mean = 0;
98 Index_ lost = 0;
99
100 ::tatami_stats::internal::nanable_ifelse<Value_>(
101 skip_nan,
102 [&]() -> void {
103 auto copy = value;
104 for (Index_ i = 0; i < num_nonzero; ++i, ++copy) {
105 auto val = *copy;
106 if (std::isnan(val)) {
107 ++lost;
108 } else {
109 mean += val;
110 }
111 }
112 },
113 [&]() -> void {
114 auto copy = value;
115 for (Index_ i = 0; i < num_nonzero; ++i, ++copy) {
116 mean += *copy;
117 }
118 }
119 );
120
121 auto count = num_all - lost;
122 mean /= count;
123
124 Output_ var = 0;
125 ::tatami_stats::internal::nanable_ifelse<Value_>(
126 skip_nan,
127 [&]() -> void {
128 for (Index_ i = 0; i < num_nonzero; ++i) {
129 auto val = value[i];
130 if (!std::isnan(val)) {
131 auto delta = static_cast<Output_>(val) - mean;
132 var += delta * delta;
133 }
134 }
135 },
136 [&]() -> void {
137 for (Index_ i = 0; i < num_nonzero; ++i) {
138 auto delta = static_cast<Output_>(value[i]) - mean;
139 var += delta * delta;
140 }
141 }
142 );
143
144 if (num_nonzero < num_all) {
145 var += static_cast<Output_>(num_all - num_nonzero) * mean * mean;
146 }
147
148 if (count == 0) {
149 return std::make_pair(std::numeric_limits<Output_>::quiet_NaN(), std::numeric_limits<Output_>::quiet_NaN());
150 } else if (count == 1) {
151 return std::make_pair(mean, std::numeric_limits<Output_>::quiet_NaN());
152 } else {
153 return std::make_pair(mean, var / (count - 1));
154 }
155}
156
/**
 * Overload of `direct()` for a dense array, i.e., where the number of stored
 * values equals the total number of values (no structural zeros).
 *
 * @tparam Output_ Floating-point type of the returned statistics.
 * @param ptr Pointer to an array of `num` values.
 * @param num Number of values.
 * @param skip_nan Whether to ignore NaN values.
 * @return Pair containing the mean and sample variance.
 */
template<typename Output_ = double, typename Value_, typename Index_ >
std::pair<Output_, Output_> direct(const Value_* ptr, Index_ num, bool skip_nan) {
    return direct<Output_>(ptr, num, num, skip_nan);
}
177
192template<typename Output_, typename Value_, typename Index_>
194public:
203 RunningDense(Index_ num, Output_* mean, Output_* variance, bool skip_nan) :
204 my_num(num), my_mean(mean), my_variance(variance), my_skip_nan(skip_nan), my_ok_count(skip_nan ? num : 0) {}
205
210 void add(const Value_* ptr) {
211 ::tatami_stats::internal::nanable_ifelse<Value_>(
212 my_skip_nan,
213 [&]() -> void {
214 for (Index_ i = 0; i < my_num; ++i, ++ptr) {
215 auto val = *ptr;
216 if (!std::isnan(val)) {
217 internal::add_welford(my_mean[i], my_variance[i], val, ++(my_ok_count[i]));
218 }
219 }
220 },
221 [&]() -> void {
222 ++my_count;
223 for (Index_ i = 0; i < my_num; ++i, ++ptr) {
224 internal::add_welford(my_mean[i], my_variance[i], *ptr, my_count);
225 }
226 }
227 );
228 }
229
233 void finish() {
234 ::tatami_stats::internal::nanable_ifelse<Value_>(
235 my_skip_nan,
236 [&]() -> void {
237 for (Index_ i = 0; i < my_num; ++i) {
238 auto ct = my_ok_count[i];
239 if (ct < 2) {
240 my_variance[i] = std::numeric_limits<Output_>::quiet_NaN();
241 if (ct == 0) {
242 my_mean[i] = std::numeric_limits<Output_>::quiet_NaN();
243 }
244 } else {
245 my_variance[i] /= ct - 1;
246 }
247 }
248 },
249 [&]() -> void {
250 if (my_count < 2) {
251 std::fill_n(my_variance, my_num, std::numeric_limits<Output_>::quiet_NaN());
252 if (my_count == 0) {
253 std::fill_n(my_mean, my_num, std::numeric_limits<Output_>::quiet_NaN());
254 }
255 } else {
256 for (Index_ i = 0; i < my_num; ++i) {
257 my_variance[i] /= my_count - 1;
258 }
259 }
260 }
261 );
262 }
263
264private:
265 Index_ my_num;
266 Output_* my_mean;
267 Output_* my_variance;
268 bool my_skip_nan;
269 Index_ my_count = 0;
270 typename std::conditional<std::numeric_limits<Value_>::has_quiet_NaN, std::vector<Index_>, internal::MockVector<Index_> >::type my_ok_count;
271};
272
283template<typename Output_, typename Value_, typename Index_>
285public:
297 RunningSparse(Index_ num, Output_* mean, Output_* variance, bool skip_nan, Index_ subtract = 0) :
298 my_num(num), my_mean(mean), my_variance(variance), my_nonzero(num), my_skip_nan(skip_nan), my_subtract(subtract), my_nan(skip_nan ? num : 0) {}
299
306 void add(const Value_* value, const Index_* index, Index_ number) {
307 ++my_count;
308
309 ::tatami_stats::internal::nanable_ifelse<Value_>(
310 my_skip_nan,
311 [&]() -> void {
312 for (Index_ i = 0; i < number; ++i) {
313 auto val = value[i];
314 auto ri = index[i] - my_subtract;
315 if (std::isnan(val)) {
316 ++my_nan[ri];
317 } else {
318 internal::add_welford(my_mean[ri], my_variance[ri], val, ++(my_nonzero[ri]));
319 }
320 }
321 },
322 [&]() -> void {
323 for (Index_ i = 0; i < number; ++i) {
324 auto ri = index[i] - my_subtract;
325 internal::add_welford(my_mean[ri], my_variance[ri], value[i], ++(my_nonzero[ri]));
326 }
327 }
328 );
329 }
330
334 void finish() {
335 ::tatami_stats::internal::nanable_ifelse<Value_>(
336 my_skip_nan,
337 [&]() -> void {
338 for (Index_ i = 0; i < my_num; ++i) {
339 auto& curM = my_mean[i];
340 auto& curV = my_variance[i];
341 Index_ ct = my_count - my_nan[i];
342
343 if (ct < 2) {
344 curV = std::numeric_limits<Output_>::quiet_NaN();
345 if (ct == 0) {
346 curM = std::numeric_limits<Output_>::quiet_NaN();
347 }
348 } else {
349 internal::add_welford_zeros(curM, curV, my_nonzero[i], ct);
350 curV /= ct - 1;
351 }
352 }
353 },
354 [&]() -> void {
355 if (my_count < 2) {
356 std::fill_n(my_variance, my_num, std::numeric_limits<Output_>::quiet_NaN());
357 if (my_count == 0) {
358 std::fill_n(my_mean, my_num, std::numeric_limits<Output_>::quiet_NaN());
359 }
360 } else {
361 for (Index_ i = 0; i < my_num; ++i) {
362 auto& var = my_variance[i];
363 internal::add_welford_zeros(my_mean[i], var, my_nonzero[i], my_count);
364 var /= my_count - 1;
365 }
366 }
367 }
368 );
369 }
370
371private:
372 Index_ my_num;
373 Output_* my_mean;
374 Output_* my_variance;
375 std::vector<Index_> my_nonzero;
376 bool my_skip_nan;
377 Index_ my_subtract;
378 Index_ my_count = 0;
379 typename std::conditional<std::numeric_limits<Value_>::has_quiet_NaN, std::vector<Index_>, internal::MockVector<Index_> >::type my_nan;
380};
381
398template<typename Value_, typename Index_, typename Output_>
399void apply(bool row, const tatami::Matrix<Value_, Index_>& mat, Output_* output, const Options& vopt) {
400 auto dim = (row ? mat.nrow() : mat.ncol());
401 auto otherdim = (row ? mat.ncol() : mat.nrow());
402 const bool direct = mat.prefer_rows() == row;
403
404 if (mat.sparse()) {
405 if (direct) {
406 tatami::Options opt;
407 opt.sparse_extract_index = false;
408 tatami::parallelize([&](int, Index_ s, Index_ l) -> void {
409 auto ext = tatami::consecutive_extractor<true>(mat, row, s, l);
410 std::vector<Value_> vbuffer(otherdim);
411 for (Index_ x = 0; x < l; ++x) {
412 auto out = ext->fetch(vbuffer.data(), NULL);
413 output[x + s] = variances::direct<Output_>(out.value, out.number, otherdim, vopt.skip_nan).second;
414 }
415 }, dim, vopt.num_threads);
416
417 } else {
418 tatami::parallelize([&](int thread, Index_ s, Index_ l) -> void {
419 auto ext = tatami::consecutive_extractor<true>(mat, !row, static_cast<Index_>(0), otherdim, s, l);
420 std::vector<Value_> vbuffer(l);
421 std::vector<Index_> ibuffer(l);
422
423 std::vector<Output_> running_means(l);
424 LocalOutputBuffer<Output_> local_output(thread, s, l, output);
425 variances::RunningSparse<Output_, Value_, Index_> runner(l, running_means.data(), local_output.data(), vopt.skip_nan, s);
426
427 for (Index_ x = 0; x < otherdim; ++x) {
428 auto out = ext->fetch(vbuffer.data(), ibuffer.data());
429 runner.add(out.value, out.index, out.number);
430 }
431 runner.finish();
432
433 local_output.transfer();
434 }, dim, vopt.num_threads);
435 }
436
437 } else {
438 if (direct) {
439 tatami::parallelize([&](int, Index_ s, Index_ l) -> void {
440 auto ext = tatami::consecutive_extractor<false>(mat, row, s, l);
441 std::vector<Value_> buffer(otherdim);
442 for (Index_ x = 0; x < l; ++x) {
443 auto out = ext->fetch(buffer.data());
444 output[x + s] = variances::direct<Output_>(out, otherdim, vopt.skip_nan).second;
445 }
446 }, dim, vopt.num_threads);
447
448 } else {
449 tatami::parallelize([&](int thread, Index_ s, Index_ l) -> void {
450 auto ext = tatami::consecutive_extractor<false>(mat, !row, static_cast<Index_>(0), otherdim, s, l);
451 std::vector<Value_> buffer(l);
452
453 std::vector<Output_> running_means(l);
454 LocalOutputBuffer<Output_> local_output(thread, s, l, output);
455 variances::RunningDense<Output_, Value_, Index_> runner(l, running_means.data(), local_output.data(), vopt.skip_nan);
456
457 for (Index_ x = 0; x < otherdim; ++x) {
458 runner.add(ext->fetch(buffer.data()));
459 }
460 runner.finish();
461
462 local_output.transfer();
463 }, dim, vopt.num_threads);
464 }
465 }
466}
467
// Back-compatibility.
template<typename Value_, typename Index_, typename Output_>
void apply(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output, const Options& vopt) {
    // Deprecated pointer-based overload; forwards to the reference version.
    apply(row, *p, output, vopt);
}
492template<typename Output_ = double, typename Value_, typename Index_>
493std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>& mat, const Options& vopt) {
494 std::vector<Output_> output(mat.ncol());
495 apply(false, mat, output.data(), vopt);
496 return output;
497}
498
// Back-compatibility.
template<typename Output_ = double, typename Value_, typename Index_>
std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p, const Options& vopt) {
    // Deprecated pointer-based overload; forwards to the reference version.
    return by_column<Output_>(*p, vopt);
}
507
508template<typename Output_ = double, typename Value_, typename Index_>
509std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>& mat) {
510 return by_column<Output_>(mat, {});
511}
512
// Back-compatibility: pointer-based overload with default options.
template<typename Output_ = double, typename Value_, typename Index_>
std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p) {
    return by_column<Output_>(*p);
}
533template<typename Output_ = double, typename Value_, typename Index_>
534std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>& mat, const Options& vopt) {
535 std::vector<Output_> output(mat.nrow());
536 apply(true, mat, output.data(), vopt);
537 return output;
538}
539
// Back-compatibility.
template<typename Output_ = double, typename Value_, typename Index_>
std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p, const Options& vopt) {
    // Deprecated pointer-based overload; forwards to the reference version.
    return by_row<Output_>(*p, vopt);
}
548
549template<typename Output_ = double, typename Value_, typename Index_>
550std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>& mat) {
551 return by_row<Output_>(mat, {});
552}
553
// Back-compatibility: pointer-based overload with default options.
template<typename Output_ = double, typename Value_, typename Index_>
std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p) {
    return by_row<Output_>(*p);
}
562}
563
564}
565
566#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
Local output buffer for running calculations.
Definition utils.hpp:79
void transfer()
Definition utils.hpp:136
Output_ * data()
Definition utils.hpp:119
Running variances from dense data.
Definition variances.hpp:193
void add(const Value_ *ptr)
Definition variances.hpp:210
void finish()
Definition variances.hpp:233
RunningDense(Index_ num, Output_ *mean, Output_ *variance, bool skip_nan)
Definition variances.hpp:203
Running variances from sparse data.
Definition variances.hpp:284
void finish()
Definition variances.hpp:334
RunningSparse(Index_ num, Output_ *mean, Output_ *variance, bool skip_nan, Index_ subtract=0)
Definition variances.hpp:297
void add(const Value_ *value, const Index_ *index, Index_ number)
Definition variances.hpp:306
std::vector< Output_ > by_column(const tatami::Matrix< Value_, Index_ > &mat, const Options &vopt)
Definition variances.hpp:493
std::pair< Output_, Output_ > direct(const Value_ *value, Index_ num_nonzero, Index_ num_all, bool skip_nan)
Definition variances.hpp:96
void apply(bool row, const tatami::Matrix< Value_, Index_ > &mat, Output_ *output, const Options &vopt)
Definition variances.hpp:399
std::vector< Output_ > by_row(const tatami::Matrix< Value_, Index_ > &mat, const Options &vopt)
Definition variances.hpp:534
Functions to compute statistics from a tatami::Matrix.
Definition counts.hpp:18
void parallelize(Function_ fun, Index_ tasks, int threads)
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, bool row, Index_ iter_start, Index_ iter_length, Args_ &&... args)
bool sparse_extract_index
Variance calculation options.
Definition variances.hpp:31
int num_threads
Definition variances.hpp:42
bool skip_nan
Definition variances.hpp:36
Utilities for computing matrix statistics.