tatami_stats
Matrix statistics for tatami
Loading...
Searching...
No Matches
variances.hpp
Go to the documentation of this file.
1#ifndef TATAMI_STATS_VARS_HPP
2#define TATAMI_STATS_VARS_HPP
3
4#include "tatami/tatami.hpp"
5#include "utils.hpp"
6
7#include <vector>
8#include <cmath>
9#include <numeric>
10#include <limits>
11#include <algorithm>
12
19namespace tatami_stats {
20
25namespace variances {
26
30struct Options {
35 bool skip_nan = false;
36
41 int num_threads = 1;
42};
43
47namespace internal {
48
49template<typename Output_ = double, typename Value_, typename Index_ >
50void add_welford(Output_& mean, Output_& sumsq, Value_ value, Index_ count) {
51 Output_ delta = value - mean;
52 mean += delta / count;
53 sumsq += delta * (value - mean);
54}
55
56template<typename Output_ = double, typename Index_ >
57void add_welford_zeros(Output_& mean, Output_& sumsq, Index_ num_nonzero, Index_ num_all) {
58 auto ratio = static_cast<Output_>(num_nonzero) / static_cast<Output_>(num_all);
59 sumsq += mean * mean * ratio * (num_all - num_nonzero);
60 mean *= ratio;
61}
62
63// Avoid problems from interactions between constexpr/lambda/std::conditional.
64template<typename Index_>
65struct MockVector {
66 MockVector(size_t) {}
67 Index_& operator[](size_t) { return out; }
68 Index_ out = 0;
69};
70
71}
94template<typename Output_ = double, typename Value_, typename Index_ >
95std::pair<Output_, Output_> direct(const Value_* value, Index_ num_nonzero, Index_ num_all, bool skip_nan) {
96 Output_ mean = 0;
97 Index_ lost = 0;
98
99 ::tatami_stats::internal::nanable_ifelse<Value_>(
100 skip_nan,
101 [&]() {
102 auto copy = value;
103 for (Index_ i = 0; i < num_nonzero; ++i, ++copy) {
104 auto val = *copy;
105 if (std::isnan(val)) {
106 ++lost;
107 } else {
108 mean += val;
109 }
110 }
111 },
112 [&]() {
113 auto copy = value;
114 for (Index_ i = 0; i < num_nonzero; ++i, ++copy) {
115 mean += *copy;
116 }
117 }
118 );
119
120 auto count = num_all - lost;
121 mean /= count;
122
123 Output_ var = 0;
124 ::tatami_stats::internal::nanable_ifelse<Value_>(
125 skip_nan,
126 [&]() {
127 for (Index_ i = 0; i < num_nonzero; ++i) {
128 auto val = value[i];
129 if (!std::isnan(val)) {
130 auto delta = static_cast<Output_>(val) - mean;
131 var += delta * delta;
132 }
133 }
134 },
135 [&]() {
136 for (Index_ i = 0; i < num_nonzero; ++i) {
137 auto delta = static_cast<Output_>(value[i]) - mean;
138 var += delta * delta;
139 }
140 }
141 );
142
143 if (num_nonzero < num_all) {
144 var += static_cast<Output_>(num_all - num_nonzero) * mean * mean;
145 }
146
147 if (count == 0) {
148 return std::make_pair(std::numeric_limits<Output_>::quiet_NaN(), std::numeric_limits<Output_>::quiet_NaN());
149 } else if (count == 1) {
150 return std::make_pair(mean, std::numeric_limits<Output_>::quiet_NaN());
151 } else {
152 return std::make_pair(mean, var / (count - 1));
153 }
154}
155
172template<typename Output_ = double, typename Value_, typename Index_ >
173std::pair<Output_, Output_> direct(const Value_* ptr, Index_ num, bool skip_nan) {
174 return direct<Output_>(ptr, num, num, skip_nan);
175}
176
191template<typename Output_, typename Value_, typename Index_>
193public:
202 RunningDense(Index_ num, Output_* mean, Output_* variance, bool skip_nan) :
203 my_num(num), my_mean(mean), my_variance(variance), my_skip_nan(skip_nan), my_ok_count(skip_nan ? num : 0) {}
204
209 void add(const Value_* ptr) {
210 ::tatami_stats::internal::nanable_ifelse<Value_>(
211 my_skip_nan,
212 [&]() {
213 for (Index_ i = 0; i < my_num; ++i, ++ptr) {
214 auto val = *ptr;
215 if (!std::isnan(val)) {
216 internal::add_welford(my_mean[i], my_variance[i], val, ++(my_ok_count[i]));
217 }
218 }
219 },
220 [&]() {
221 ++my_count;
222 for (Index_ i = 0; i < my_num; ++i, ++ptr) {
223 internal::add_welford(my_mean[i], my_variance[i], *ptr, my_count);
224 }
225 }
226 );
227 }
228
232 void finish() {
233 ::tatami_stats::internal::nanable_ifelse<Value_>(
234 my_skip_nan,
235 [&]() {
236 for (Index_ i = 0; i < my_num; ++i) {
237 auto ct = my_ok_count[i];
238 if (ct < 2) {
239 my_variance[i] = std::numeric_limits<Output_>::quiet_NaN();
240 if (ct == 0) {
241 my_mean[i] = std::numeric_limits<Output_>::quiet_NaN();
242 }
243 } else {
244 my_variance[i] /= ct - 1;
245 }
246 }
247 },
248 [&]() {
249 if (my_count < 2) {
250 std::fill_n(my_variance, my_num, std::numeric_limits<Output_>::quiet_NaN());
251 if (my_count == 0) {
252 std::fill_n(my_mean, my_num, std::numeric_limits<Output_>::quiet_NaN());
253 }
254 } else {
255 for (Index_ i = 0; i < my_num; ++i) {
256 my_variance[i] /= my_count - 1;
257 }
258 }
259 }
260 );
261 }
262
263private:
264 Index_ my_num;
265 Output_* my_mean;
266 Output_* my_variance;
267 bool my_skip_nan;
268 Index_ my_count = 0;
269 typename std::conditional<std::numeric_limits<Value_>::has_quiet_NaN, std::vector<Index_>, internal::MockVector<Index_> >::type my_ok_count;
270};
271
282template<typename Output_, typename Value_, typename Index_>
284public:
296 RunningSparse(Index_ num, Output_* mean, Output_* variance, bool skip_nan, Index_ subtract = 0) :
297 my_num(num), my_mean(mean), my_variance(variance), my_nonzero(num), my_skip_nan(skip_nan), my_subtract(subtract), my_nan(skip_nan ? num : 0) {}
298
305 void add(const Value_* value, const Index_* index, Index_ number) {
306 ++my_count;
307
308 ::tatami_stats::internal::nanable_ifelse<Value_>(
309 my_skip_nan,
310 [&]() {
311 for (Index_ i = 0; i < number; ++i) {
312 auto val = value[i];
313 auto ri = index[i] - my_subtract;
314 if (std::isnan(val)) {
315 ++my_nan[ri];
316 } else {
317 internal::add_welford(my_mean[ri], my_variance[ri], val, ++(my_nonzero[ri]));
318 }
319 }
320 },
321 [&]() {
322 for (Index_ i = 0; i < number; ++i) {
323 auto ri = index[i] - my_subtract;
324 internal::add_welford(my_mean[ri], my_variance[ri], value[i], ++(my_nonzero[ri]));
325 }
326 }
327 );
328 }
329
333 void finish() {
334 ::tatami_stats::internal::nanable_ifelse<Value_>(
335 my_skip_nan,
336 [&]() {
337 for (Index_ i = 0; i < my_num; ++i) {
338 auto& curM = my_mean[i];
339 auto& curV = my_variance[i];
340 Index_ ct = my_count - my_nan[i];
341
342 if (ct < 2) {
343 curV = std::numeric_limits<Output_>::quiet_NaN();
344 if (ct == 0) {
345 curM = std::numeric_limits<Output_>::quiet_NaN();
346 }
347 } else {
348 internal::add_welford_zeros(curM, curV, my_nonzero[i], ct);
349 curV /= ct - 1;
350 }
351 }
352 },
353 [&]() {
354 if (my_count < 2) {
355 std::fill_n(my_variance, my_num, std::numeric_limits<Output_>::quiet_NaN());
356 if (my_count == 0) {
357 std::fill_n(my_mean, my_num, std::numeric_limits<Output_>::quiet_NaN());
358 }
359 } else {
360 for (Index_ i = 0; i < my_num; ++i) {
361 auto& var = my_variance[i];
362 internal::add_welford_zeros(my_mean[i], var, my_nonzero[i], my_count);
363 var /= my_count - 1;
364 }
365 }
366 }
367 );
368 }
369
370private:
371 Index_ my_num;
372 Output_* my_mean;
373 Output_* my_variance;
374 std::vector<Index_> my_nonzero;
375 bool my_skip_nan;
376 Index_ my_subtract;
377 Index_ my_count = 0;
378 typename std::conditional<std::numeric_limits<Value_>::has_quiet_NaN, std::vector<Index_>, internal::MockVector<Index_> >::type my_nan;
379};
380
397template<typename Value_, typename Index_, typename Output_>
398void apply(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output, const Options& vopt) {
399 auto dim = (row ? p->nrow() : p->ncol());
400 auto otherdim = (row ? p->ncol() : p->nrow());
401 const bool direct = p->prefer_rows() == row;
402
403 if (p->sparse()) {
404 if (direct) {
405 tatami::Options opt;
406 opt.sparse_extract_index = false;
407 tatami::parallelize([&](int, Index_ s, Index_ l) {
408 auto ext = tatami::consecutive_extractor<true>(p, row, s, l);
409 std::vector<Value_> vbuffer(otherdim);
410 for (Index_ x = 0; x < l; ++x) {
411 auto out = ext->fetch(vbuffer.data(), NULL);
412 output[x + s] = variances::direct<Output_>(out.value, out.number, otherdim, vopt.skip_nan).second;
413 }
414 }, dim, vopt.num_threads);
415
416 } else {
417 tatami::parallelize([&](int thread, Index_ s, Index_ l) {
418 auto ext = tatami::consecutive_extractor<true>(p, !row, static_cast<Index_>(0), otherdim, s, l);
419 std::vector<Value_> vbuffer(l);
420 std::vector<Index_> ibuffer(l);
421
422 std::vector<Output_> running_means(l);
423 LocalOutputBuffer<Output_> local_output(thread, s, l, output);
424 variances::RunningSparse<Output_, Value_, Index_> runner(l, running_means.data(), local_output.data(), vopt.skip_nan, s);
425
426 for (Index_ x = 0; x < otherdim; ++x) {
427 auto out = ext->fetch(vbuffer.data(), ibuffer.data());
428 runner.add(out.value, out.index, out.number);
429 }
430 runner.finish();
431
432 local_output.transfer();
433 }, dim, vopt.num_threads);
434 }
435
436 } else {
437 if (direct) {
438 tatami::parallelize([&](int, Index_ s, Index_ l) {
439 auto ext = tatami::consecutive_extractor<false>(p, row, s, l);
440 std::vector<Value_> buffer(otherdim);
441 for (Index_ x = 0; x < l; ++x) {
442 auto out = ext->fetch(buffer.data());
443 output[x + s] = variances::direct<Output_>(out, otherdim, vopt.skip_nan).second;
444 }
445 }, dim, vopt.num_threads);
446
447 } else {
448 tatami::parallelize([&](int thread, Index_ s, Index_ l) {
449 auto ext = tatami::consecutive_extractor<false>(p, !row, static_cast<Index_>(0), otherdim, s, l);
450 std::vector<Value_> buffer(l);
451
452 std::vector<Output_> running_means(l);
453 LocalOutputBuffer<Output_> local_output(thread, s, l, output);
454 variances::RunningDense<Output_, Value_, Index_> runner(l, running_means.data(), local_output.data(), vopt.skip_nan);
455
456 for (Index_ x = 0; x < otherdim; ++x) {
457 runner.add(ext->fetch(buffer.data()));
458 }
459 runner.finish();
460
461 local_output.transfer();
462 }, dim, vopt.num_threads);
463 }
464 }
465}
466
479template<typename Output_ = double, typename Value_, typename Index_>
480std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p, const Options& vopt) {
481 std::vector<Output_> output(p->ncol());
482 apply(false, p, output.data(), vopt);
483 return output;
484}
485
497template<typename Output_ = double, typename Value_, typename Index_>
498std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p) {
499 Options vopt;
500 return by_column(p, vopt);
501}
502
515template<typename Output_ = double, typename Value_, typename Index_>
516std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p, const Options& vopt) {
517 std::vector<Output_> output(p->nrow());
518 apply(true, p, output.data(), vopt);
519 return output;
520}
521
533template<typename Output_ = double, typename Value_, typename Index_>
534std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p) {
535 Options vopt;
536 return by_row(p, vopt);
537}
538
539}
540
541}
542
543#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
Local output buffer for running calculations.
Definition utils.hpp:72
void transfer()
Definition utils.hpp:126
Output_ * data()
Definition utils.hpp:111
Running variances from dense data.
Definition variances.hpp:192
void add(const Value_ *ptr)
Definition variances.hpp:209
void finish()
Definition variances.hpp:232
RunningDense(Index_ num, Output_ *mean, Output_ *variance, bool skip_nan)
Definition variances.hpp:202
Running variances from sparse data.
Definition variances.hpp:283
void finish()
Definition variances.hpp:333
RunningSparse(Index_ num, Output_ *mean, Output_ *variance, bool skip_nan, Index_ subtract=0)
Definition variances.hpp:296
void add(const Value_ *value, const Index_ *index, Index_ number)
Definition variances.hpp:305
std::vector< Output_ > by_row(const tatami::Matrix< Value_, Index_ > *p, const Options &vopt)
Definition variances.hpp:516
std::pair< Output_, Output_ > direct(const Value_ *value, Index_ num_nonzero, Index_ num_all, bool skip_nan)
Definition variances.hpp:95
void apply(bool row, const tatami::Matrix< Value_, Index_ > *p, Output_ *output, const Options &vopt)
Definition variances.hpp:398
std::vector< Output_ > by_column(const tatami::Matrix< Value_, Index_ > *p, const Options &vopt)
Definition variances.hpp:480
Functions to compute statistics from a tatami::Matrix.
Definition counts.hpp:18
void parallelize(Function_ fun, Index_ tasks, int threads)
auto consecutive_extractor(const Matrix< Value_, Index_ > *mat, bool row, Index_ iter_start, Index_ iter_length, Args_ &&... args)
bool sparse_extract_index
Variance calculation options.
Definition variances.hpp:30
int num_threads
Definition variances.hpp:41
bool skip_nan
Definition variances.hpp:35
Utilities for computing matrix statistics.