tatami_stats
Matrix statistics for tatami
Loading...
Searching...
No Matches
variances.hpp
Go to the documentation of this file.
1#ifndef TATAMI_STATS_VARS_HPP
2#define TATAMI_STATS_VARS_HPP
3
4#include "utils.hpp"
5
6#include <vector>
7#include <cmath>
8#include <numeric>
9#include <limits>
10#include <algorithm>
11#include <cstddef>
12
13#include "tatami/tatami.hpp"
14#include "sanisizer/sanisizer.hpp"
15
22namespace tatami_stats {
23
28namespace variances {
29
/**
 * @brief Variance calculation options.
 */
struct Options {
    /**
     * Whether to ignore NaN values when computing variances.
     * If true, NaNs are excluded from the observation count of each
     * row/column; otherwise they propagate into the result.
     */
    bool skip_nan = false;

    /**
     * Number of threads to use for the calculation.
     * Presumably interpreted by tatami::parallelize — confirm against the
     * tatami parallelization docs.
     */
    int num_threads = 1;
};
46
50namespace internal {
51
// Fold a single new observation into a running mean and running sum of
// squared differences, using Welford's online update. `count` is the number
// of observations including `value`. The arithmetic order is deliberately
// identical to the classic formulation to keep floating-point results stable.
template<typename Output_ = double, typename Value_, typename Index_ >
void add_welford(Output_& mean, Output_& sumsq, Value_ value, Index_ count) {
    Output_ shift = value - mean;
    mean += shift / count;
    sumsq += shift * (value - mean);
}
58
// Fold (num_all - num_nonzero) zero observations into running statistics that
// were accumulated from the non-zero values only. Scaling the mean by the
// non-zero fraction and adding mean^2 per zero is algebraically equivalent to
// running the Welford update once per structural zero.
template<typename Output_ = double, typename Index_ >
void add_welford_zeros(Output_& mean, Output_& sumsq, Index_ num_nonzero, Index_ num_all) {
    auto frac = static_cast<Output_>(num_nonzero) / static_cast<Output_>(num_all);
    sumsq += mean * mean * frac * (num_all - num_nonzero);
    mean *= frac;
}
65
// Stand-in for std::vector<Index_> when NaN tracking is structurally
// impossible (Value_ has no quiet NaN). Avoids problems from interactions
// between constexpr/lambda/std::conditional: both branches of the
// std::conditional member must compile, so this mock offers the same
// surface (ctor from a size, operator[], size()) while storing nothing but
// a single dummy slot that every index maps onto.
template<typename Index_>
struct MockVector {
    MockVector(std::size_t) {}

    // All element accesses alias the one dummy value.
    Index_& operator[](std::size_t) {
        return out;
    }

    std::size_t size() {
        return 0;
    }

    Index_ out = 0;
};
74
75}
98template<typename Output_ = double, typename Value_, typename Index_ >
99std::pair<Output_, Output_> direct(const Value_* value, Index_ num_nonzero, Index_ num_all, bool skip_nan) {
100 Output_ mean = 0;
101 Index_ lost = 0;
102
103 ::tatami_stats::internal::nanable_ifelse<Value_>(
104 skip_nan,
105 [&]() -> void {
106 auto copy = value;
107 for (Index_ i = 0; i < num_nonzero; ++i, ++copy) {
108 auto val = *copy;
109 if (std::isnan(val)) {
110 ++lost;
111 } else {
112 mean += val;
113 }
114 }
115 },
116 [&]() -> void {
117 auto copy = value;
118 for (Index_ i = 0; i < num_nonzero; ++i, ++copy) {
119 mean += *copy;
120 }
121 }
122 );
123
124 auto count = num_all - lost;
125 mean /= count;
126
127 Output_ var = 0;
128 ::tatami_stats::internal::nanable_ifelse<Value_>(
129 skip_nan,
130 [&]() -> void {
131 for (Index_ i = 0; i < num_nonzero; ++i) {
132 auto val = value[i];
133 if (!std::isnan(val)) {
134 auto delta = static_cast<Output_>(val) - mean;
135 var += delta * delta;
136 }
137 }
138 },
139 [&]() -> void {
140 for (Index_ i = 0; i < num_nonzero; ++i) {
141 auto delta = static_cast<Output_>(value[i]) - mean;
142 var += delta * delta;
143 }
144 }
145 );
146
147 if (num_nonzero < num_all) {
148 var += static_cast<Output_>(num_all - num_nonzero) * mean * mean;
149 }
150
151 if (count == 0) {
152 return std::make_pair(std::numeric_limits<Output_>::quiet_NaN(), std::numeric_limits<Output_>::quiet_NaN());
153 } else if (count == 1) {
154 return std::make_pair(mean, std::numeric_limits<Output_>::quiet_NaN());
155 } else {
156 return std::make_pair(mean, var / (count - 1));
157 }
158}
159
// Convenience overload for fully dense input: all `num` values are stored,
// so the stored-value count and the total length coincide.
176template<typename Output_ = double, typename Value_, typename Index_ >
177std::pair<Output_, Output_> direct(const Value_* ptr, Index_ num, bool skip_nan) {
178 return direct<Output_>(ptr, num, num, skip_nan);
179}
180
// Running (Welford) mean/variance calculator for dense data: call add() once
// per row/column vector, then finish() to convert the accumulated sums of
// squares into sample variances.
195template<typename Output_, typename Value_, typename Index_>
// NOTE(review): the class declaration itself (`class RunningDense {`, doc
// line 196 per the member index at the bottom of this page) was lost in
// extraction; the remaining lines are preserved as-is.
197public:
    // num: number of parallel statistics (length of each added vector).
    // mean/variance: caller-owned arrays of length `num`; `variance` holds
    // raw sums of squared deltas until finish() divides by (count - 1).
    // skip_nan: if true, NaN inputs are excluded via per-element counts.
206 RunningDense(Index_ num, Output_* mean, Output_* variance, bool skip_nan) :
207 my_num(num),
208 my_mean(mean),
209 my_variance(variance),
210 my_skip_nan(skip_nan),
    // Only allocate per-element counts when NaNs can occur and are skipped;
    // otherwise the conditional member is a zero-storage MockVector.
211 my_ok_count(skip_nan ? tatami::can_cast_Index_to_container_size<decltype(my_ok_count)>(num) : static_cast<Index_>(0))
212 {}
213
    // Fold one dense vector of length my_num into the running statistics.
218 void add(const Value_* ptr) {
219 // my_count is the upper bound of all my_ok_count, so we check it regardless of my_skip_nan.
220 my_count = sanisizer::sum<Index_>(my_count, 1);
221
222 ::tatami_stats::internal::nanable_ifelse<Value_>(
223 my_skip_nan,
224 [&]() -> void {
225 for (Index_ i = 0; i < my_num; ++i, ++ptr) {
226 auto val = *ptr;
227 if (!std::isnan(val)) {
    // Per-element observation count drives the Welford update, so the
    // pre-increment must happen before add_welford sees the count.
228 internal::add_welford(my_mean[i], my_variance[i], val, ++(my_ok_count[i]));
229 }
230 }
231 },
232 [&]() -> void {
233 for (Index_ i = 0; i < my_num; ++i, ++ptr) {
234 internal::add_welford(my_mean[i], my_variance[i], *ptr, my_count);
235 }
236 }
237 );
238 }
239
    // Finalize: divide accumulated sums of squares by (count - 1). Elements
    // with fewer than two observations get NaN variance; with zero
    // observations, the mean is also set to NaN.
243 void finish() {
244 ::tatami_stats::internal::nanable_ifelse<Value_>(
245 my_skip_nan,
246 [&]() -> void {
247 for (Index_ i = 0; i < my_num; ++i) {
248 auto ct = my_ok_count[i];
249 if (ct < 2) {
250 my_variance[i] = std::numeric_limits<Output_>::quiet_NaN();
251 if (ct == 0) {
252 my_mean[i] = std::numeric_limits<Output_>::quiet_NaN();
253 }
254 } else {
255 my_variance[i] /= ct - 1;
256 }
257 }
258 },
259 [&]() -> void {
    // Without NaN tracking, all elements share the same observation count.
260 if (my_count < 2) {
261 std::fill_n(my_variance, my_num, std::numeric_limits<Output_>::quiet_NaN());
262 if (my_count == 0) {
263 std::fill_n(my_mean, my_num, std::numeric_limits<Output_>::quiet_NaN());
264 }
265 } else {
266 for (Index_ i = 0; i < my_num; ++i) {
267 my_variance[i] /= my_count - 1;
268 }
269 }
270 }
271 );
272 }
273
274private:
275 Index_ my_num;
276 Output_* my_mean;
277 Output_* my_variance;
278 bool my_skip_nan;
279 Index_ my_count = 0;
    // std::vector<Index_> when Value_ can hold a quiet NaN, otherwise the
    // zero-storage MockVector defined above.
280 typename std::conditional<std::numeric_limits<Value_>::has_quiet_NaN, std::vector<Index_>, internal::MockVector<Index_> >::type my_ok_count;
281};
282
// Running (Welford) mean/variance calculator for sparse data: add() consumes
// one sparse row/column (values + indices) at a time; finish() folds in the
// implicit zeros and divides to produce sample variances.
293template<typename Output_, typename Value_, typename Index_>
// NOTE(review): the class declaration itself (`class RunningSparse {`, doc
// line 294 per the member index at the bottom of this page) was lost in
// extraction; the remaining lines are preserved as-is.
295public:
    // num: number of parallel statistics. mean/variance: caller-owned output
    // arrays of length `num`. skip_nan: exclude NaNs via per-element NaN
    // counts. subtract: offset removed from each incoming index before use,
    // for processing a contiguous sub-range starting at `subtract`.
307 RunningSparse(Index_ num, Output_* mean, Output_* variance, bool skip_nan, Index_ subtract = 0) :
308 my_num(num),
309 my_mean(mean),
310 my_variance(variance),
311 my_nonzero(tatami::can_cast_Index_to_container_size<decltype(my_nonzero)>(num)),
312 my_skip_nan(skip_nan),
313 my_subtract(subtract),
    // NaN counts are only allocated when skip_nan is true; otherwise the
    // conditional member is a zero-storage MockVector.
314 my_nan(skip_nan ? tatami::can_cast_Index_to_container_size<decltype(my_nan)>(num) : static_cast<Index_>(0))
315 {}
316
    // Fold one sparse vector (`number` stored values at `index` positions)
    // into the running statistics for the non-zero entries only.
323 void add(const Value_* value, const Index_* index, Index_ number) {
324 // my_count is the upper bound of all my_nonzero, so no need to check individual increments.
325 my_count = sanisizer::sum<Index_>(my_count, 1);
326
327 ::tatami_stats::internal::nanable_ifelse<Value_>(
328 my_skip_nan,
329 [&]() -> void {
330 for (Index_ i = 0; i < number; ++i) {
331 auto val = value[i];
332 auto ri = index[i] - my_subtract;
333 if (std::isnan(val)) {
    // Track NaNs per element so finish() can shrink the effective count.
334 ++my_nan[ri];
335 } else {
336 internal::add_welford(my_mean[ri], my_variance[ri], val, ++(my_nonzero[ri]));
337 }
338 }
339 },
340 [&]() -> void {
341 for (Index_ i = 0; i < number; ++i) {
342 auto ri = index[i] - my_subtract;
343 internal::add_welford(my_mean[ri], my_variance[ri], value[i], ++(my_nonzero[ri]));
344 }
345 }
346 );
347 }
348
    // Finalize: fold the implicit zeros into each element's statistics via
    // add_welford_zeros, then divide by (count - 1). Elements with fewer than
    // two usable observations get NaN variance (and NaN mean at zero).
352 void finish() {
353 ::tatami_stats::internal::nanable_ifelse<Value_>(
354 my_skip_nan,
355 [&]() -> void {
356 for (Index_ i = 0; i < my_num; ++i) {
357 auto& curM = my_mean[i];
358 auto& curV = my_variance[i];
    // Effective observation count excludes NaNs seen for this element.
359 Index_ ct = my_count - my_nan[i];
360
361 if (ct < 2) {
362 curV = std::numeric_limits<Output_>::quiet_NaN();
363 if (ct == 0) {
364 curM = std::numeric_limits<Output_>::quiet_NaN();
365 }
366 } else {
367 internal::add_welford_zeros(curM, curV, my_nonzero[i], ct);
368 curV /= ct - 1;
369 }
370 }
371 },
372 [&]() -> void {
373 if (my_count < 2) {
374 std::fill_n(my_variance, my_num, std::numeric_limits<Output_>::quiet_NaN());
375 if (my_count == 0) {
376 std::fill_n(my_mean, my_num, std::numeric_limits<Output_>::quiet_NaN());
377 }
378 } else {
379 for (Index_ i = 0; i < my_num; ++i) {
380 auto& var = my_variance[i];
381 internal::add_welford_zeros(my_mean[i], var, my_nonzero[i], my_count);
382 var /= my_count - 1;
383 }
384 }
385 }
386 );
387 }
388
389private:
390 Index_ my_num;
391 Output_* my_mean;
392 Output_* my_variance;
393 std::vector<Index_> my_nonzero;
394 bool my_skip_nan;
395 Index_ my_subtract;
396 Index_ my_count = 0;
    // std::vector<Index_> when Value_ can hold a quiet NaN, otherwise the
    // zero-storage MockVector defined above.
397 typename std::conditional<std::numeric_limits<Value_>::has_quiet_NaN, std::vector<Index_>, internal::MockVector<Index_> >::type my_nan;
398};
399
// Compute per-row (row == true) or per-column (row == false) variances of
// `mat`, writing one value per row/column into `output`. Chooses between a
// one-pass "direct" strategy (when iteration matches the matrix's preferred
// dimension) and the running calculators (when it does not), for both sparse
// and dense representations.
416template<typename Value_, typename Index_, typename Output_>
417void apply(bool row, const tatami::Matrix<Value_, Index_>& mat, Output_* output, const Options& vopt) {
418 auto dim = (row ? mat.nrow() : mat.ncol());
419 auto otherdim = (row ? mat.ncol() : mat.nrow());
    // Direct extraction is possible when the requested dimension matches the
    // matrix's preferred iteration order.
420 const bool direct = mat.prefer_rows() == row;
421
422 if (mat.sparse()) {
423 if (direct) {
424 tatami::Options opt;
    // Indices are not needed for the direct variance calculation.
425 opt.sparse_extract_index = false;
426 tatami::parallelize([&](int, Index_ s, Index_ l) -> void {
427 auto ext = tatami::consecutive_extractor<true>(mat, row, s, l);
    // NOTE(review): the declaration of `vbuffer` (doc line 428, presumably a
    // value buffer sized to otherdim) is missing from this extract.
429 for (Index_ x = 0; x < l; ++x) {
430 auto out = ext->fetch(vbuffer.data(), NULL);
431 output[x + s] = variances::direct<Output_>(out.value, out.number, otherdim, vopt.skip_nan).second;
432 }
433 }, dim, vopt.num_threads);
434
435 } else {
436 tatami::parallelize([&](int thread, Index_ s, Index_ l) -> void {
437 auto ext = tatami::consecutive_extractor<true>(mat, !row, static_cast<Index_>(0), otherdim, s, l);
440
    // NOTE(review): declarations of `vbuffer`, `ibuffer` and `running_means`
    // (doc lines 438-441) are missing from this extract.
442 LocalOutputBuffer<Output_> local_output(thread, s, l, output);
443 variances::RunningSparse<Output_, Value_, Index_> runner(l, running_means.data(), local_output.data(), vopt.skip_nan, s);
444
445 for (Index_ x = 0; x < otherdim; ++x) {
446 auto out = ext->fetch(vbuffer.data(), ibuffer.data());
447 runner.add(out.value, out.index, out.number);
448 }
449 runner.finish();
450
451 local_output.transfer();
452 }, dim, vopt.num_threads);
453 }
454
455 } else {
456 if (direct) {
457 tatami::parallelize([&](int, Index_ s, Index_ l) -> void {
458 auto ext = tatami::consecutive_extractor<false>(mat, row, s, l);
    // NOTE(review): the declaration of `buffer` (doc line 459) is missing
    // from this extract.
460 for (Index_ x = 0; x < l; ++x) {
461 auto out = ext->fetch(buffer.data());
462 output[x + s] = variances::direct<Output_>(out, otherdim, vopt.skip_nan).second;
463 }
464 }, dim, vopt.num_threads);
465
466 } else {
467 tatami::parallelize([&](int thread, Index_ s, Index_ l) -> void {
468 auto ext = tatami::consecutive_extractor<false>(mat, !row, static_cast<Index_>(0), otherdim, s, l);
470
    // NOTE(review): declarations of `buffer` and `running_means` (doc lines
    // 469-471) are missing from this extract.
472 LocalOutputBuffer<Output_> local_output(thread, s, l, output);
473 variances::RunningDense<Output_, Value_, Index_> runner(l, running_means.data(), local_output.data(), vopt.skip_nan);
474
475 for (Index_ x = 0; x < otherdim; ++x) {
476 runner.add(ext->fetch(buffer.data()));
477 }
478 runner.finish();
479
480 local_output.transfer();
481 }, dim, vopt.num_threads);
482 }
483 }
484}
485
489// Back-compatibility.
490template<typename Value_, typename Index_, typename Output_>
491void apply(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output, const Options& vopt) {
492 apply(row, *p, output, vopt);
493}
// Convenience wrapper: compute the variance of each column of `mat`,
// returning a vector with one entry per column.
510template<typename Output_ = double, typename Value_, typename Index_>
511std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>& mat, const Options& vopt) {
    // NOTE(review): the declaration of `output` (doc line 512, presumably a
    // std::vector<Output_> sized to mat.ncol()) is missing from this extract.
513 apply(false, mat, output.data(), vopt);
514 return output;
515}
516
520// Back-compatibility.
521template<typename Output_ = double, typename Value_, typename Index_>
522std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p, const Options& vopt) {
523 return by_column<Output_>(*p, vopt);
524}
525
526template<typename Output_ = double, typename Value_, typename Index_>
527std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>& mat) {
528 return by_column<Output_>(mat, {});
529}
530
531template<typename Output_ = double, typename Value_, typename Index_>
532std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p) {
533 return by_column<Output_>(*p);
534}
// Convenience wrapper: compute the variance of each row of `mat`, returning a
// vector with one entry per row.
551template<typename Output_ = double, typename Value_, typename Index_>
552std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>& mat, const Options& vopt) {
    // NOTE(review): the declaration of `output` (doc line 553, presumably a
    // std::vector<Output_> sized to mat.nrow()) is missing from this extract.
554 apply(true, mat, output.data(), vopt);
555 return output;
556}
557
561// Back-compatibility.
562template<typename Output_ = double, typename Value_, typename Index_>
563std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p, const Options& vopt) {
564 return by_row<Output_>(*p, vopt);
565}
566
567template<typename Output_ = double, typename Value_, typename Index_>
568std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>& mat) {
569 return by_row<Output_>(mat, {});
570}
571
572template<typename Output_ = double, typename Value_, typename Index_>
573std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p) {
574 return by_row<Output_>(*p);
575}
580}
581
582}
583
584#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
Local output buffer for running calculations.
Definition utils.hpp:82
Running variances from dense data.
Definition variances.hpp:196
void add(const Value_ *ptr)
Definition variances.hpp:218
void finish()
Definition variances.hpp:243
RunningDense(Index_ num, Output_ *mean, Output_ *variance, bool skip_nan)
Definition variances.hpp:206
Running variances from sparse data.
Definition variances.hpp:294
void finish()
Definition variances.hpp:352
RunningSparse(Index_ num, Output_ *mean, Output_ *variance, bool skip_nan, Index_ subtract=0)
Definition variances.hpp:307
void add(const Value_ *value, const Index_ *index, Index_ number)
Definition variances.hpp:323
std::vector< Output_ > by_column(const tatami::Matrix< Value_, Index_ > &mat, const Options &vopt)
Definition variances.hpp:511
std::pair< Output_, Output_ > direct(const Value_ *value, Index_ num_nonzero, Index_ num_all, bool skip_nan)
Definition variances.hpp:99
void apply(bool row, const tatami::Matrix< Value_, Index_ > &mat, Output_ *output, const Options &vopt)
Definition variances.hpp:417
std::vector< Output_ > by_row(const tatami::Matrix< Value_, Index_ > &mat, const Options &vopt)
Definition variances.hpp:552
Functions to compute statistics from a tatami::Matrix.
Definition counts.hpp:18
void parallelize(Function_ fun, Index_ tasks, int threads)
Container_ create_container_of_Index_size(Index_ x, Args_ &&... args)
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, bool row, Index_ iter_start, Index_ iter_length, Args_ &&... args)
bool sparse_extract_index
Variance calculation options.
Definition variances.hpp:33
int num_threads
Definition variances.hpp:44
bool skip_nan
Definition variances.hpp:38
Utilities for computing matrix statistics.