tatami_stats — Matrix statistics for tatami.
variances.hpp — source listing from the documentation.
1#ifndef TATAMI_STATS_VARS_HPP
2#define TATAMI_STATS_VARS_HPP
3
4#include "utils.hpp"
5
#include <vector>
#include <cmath>
#include <numeric>
#include <limits>
#include <algorithm>
#include <cstddef>
#include <type_traits>
12
13#include "tatami/tatami.hpp"
14#include "sanisizer/sanisizer.hpp"
15
22namespace tatami_stats {
23
28namespace variances {
29
33struct Options {
38 bool skip_nan = false;
39
44 int num_threads = 1;
45};
46
50namespace internal {
51
52template<typename Output_ = double, typename Value_, typename Index_ >
53void add_welford(Output_& mean, Output_& sumsq, Value_ value, Index_ count) {
54 Output_ delta = value - mean;
55 mean += delta / count;
56 sumsq += delta * (value - mean);
57}
58
59template<typename Output_ = double, typename Index_ >
60void add_welford_zeros(Output_& mean, Output_& sumsq, Index_ num_nonzero, Index_ num_all) {
61 auto ratio = static_cast<Output_>(num_nonzero) / static_cast<Output_>(num_all);
62 sumsq += mean * mean * ratio * (num_all - num_nonzero);
63 mean *= ratio;
64}
65
66// Avoid problems from interactions between constexpr/lambda/std::conditional.
67template<typename Index_>
68struct MockVector {
69 MockVector(std::size_t) {}
70 Index_& operator[](std::size_t) { return out; }
71 std::size_t size() { return 0; }
72 Index_ out = 0;
73};
74
75}
99template<typename Output_ = double, typename Value_, typename Index_ >
100std::pair<Output_, Output_> direct(const Value_* value, Index_ num_nonzero, Index_ num_all, bool skip_nan) {
101 Output_ mean = 0;
102 Index_ lost = 0;
103
104 ::tatami_stats::internal::nanable_ifelse<Value_>(
105 skip_nan,
106 [&]() -> void {
107 auto copy = value;
108 for (Index_ i = 0; i < num_nonzero; ++i, ++copy) {
109 auto val = *copy;
110 if (std::isnan(val)) {
111 ++lost;
112 } else {
113 mean += val;
114 }
115 }
116 },
117 [&]() -> void {
118 auto copy = value;
119 for (Index_ i = 0; i < num_nonzero; ++i, ++copy) {
120 mean += *copy;
121 }
122 }
123 );
124
125 auto count = num_all - lost;
126 mean /= count;
127
128 Output_ var = 0;
129 ::tatami_stats::internal::nanable_ifelse<Value_>(
130 skip_nan,
131 [&]() -> void {
132 for (Index_ i = 0; i < num_nonzero; ++i) {
133 auto val = value[i];
134 if (!std::isnan(val)) {
135 auto delta = static_cast<Output_>(val) - mean;
136 var += delta * delta;
137 }
138 }
139 },
140 [&]() -> void {
141 for (Index_ i = 0; i < num_nonzero; ++i) {
142 auto delta = static_cast<Output_>(value[i]) - mean;
143 var += delta * delta;
144 }
145 }
146 );
147
148 if (num_nonzero < num_all) {
149 var += static_cast<Output_>(num_all - num_nonzero) * mean * mean;
150 }
151
152 if (count == 0) {
153 return std::make_pair(std::numeric_limits<Output_>::quiet_NaN(), std::numeric_limits<Output_>::quiet_NaN());
154 } else if (count == 1) {
155 return std::make_pair(mean, std::numeric_limits<Output_>::quiet_NaN());
156 } else {
157 return std::make_pair(mean, var / (count - 1));
158 }
159}
160
178template<typename Output_ = double, typename Value_, typename Index_ >
179std::pair<Output_, Output_> direct(const Value_* ptr, Index_ num, bool skip_nan) {
180 return direct<Output_>(ptr, num, num, skip_nan);
181}
182
198template<typename Output_, typename Value_, typename Index_>
200public:
209 RunningDense(Index_ num, Output_* mean, Output_* variance, bool skip_nan) :
210 my_num(num),
211 my_mean(mean),
212 my_variance(variance),
213 my_skip_nan(skip_nan),
214 my_ok_count(skip_nan ? tatami::can_cast_Index_to_container_size<I<decltype(my_ok_count)> >(num) : static_cast<Index_>(0))
215 {}
216
221 void add(const Value_* ptr) {
222 // my_count is the upper bound of all my_ok_count, so we check it regardless of my_skip_nan.
223 my_count = sanisizer::sum<Index_>(my_count, 1);
224
225 ::tatami_stats::internal::nanable_ifelse<Value_>(
226 my_skip_nan,
227 [&]() -> void {
228 for (Index_ i = 0; i < my_num; ++i, ++ptr) {
229 auto val = *ptr;
230 if (!std::isnan(val)) {
231 internal::add_welford(my_mean[i], my_variance[i], val, ++(my_ok_count[i]));
232 }
233 }
234 },
235 [&]() -> void {
236 for (Index_ i = 0; i < my_num; ++i, ++ptr) {
237 internal::add_welford(my_mean[i], my_variance[i], *ptr, my_count);
238 }
239 }
240 );
241 }
242
246 void finish() {
247 ::tatami_stats::internal::nanable_ifelse<Value_>(
248 my_skip_nan,
249 [&]() -> void {
250 for (Index_ i = 0; i < my_num; ++i) {
251 auto ct = my_ok_count[i];
252 if (ct < 2) {
253 my_variance[i] = std::numeric_limits<Output_>::quiet_NaN();
254 if (ct == 0) {
255 my_mean[i] = std::numeric_limits<Output_>::quiet_NaN();
256 }
257 } else {
258 my_variance[i] /= ct - 1;
259 }
260 }
261 },
262 [&]() -> void {
263 if (my_count < 2) {
264 std::fill_n(my_variance, my_num, std::numeric_limits<Output_>::quiet_NaN());
265 if (my_count == 0) {
266 std::fill_n(my_mean, my_num, std::numeric_limits<Output_>::quiet_NaN());
267 }
268 } else {
269 for (Index_ i = 0; i < my_num; ++i) {
270 my_variance[i] /= my_count - 1;
271 }
272 }
273 }
274 );
275 }
276
277private:
278 Index_ my_num;
279 Output_* my_mean;
280 Output_* my_variance;
281 bool my_skip_nan;
282 Index_ my_count = 0;
283 typename std::conditional<std::numeric_limits<Value_>::has_quiet_NaN, std::vector<Index_>, internal::MockVector<Index_> >::type my_ok_count;
284};
285
297template<typename Output_, typename Value_, typename Index_>
299public:
311 RunningSparse(Index_ num, Output_* mean, Output_* variance, bool skip_nan, Index_ subtract = 0) :
312 my_num(num),
313 my_mean(mean),
314 my_variance(variance),
315 my_nonzero(tatami::can_cast_Index_to_container_size<I<decltype(my_nonzero)> >(num)),
316 my_skip_nan(skip_nan),
317 my_subtract(subtract),
318 my_nan(skip_nan ? tatami::can_cast_Index_to_container_size<I<decltype(my_nan)> >(num) : static_cast<Index_>(0))
319 {}
320
327 void add(const Value_* value, const Index_* index, Index_ number) {
328 // my_count is the upper bound of all my_nonzero, so no need to check individual increments.
329 my_count = sanisizer::sum<Index_>(my_count, 1);
330
331 ::tatami_stats::internal::nanable_ifelse<Value_>(
332 my_skip_nan,
333 [&]() -> void {
334 for (Index_ i = 0; i < number; ++i) {
335 auto val = value[i];
336 auto ri = index[i] - my_subtract;
337 if (std::isnan(val)) {
338 ++my_nan[ri];
339 } else {
340 internal::add_welford(my_mean[ri], my_variance[ri], val, ++(my_nonzero[ri]));
341 }
342 }
343 },
344 [&]() -> void {
345 for (Index_ i = 0; i < number; ++i) {
346 auto ri = index[i] - my_subtract;
347 internal::add_welford(my_mean[ri], my_variance[ri], value[i], ++(my_nonzero[ri]));
348 }
349 }
350 );
351 }
352
356 void finish() {
357 ::tatami_stats::internal::nanable_ifelse<Value_>(
358 my_skip_nan,
359 [&]() -> void {
360 for (Index_ i = 0; i < my_num; ++i) {
361 auto& curM = my_mean[i];
362 auto& curV = my_variance[i];
363 Index_ ct = my_count - my_nan[i];
364
365 if (ct < 2) {
366 curV = std::numeric_limits<Output_>::quiet_NaN();
367 if (ct == 0) {
368 curM = std::numeric_limits<Output_>::quiet_NaN();
369 }
370 } else {
371 internal::add_welford_zeros(curM, curV, my_nonzero[i], ct);
372 curV /= ct - 1;
373 }
374 }
375 },
376 [&]() -> void {
377 if (my_count < 2) {
378 std::fill_n(my_variance, my_num, std::numeric_limits<Output_>::quiet_NaN());
379 if (my_count == 0) {
380 std::fill_n(my_mean, my_num, std::numeric_limits<Output_>::quiet_NaN());
381 }
382 } else {
383 for (Index_ i = 0; i < my_num; ++i) {
384 auto& var = my_variance[i];
385 internal::add_welford_zeros(my_mean[i], var, my_nonzero[i], my_count);
386 var /= my_count - 1;
387 }
388 }
389 }
390 );
391 }
392
393private:
394 Index_ my_num;
395 Output_* my_mean;
396 Output_* my_variance;
397 std::vector<Index_> my_nonzero;
398 bool my_skip_nan;
399 Index_ my_subtract;
400 Index_ my_count = 0;
401 typename std::conditional<std::numeric_limits<Value_>::has_quiet_NaN, std::vector<Index_>, internal::MockVector<Index_> >::type my_nan;
402};
403
421template<typename Value_, typename Index_, typename Output_>
422void apply(bool row, const tatami::Matrix<Value_, Index_>& mat, Output_* output, const Options& vopt) {
423 auto dim = (row ? mat.nrow() : mat.ncol());
424 auto otherdim = (row ? mat.ncol() : mat.nrow());
425 const bool direct = mat.prefer_rows() == row;
426
427 if (mat.sparse()) {
428 if (direct) {
429 tatami::Options opt;
430 opt.sparse_extract_index = false;
431 tatami::parallelize([&](int, Index_ s, Index_ l) -> void {
432 auto ext = tatami::consecutive_extractor<true>(mat, row, s, l);
434 for (Index_ x = 0; x < l; ++x) {
435 auto out = ext->fetch(vbuffer.data(), NULL);
436 output[x + s] = variances::direct<Output_>(out.value, out.number, otherdim, vopt.skip_nan).second;
437 }
438 }, dim, vopt.num_threads);
439
440 } else {
441 tatami::parallelize([&](int thread, Index_ s, Index_ l) -> void {
442 auto ext = tatami::consecutive_extractor<true>(mat, !row, static_cast<Index_>(0), otherdim, s, l);
445
447 LocalOutputBuffer<Output_> local_output(thread, s, l, output);
448 variances::RunningSparse<Output_, Value_, Index_> runner(l, running_means.data(), local_output.data(), vopt.skip_nan, s);
449
450 for (Index_ x = 0; x < otherdim; ++x) {
451 auto out = ext->fetch(vbuffer.data(), ibuffer.data());
452 runner.add(out.value, out.index, out.number);
453 }
454 runner.finish();
455
456 local_output.transfer();
457 }, dim, vopt.num_threads);
458 }
459
460 } else {
461 if (direct) {
462 tatami::parallelize([&](int, Index_ s, Index_ l) -> void {
463 auto ext = tatami::consecutive_extractor<false>(mat, row, s, l);
465 for (Index_ x = 0; x < l; ++x) {
466 auto out = ext->fetch(buffer.data());
467 output[x + s] = variances::direct<Output_>(out, otherdim, vopt.skip_nan).second;
468 }
469 }, dim, vopt.num_threads);
470
471 } else {
472 tatami::parallelize([&](int thread, Index_ s, Index_ l) -> void {
473 auto ext = tatami::consecutive_extractor<false>(mat, !row, static_cast<Index_>(0), otherdim, s, l);
475
477 LocalOutputBuffer<Output_> local_output(thread, s, l, output);
478 variances::RunningDense<Output_, Value_, Index_> runner(l, running_means.data(), local_output.data(), vopt.skip_nan);
479
480 for (Index_ x = 0; x < otherdim; ++x) {
481 runner.add(ext->fetch(buffer.data()));
482 }
483 runner.finish();
484
485 local_output.transfer();
486 }, dim, vopt.num_threads);
487 }
488 }
489}
490
494// Back-compatibility.
495template<typename Value_, typename Index_, typename Output_>
496void apply(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output, const Options& vopt) {
497 apply(row, *p, output, vopt);
498}
516template<typename Output_ = double, typename Value_, typename Index_>
517std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>& mat, const Options& vopt) {
519 apply(false, mat, output.data(), vopt);
520 return output;
521}
522
526// Back-compatibility.
527template<typename Output_ = double, typename Value_, typename Index_>
528std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p, const Options& vopt) {
529 return by_column<Output_>(*p, vopt);
530}
531
532template<typename Output_ = double, typename Value_, typename Index_>
533std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>& mat) {
534 return by_column<Output_>(mat, {});
535}
536
537template<typename Output_ = double, typename Value_, typename Index_>
538std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p) {
539 return by_column<Output_>(*p);
540}
558template<typename Output_ = double, typename Value_, typename Index_>
559std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>& mat, const Options& vopt) {
561 apply(true, mat, output.data(), vopt);
562 return output;
563}
564
568// Back-compatibility.
569template<typename Output_ = double, typename Value_, typename Index_>
570std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p, const Options& vopt) {
571 return by_row<Output_>(*p, vopt);
572}
573
574template<typename Output_ = double, typename Value_, typename Index_>
575std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>& mat) {
576 return by_row<Output_>(mat, {});
577}
578
579template<typename Output_ = double, typename Value_, typename Index_>
580std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p) {
581 return by_row<Output_>(*p);
582}
587}
588
589}
590
591#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
Local output buffer for running calculations.
Definition utils.hpp:93
Running variances from dense data.
Definition variances.hpp:199
void add(const Value_ *ptr)
Definition variances.hpp:221
void finish()
Definition variances.hpp:246
RunningDense(Index_ num, Output_ *mean, Output_ *variance, bool skip_nan)
Definition variances.hpp:209
Running variances from sparse data.
Definition variances.hpp:298
void finish()
Definition variances.hpp:356
RunningSparse(Index_ num, Output_ *mean, Output_ *variance, bool skip_nan, Index_ subtract=0)
Definition variances.hpp:311
void add(const Value_ *value, const Index_ *index, Index_ number)
Definition variances.hpp:327
std::vector< Output_ > by_column(const tatami::Matrix< Value_, Index_ > &mat, const Options &vopt)
Definition variances.hpp:517
std::pair< Output_, Output_ > direct(const Value_ *value, Index_ num_nonzero, Index_ num_all, bool skip_nan)
Definition variances.hpp:100
void apply(bool row, const tatami::Matrix< Value_, Index_ > &mat, Output_ *output, const Options &vopt)
Definition variances.hpp:422
std::vector< Output_ > by_row(const tatami::Matrix< Value_, Index_ > &mat, const Options &vopt)
Definition variances.hpp:559
Functions to compute statistics from a tatami::Matrix.
Definition counts.hpp:18
void parallelize(Function_ fun, const Index_ tasks, const int threads)
Container_ create_container_of_Index_size(const Index_ x, Args_ &&... args)
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, const bool row, const Index_ iter_start, const Index_ iter_length, Args_ &&... args)
bool sparse_extract_index
Variance calculation options.
Definition variances.hpp:33
int num_threads
Definition variances.hpp:44
bool skip_nan
Definition variances.hpp:38
Utilities for computing matrix statistics.