tatami_stats
Matrix statistics for tatami
Loading...
Searching...
No Matches
grouped_variances.hpp
Go to the documentation of this file.
1#ifndef TATAMI_STATS_GROUPED_VARIANCES_HPP
2#define TATAMI_STATS_GROUPED_VARIANCES_HPP
3
4#include "utils.hpp"
5#include "tatami/tatami.hpp"
6#include "variances.hpp"
7#include <vector>
8#include <algorithm>
9#include <cstddef>
10
17namespace tatami_stats {
18
23namespace grouped_variances {
24
28struct Options {
33 bool skip_nan = false;
34
39 int num_threads = 1;
40};
41
45namespace internal {
46
47template<typename Index_, typename Output_>
48void finish_means(std::size_t num_groups, const Index_* group_size, Output_* output_means) {
49 for (decltype(num_groups) b = 0; b < num_groups; ++b) {
50 if (group_size[b]) {
51 output_means[b] /= group_size[b];
52 } else {
53 output_means[b] = std::numeric_limits<Output_>::quiet_NaN();
54 }
55 }
56}
57
58template<typename Index_, typename Output_>
59void finish_variances(std::size_t num_groups, const Index_* group_size, Output_* output_variances) {
60 for (decltype(num_groups) b = 0; b < num_groups; ++b) {
61 if (group_size[b] > 1) {
62 output_variances[b] /= group_size[b] - 1;
63 } else {
64 output_variances[b] = std::numeric_limits<Output_>::quiet_NaN();
65 }
66 }
67}
68
69}
102template<typename Value_, typename Index_, typename Group_, typename Output_>
104 const Value_* ptr,
105 Index_ num,
106 const Group_* group,
107 std::size_t num_groups,
108 const Index_* group_size,
109 Output_* output_means,
110 Output_* output_variances,
111 bool skip_nan,
112 Index_* valid_group_size)
113{
114 std::fill_n(output_means, num_groups, 0);
115 std::fill_n(output_variances, num_groups, 0);
116
117 ::tatami_stats::internal::nanable_ifelse<Value_>(
118 skip_nan,
119 [&]() -> void {
120 std::fill_n(valid_group_size, num_groups, 0);
121
122 for (Index_ j = 0; j < num; ++j) {
123 auto x = ptr[j];
124 if (!std::isnan(x)) {
125 auto b = group[j];
126 output_means[b] += x;
127 ++valid_group_size[b];
128 }
129 }
130 internal::finish_means(num_groups, valid_group_size, output_means);
131
132 for (Index_ j = 0; j < num; ++j) {
133 auto x = ptr[j];
134 if (!std::isnan(x)) {
135 auto b = group[j];
136 auto delta = x - output_means[b];
137 output_variances[b] += delta * delta;
138 }
139 }
140 internal::finish_variances(num_groups, valid_group_size, output_variances);
141 },
142 [&]() -> void {
143 for (Index_ j = 0; j < num; ++j) {
144 output_means[group[j]] += ptr[j];
145 }
146 internal::finish_means(num_groups, group_size, output_means);
147
148 for (Index_ j = 0; j < num; ++j) {
149 auto b = group[j];
150 auto delta = ptr[j] - output_means[b];
151 output_variances[b] += delta * delta;
152 }
153 internal::finish_variances(num_groups, group_size, output_variances);
154 }
155 );
156}
157
190template<typename Value_, typename Index_, typename Group_, typename Output_>
192 const Value_* value,
193 const Index_* index,
194 Index_ num_nonzero,
195 const Group_* group,
196 std::size_t num_groups,
197 const Index_* group_size,
198 Output_* output_means,
199 Output_* output_variances,
200 Index_* output_nonzero,
201 bool skip_nan,
202 Index_* valid_group_size)
203{
204 std::fill_n(output_means, num_groups, 0);
205 std::fill_n(output_nonzero, num_groups, 0);
206 std::fill_n(output_variances, num_groups, 0);
207
208 ::tatami_stats::internal::nanable_ifelse<Value_>(
209 skip_nan,
210 [&]() -> void {
211 std::copy_n(group_size, num_groups, valid_group_size);
212
213 for (Index_ j = 0; j < num_nonzero; ++j) {
214 auto x = value[j];
215 auto b = group[index[j]];
216 if (!std::isnan(x)) {
217 output_means[b] += x;
218 ++(output_nonzero[b]);
219 } else {
220 --(valid_group_size[b]);
221 }
222 }
223 internal::finish_means(num_groups, valid_group_size, output_means);
224
225 for (Index_ j = 0; j < num_nonzero; ++j) {
226 auto x = value[j];
227 if (!std::isnan(x)) {
228 auto b = group[index[j]];
229 auto delta = x - output_means[b];
230 output_variances[b] += delta * delta;
231 }
232 }
233 for (decltype(num_groups) b = 0; b < num_groups; ++b) {
234 output_variances[b] += output_means[b] * output_means[b] * (valid_group_size[b] - output_nonzero[b]);
235 }
236 internal::finish_variances(num_groups, valid_group_size, output_variances);
237 },
238 [&]() -> void {
239 for (Index_ j = 0; j < num_nonzero; ++j) {
240 auto b = group[index[j]];
241 output_means[b] += value[j];
242 ++output_nonzero[b];
243 }
244 internal::finish_means(num_groups, group_size, output_means);
245
246 for (Index_ j = 0; j < num_nonzero; ++j) {
247 auto b = group[index[j]];
248 auto delta = value[j] - output_means[b];
249 output_variances[b] += delta * delta;
250 }
251 for (decltype(num_groups) b = 0; b < num_groups; ++b) {
252 output_variances[b] += output_means[b] * output_means[b] * (group_size[b] - output_nonzero[b]);
253 }
254 internal::finish_variances(num_groups, group_size, output_variances);
255 }
256 );
257}
258
281template<typename Value_, typename Index_, typename Group_, typename Output_>
282void apply(bool row, const tatami::Matrix<Value_, Index_>& mat, const Group_* group, std::size_t num_groups, const Index_* group_size, Output_** output, const Options& sopt) {
283 Index_ dim = (row ? mat.nrow() : mat.ncol());
284 Index_ otherdim = (row ? mat.ncol() : mat.nrow());
285
286 if (mat.sparse()) {
287 if (mat.prefer_rows() == row) {
288 tatami::parallelize([&](int, Index_ start, Index_ len) -> void {
289 auto ext = tatami::consecutive_extractor<true>(mat, row, start, len);
290 std::vector<Value_> xbuffer(otherdim);
291 std::vector<Index_> ibuffer(otherdim);
292
293 std::vector<Output_> tmp_means(num_groups);
294 std::vector<Output_> output_variances(num_groups);
295 std::vector<Index_> tmp_nonzero(num_groups);
296 std::vector<Index_> valid_group_size(sopt.skip_nan ? num_groups : 0);
297
298 for (Index_ i = 0; i < len; ++i) {
299 auto range = ext->fetch(xbuffer.data(), ibuffer.data());
300 direct(
301 range.value,
302 range.index,
303 range.number,
304 group,
305 num_groups,
306 group_size,
307 tmp_means.data(),
308 output_variances.data(),
309 tmp_nonzero.data(),
310 sopt.skip_nan,
311 valid_group_size.data()
312 );
313
314 for (decltype(num_groups) g = 0; g < num_groups; ++g) {
315 output[g][i + start] = output_variances[g];
316 }
317 }
318 }, dim, sopt.num_threads);
319
320 } else {
321 // Order within each observed vector doesn't affect numerical
322 // precision of the outcome, as addition order for each objective
323 // vector is already well-defined for a running calculation.
324 tatami::Options opt;
325 opt.sparse_ordered_index = false;
326
327 tatami::parallelize([&](int thread, Index_ start, Index_ len) -> void {
328 std::vector<variances::RunningSparse<Output_, Value_, Index_> > runners;
329 runners.reserve(num_groups);
330 std::vector<LocalOutputBuffer<Output_> > local_var_output;
331 local_var_output.reserve(num_groups);
332 std::vector<std::vector<Output_> > local_mean_output;
333 local_mean_output.reserve(num_groups);
334
335 for (decltype(num_groups) g = 0; g < num_groups; ++g) {
336 local_var_output.emplace_back(thread, start, len, output[g]);
337 local_mean_output.emplace_back(len);
338 runners.emplace_back(len, local_mean_output.back().data(), local_var_output.back().data(), sopt.skip_nan, start);
339 }
340
341 auto ext = tatami::consecutive_extractor<true>(mat, !row, static_cast<Index_>(0), otherdim, start, len, opt);
342 std::vector<Value_> xbuffer(len);
343 std::vector<Index_> ibuffer(len);
344
345 for (int i = 0; i < otherdim; ++i) {
346 auto range = ext->fetch(xbuffer.data(), ibuffer.data());
347 runners[group[i]].add(range.value, range.index, range.number);
348 }
349
350 for (decltype(num_groups) g = 0; g < num_groups; ++g) {
351 runners[g].finish();
352 local_var_output[g].transfer();
353 }
354 }, dim, sopt.num_threads);
355 }
356
357 } else {
358 if (mat.prefer_rows() == row) {
359 tatami::parallelize([&](int, Index_ start, Index_ len) -> void {
360 auto ext = tatami::consecutive_extractor<false>(mat, row, start, len);
361 std::vector<Value_> xbuffer(otherdim);
362 std::vector<Output_> tmp_means(num_groups);
363 std::vector<Output_> output_variances(num_groups);
364 std::vector<Index_> valid_group_size(sopt.skip_nan ? num_groups : 0);
365
366 for (Index_ i = 0; i < len; ++i) {
367 auto ptr = ext->fetch(xbuffer.data());
368 direct(
369 ptr,
370 otherdim,
371 group,
372 num_groups,
373 group_size,
374 tmp_means.data(),
375 output_variances.data(),
376 sopt.skip_nan,
377 valid_group_size.data()
378 );
379
380 for (decltype(num_groups) g = 0; g < num_groups; ++g) {
381 output[g][i + start] = output_variances[g];
382 }
383 }
384 }, dim, sopt.num_threads);
385
386 } else {
387 tatami::parallelize([&](int thread, Index_ start, Index_ len) -> void {
388 std::vector<variances::RunningDense<Output_, Value_, Index_> > runners;
389 runners.reserve(num_groups);
390 std::vector<LocalOutputBuffer<Output_> > local_var_output;
391 local_var_output.reserve(num_groups);
392 std::vector<std::vector<Output_> > local_mean_output;
393 local_mean_output.reserve(num_groups);
394
395 for (decltype(num_groups) g = 0; g < num_groups; ++g) {
396 local_var_output.emplace_back(thread, start, len, output[g]);
397 local_mean_output.emplace_back(len);
398 runners.emplace_back(len, local_mean_output.back().data(), local_var_output.back().data(), sopt.skip_nan);
399 }
400
401 std::vector<Value_> xbuffer(len);
402 auto ext = tatami::consecutive_extractor<false>(mat, !row, static_cast<Index_>(0), otherdim, start, len);
403
404 for (Index_ i = 0; i < otherdim; ++i) {
405 auto ptr = ext->fetch(xbuffer.data());
406 runners[group[i]].add(ptr);
407 }
408
409 for (decltype(num_groups) g = 0; g < num_groups; ++g) {
410 runners[g].finish();
411 local_var_output[g].transfer();
412 }
413 }, dim, sopt.num_threads);
414 }
415 }
416}
417
421// Back-compatibility.
422template<typename Value_, typename Index_, typename Group_, typename Output_>
423void apply(bool row, const tatami::Matrix<Value_, Index_>* p, const Group_* group, std::size_t num_groups, const Index_* group_size, Output_** output, const Options& sopt) {
424 apply(row, *p, group, num_groups, group_size, output, sopt);
425}
447template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
448std::vector<std::vector<Output_> > by_row(const tatami::Matrix<Value_, Index_>& mat, const Group_* group, const Options& sopt) {
449 auto mydim = mat.nrow();
450 auto group_size = tabulate_groups(group, mat.ncol());
451 auto ngroup = group_size.size();
452
453 std::vector<std::vector<Output_> > output(ngroup);
454 std::vector<Output_*> ptrs;
455 ptrs.reserve(output.size());
456 for (auto& o : output) {
457 o.resize(mydim);
458 ptrs.push_back(o.data());
459 }
460
461 apply(true, mat, group, ngroup, group_size.data(), ptrs.data(), sopt);
462 return output;
463}
464
468// Back-compatibility.
469template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
470std::vector<std::vector<Output_> > by_row(const tatami::Matrix<Value_, Index_>* p, const Group_* group, const Options& sopt) {
471 return by_row<Output_>(*p, group, sopt);
472}
473
474template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
475std::vector<std::vector<Output_> > by_row(const tatami::Matrix<Value_, Index_>& mat, const Group_* group) {
476 return by_row<Output_>(mat, group, Options());
477}
478
479template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
480std::vector<std::vector<Output_> > by_row(const tatami::Matrix<Value_, Index_>* p, const Group_* group) {
481 return by_row<Output_>(*p, group);
482}
504template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
505std::vector<std::vector<Output_> > by_column(const tatami::Matrix<Value_, Index_>& mat, const Group_* group, const Options& sopt) {
506 auto mydim = mat.ncol();
507 auto group_size = tabulate_groups(group, mat.nrow());
508 auto ngroup = group_size.size();
509
510 std::vector<std::vector<Output_> > output(ngroup);
511 std::vector<Output_*> ptrs;
512 ptrs.reserve(output.size());
513 for (auto& o : output) {
514 o.resize(mydim);
515 ptrs.push_back(o.data());
516 }
517
518 apply(false, mat, group, ngroup, group_size.data(), ptrs.data(), sopt);
519 return output;
520}
521
525// Back-compatibility.
526template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
527std::vector<std::vector<Output_> > by_column(const tatami::Matrix<Value_, Index_>* p, const Group_* group, const Options& sopt) {
528 return by_column<Output_>(*p, group, sopt);
529}
530
531template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
532std::vector<std::vector<Output_> > by_column(const tatami::Matrix<Value_, Index_>& mat, const Group_* group) {
533 return by_column<Output_>(mat, group, Options());
534}
535
536template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
537std::vector<std::vector<Output_> > by_column(const tatami::Matrix<Value_, Index_>* p, const Group_* group) {
538 return by_column<Output_>(*p, group);
539}
544}
545
546}
547
548#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
void direct(const Value_ *ptr, Index_ num, const Group_ *group, std::size_t num_groups, const Index_ *group_size, Output_ *output_means, Output_ *output_variances, bool skip_nan, Index_ *valid_group_size)
Definition grouped_variances.hpp:103
void apply(bool row, const tatami::Matrix< Value_, Index_ > &mat, const Group_ *group, std::size_t num_groups, const Index_ *group_size, Output_ **output, const Options &sopt)
Definition grouped_variances.hpp:282
std::vector< std::vector< Output_ > > by_column(const tatami::Matrix< Value_, Index_ > &mat, const Group_ *group, const Options &sopt)
Definition grouped_variances.hpp:505
std::vector< std::vector< Output_ > > by_row(const tatami::Matrix< Value_, Index_ > &mat, const Group_ *group, const Options &sopt)
Definition grouped_variances.hpp:448
Functions to compute statistics from a tatami::Matrix.
Definition counts.hpp:18
std::vector< Size_ > tabulate_groups(const Group_ *group, Size_ n)
Definition utils.hpp:50
void parallelize(Function_ fun, Index_ tasks, int threads)
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, bool row, Index_ iter_start, Index_ iter_length, Args_ &&... args)
bool sparse_ordered_index
Grouped summation options.
Definition grouped_variances.hpp:28
bool skip_nan
Definition grouped_variances.hpp:33
int num_threads
Definition grouped_variances.hpp:39
Utilities for computing matrix statistics.
Compute row and column variances from a tatami::Matrix.