tatami_stats
Matrix statistics for tatami
Loading...
Searching...
No Matches
grouped_variances.hpp
Go to the documentation of this file.
1#ifndef TATAMI_STATS_GROUPED_VARIANCES_HPP
2#define TATAMI_STATS_GROUPED_VARIANCES_HPP
3
4#include "utils.hpp"
5#include "tatami/tatami.hpp"
6#include "variances.hpp"
7#include <vector>
8#include <algorithm>
9
16namespace tatami_stats {
17
22namespace grouped_variances {
23
27struct Options {
32 bool skip_nan = false;
33
38 int num_threads = 1;
39};
40
44namespace internal {
45
46template<typename Index_, typename Output_>
47void finish_means(size_t num_groups, const Index_* group_size, Output_* output_means) {
48 for (size_t b = 0; b < num_groups; ++b) {
49 if (group_size[b]) {
50 output_means[b] /= group_size[b];
51 } else {
52 output_means[b] = std::numeric_limits<Output_>::quiet_NaN();
53 }
54 }
55}
56
57template<typename Index_, typename Output_>
58void finish_variances(size_t num_groups, const Index_* group_size, Output_* output_variances) {
59 for (size_t b = 0; b < num_groups; ++b) {
60 if (group_size[b] > 1) {
61 output_variances[b] /= group_size[b] - 1;
62 } else {
63 output_variances[b] = std::numeric_limits<Output_>::quiet_NaN();
64 }
65 }
66}
67
68}
101template<typename Value_, typename Index_, typename Group_, typename Output_>
103 const Value_* ptr,
104 Index_ num,
105 const Group_* group,
106 size_t num_groups,
107 const Index_* group_size,
108 Output_* output_means,
109 Output_* output_variances,
110 bool skip_nan,
111 Index_* valid_group_size)
112{
113 std::fill_n(output_means, num_groups, 0);
114 std::fill_n(output_variances, num_groups, 0);
115
116 ::tatami_stats::internal::nanable_ifelse<Value_>(
117 skip_nan,
118 [&]() {
119 std::fill_n(valid_group_size, num_groups, 0);
120
121 for (Index_ j = 0; j < num; ++j) {
122 auto x = ptr[j];
123 if (!std::isnan(x)) {
124 auto b = group[j];
125 output_means[b] += x;
126 ++valid_group_size[b];
127 }
128 }
129 internal::finish_means(num_groups, valid_group_size, output_means);
130
131 for (Index_ j = 0; j < num; ++j) {
132 auto x = ptr[j];
133 if (!std::isnan(x)) {
134 auto b = group[j];
135 auto delta = x - output_means[b];
136 output_variances[b] += delta * delta;
137 }
138 }
139 internal::finish_variances(num_groups, valid_group_size, output_variances);
140 },
141 [&]() {
142 for (Index_ j = 0; j < num; ++j) {
143 output_means[group[j]] += ptr[j];
144 }
145 internal::finish_means(num_groups, group_size, output_means);
146
147 for (Index_ j = 0; j < num; ++j) {
148 auto b = group[j];
149 auto delta = ptr[j] - output_means[b];
150 output_variances[b] += delta * delta;
151 }
152 internal::finish_variances(num_groups, group_size, output_variances);
153 }
154 );
155}
156
189template<typename Value_, typename Index_, typename Group_, typename Output_>
191 const Value_* value,
192 const Index_* index,
193 Index_ num_nonzero,
194 const Group_* group,
195 size_t num_groups,
196 const Index_* group_size,
197 Output_* output_means,
198 Output_* output_variances,
199 Index_* output_nonzero,
200 bool skip_nan,
201 Index_* valid_group_size)
202{
203 std::fill_n(output_means, num_groups, 0);
204 std::fill_n(output_nonzero, num_groups, 0);
205 std::fill_n(output_variances, num_groups, 0);
206
207 ::tatami_stats::internal::nanable_ifelse<Value_>(
208 skip_nan,
209 [&]() {
210 std::copy_n(group_size, num_groups, valid_group_size);
211
212 for (Index_ j = 0; j < num_nonzero; ++j) {
213 auto x = value[j];
214 auto b = group[index[j]];
215 if (!std::isnan(x)) {
216 output_means[b] += x;
217 ++(output_nonzero[b]);
218 } else {
219 --(valid_group_size[b]);
220 }
221 }
222 internal::finish_means(num_groups, valid_group_size, output_means);
223
224 for (Index_ j = 0; j < num_nonzero; ++j) {
225 auto x = value[j];
226 if (!std::isnan(x)) {
227 auto b = group[index[j]];
228 auto delta = x - output_means[b];
229 output_variances[b] += delta * delta;
230 }
231 }
232 for (size_t b = 0; b < num_groups; ++b) {
233 output_variances[b] += output_means[b] * output_means[b] * (valid_group_size[b] - output_nonzero[b]);
234 }
235 internal::finish_variances(num_groups, valid_group_size, output_variances);
236 },
237 [&]() {
238 for (Index_ j = 0; j < num_nonzero; ++j) {
239 auto b = group[index[j]];
240 output_means[b] += value[j];
241 ++output_nonzero[b];
242 }
243 internal::finish_means(num_groups, group_size, output_means);
244
245 for (Index_ j = 0; j < num_nonzero; ++j) {
246 auto b = group[index[j]];
247 auto delta = value[j] - output_means[b];
248 output_variances[b] += delta * delta;
249 }
250 for (size_t b = 0; b < num_groups; ++b) {
251 output_variances[b] += output_means[b] * output_means[b] * (group_size[b] - output_nonzero[b]);
252 }
253 internal::finish_variances(num_groups, group_size, output_variances);
254 }
255 );
256}
257
280template<typename Value_, typename Index_, typename Group_, typename Output_>
281void apply(bool row, const tatami::Matrix<Value_, Index_>* p, const Group_* group, size_t num_groups, const Index_* group_size, Output_** output, const Options& sopt) {
282 Index_ dim = (row ? p->nrow() : p->ncol());
283 Index_ otherdim = (row ? p->ncol() : p->nrow());
284
285 if (p->sparse()) {
286 if (p->prefer_rows() == row) {
287 tatami::parallelize([&](int, Index_ start, Index_ len) -> void {
288 auto ext = tatami::consecutive_extractor<true>(p, row, start, len);
289 std::vector<Value_> xbuffer(otherdim);
290 std::vector<Index_> ibuffer(otherdim);
291
292 std::vector<Output_> tmp_means(num_groups);
293 std::vector<Output_> output_variances(num_groups);
294 std::vector<Index_> tmp_nonzero(num_groups);
295 std::vector<Index_> valid_group_size(sopt.skip_nan ? num_groups : 0);
296
297 for (Index_ i = 0; i < len; ++i) {
298 auto range = ext->fetch(xbuffer.data(), ibuffer.data());
299 direct(
300 range.value,
301 range.index,
302 range.number,
303 group,
304 num_groups,
305 group_size,
306 tmp_means.data(),
307 output_variances.data(),
308 tmp_nonzero.data(),
309 sopt.skip_nan,
310 valid_group_size.data()
311 );
312
313 for (size_t g = 0; g < num_groups; ++g) {
314 output[g][i + start] = output_variances[g];
315 }
316 }
317 }, dim, sopt.num_threads);
318
319 } else {
320 // Order within each observed vector doesn't affect numerical
321 // precision of the outcome, as addition order for each objective
322 // vector is already well-defined for a running calculation.
323 tatami::Options opt;
324 opt.sparse_ordered_index = false;
325
326 tatami::parallelize([&](int thread, Index_ start, Index_ len) -> void {
327 std::vector<variances::RunningSparse<Output_, Value_, Index_> > runners;
328 runners.reserve(num_groups);
329 std::vector<LocalOutputBuffer<Output_> > local_var_output;
330 local_var_output.reserve(num_groups);
331 std::vector<std::vector<Output_> > local_mean_output;
332 local_mean_output.reserve(num_groups);
333
334 for (size_t g = 0; g < num_groups; ++g) {
335 local_var_output.emplace_back(thread, start, len, output[g]);
336 local_mean_output.emplace_back(len);
337 runners.emplace_back(len, local_mean_output.back().data(), local_var_output.back().data(), sopt.skip_nan, start);
338 }
339
340 auto ext = tatami::consecutive_extractor<true>(p, !row, static_cast<Index_>(0), otherdim, start, len, opt);
341 std::vector<Value_> xbuffer(len);
342 std::vector<Index_> ibuffer(len);
343
344 for (int i = 0; i < otherdim; ++i) {
345 auto range = ext->fetch(xbuffer.data(), ibuffer.data());
346 runners[group[i]].add(range.value, range.index, range.number);
347 }
348
349 for (size_t g = 0; g < num_groups; ++g) {
350 runners[g].finish();
351 local_var_output[g].transfer();
352 }
353 }, dim, sopt.num_threads);
354 }
355
356 } else {
357 if (p->prefer_rows() == row) {
358 tatami::parallelize([&](int, Index_ start, Index_ len) -> void {
359 auto ext = tatami::consecutive_extractor<false>(p, row, start, len);
360 std::vector<Value_> xbuffer(otherdim);
361 std::vector<Output_> tmp_means(num_groups);
362 std::vector<Output_> output_variances(num_groups);
363 std::vector<Index_> valid_group_size(sopt.skip_nan ? num_groups : 0);
364
365 for (Index_ i = 0; i < len; ++i) {
366 auto ptr = ext->fetch(xbuffer.data());
367 direct(
368 ptr,
369 otherdim,
370 group,
371 num_groups,
372 group_size,
373 tmp_means.data(),
374 output_variances.data(),
375 sopt.skip_nan,
376 valid_group_size.data()
377 );
378
379 for (size_t g = 0; g < num_groups; ++g) {
380 output[g][i + start] = output_variances[g];
381 }
382 }
383 }, dim, sopt.num_threads);
384
385 } else {
386 tatami::parallelize([&](int thread, Index_ start, Index_ len) -> void {
387 std::vector<variances::RunningDense<Output_, Value_, Index_> > runners;
388 runners.reserve(num_groups);
389 std::vector<LocalOutputBuffer<Output_> > local_var_output;
390 local_var_output.reserve(num_groups);
391 std::vector<std::vector<Output_> > local_mean_output;
392 local_mean_output.reserve(num_groups);
393
394 for (size_t g = 0; g < num_groups; ++g) {
395 local_var_output.emplace_back(thread, start, len, output[g]);
396 local_mean_output.emplace_back(len);
397 runners.emplace_back(len, local_mean_output.back().data(), local_var_output.back().data(), sopt.skip_nan);
398 }
399
400 std::vector<Value_> xbuffer(len);
401 auto ext = tatami::consecutive_extractor<false>(p, !row, static_cast<Index_>(0), otherdim, start, len);
402
403 for (Index_ i = 0; i < otherdim; ++i) {
404 auto ptr = ext->fetch(xbuffer.data());
405 runners[group[i]].add(ptr);
406 }
407
408 for (size_t g = 0; g < num_groups; ++g) {
409 runners[g].finish();
410 local_var_output[g].transfer();
411 }
412 }, dim, sopt.num_threads);
413 }
414 }
415}
416
434template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
435std::vector<std::vector<Output_> > by_row(const tatami::Matrix<Value_, Index_>* p, const Group_* group, const Options& sopt) {
436 size_t mydim = p->nrow();
437 auto group_size = tabulate_groups(group, p->ncol());
438 size_t ngroup = group_size.size();
439
440 std::vector<std::vector<Output_> > output(ngroup);
441 std::vector<Output_*> ptrs;
442 ptrs.reserve(output.size());
443 for (auto& o : output) {
444 o.resize(mydim);
445 ptrs.push_back(o.data());
446 }
447
448 apply(true, p, group, ngroup, group_size.data(), ptrs.data(), sopt);
449 return output;
450}
451
468template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
469std::vector<std::vector<Output_> > by_row(const tatami::Matrix<Value_, Index_>* p, const Group_* group) {
470 return by_row(p, group, Options());
471}
472
490template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
491std::vector<std::vector<Output_> > by_column(const tatami::Matrix<Value_, Index_>* p, const Group_* group, const Options& sopt) {
492 size_t mydim = p->ncol();
493 auto group_size = tabulate_groups(group, p->nrow());
494 size_t ngroup = group_size.size();
495
496 std::vector<std::vector<Output_> > output(ngroup);
497 std::vector<Output_*> ptrs;
498 ptrs.reserve(output.size());
499 for (auto& o : output) {
500 o.resize(mydim);
501 ptrs.push_back(o.data());
502 }
503
504 apply(false, p, group, ngroup, group_size.data(), ptrs.data(), sopt);
505 return output;
506}
507
524template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
525std::vector<std::vector<Output_> > by_column(const tatami::Matrix<Value_, Index_>* p, const Group_* group) {
526 return by_column(p, group, Options());
527}
528
529}
530
531}
532
533#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
void direct(const Value_ *ptr, Index_ num, const Group_ *group, size_t num_groups, const Index_ *group_size, Output_ *output_means, Output_ *output_variances, bool skip_nan, Index_ *valid_group_size)
Definition grouped_variances.hpp:102
std::vector< std::vector< Output_ > > by_row(const tatami::Matrix< Value_, Index_ > *p, const Group_ *group, const Options &sopt)
Definition grouped_variances.hpp:435
std::vector< std::vector< Output_ > > by_column(const tatami::Matrix< Value_, Index_ > *p, const Group_ *group, const Options &sopt)
Definition grouped_variances.hpp:491
void apply(bool row, const tatami::Matrix< Value_, Index_ > *p, const Group_ *group, size_t num_groups, const Index_ *group_size, Output_ **output, const Options &sopt)
Definition grouped_variances.hpp:281
Functions to compute statistics from a tatami::Matrix.
Definition counts.hpp:18
std::vector< Size_ > tabulate_groups(const Group_ *group, Size_ n)
Definition utils.hpp:49
void parallelize(Function_ fun, Index_ tasks, int threads)
auto consecutive_extractor(const Matrix< Value_, Index_ > *mat, bool row, Index_ iter_start, Index_ iter_length, Args_ &&... args)
bool sparse_ordered_index
Grouped summation options.
Definition grouped_variances.hpp:27
bool skip_nan
Definition grouped_variances.hpp:32
int num_threads
Definition grouped_variances.hpp:38
Utilities for computing matrix statistics.
Compute row and column variances from a tatami::Matrix.