tatami_stats
Matrix statistics for tatami
Loading...
Searching...
No Matches
grouped_variances.hpp
Go to the documentation of this file.
1#ifndef TATAMI_STATS_GROUPED_VARIANCES_HPP
2#define TATAMI_STATS_GROUPED_VARIANCES_HPP
3
4#include "utils.hpp"
5#include "variances.hpp"
6
7#include <vector>
8#include <algorithm>
9#include <cstddef>
10
11#include "tatami/tatami.hpp"
12#include "sanisizer/sanisizer.hpp"
13
20namespace tatami_stats {
21
26namespace grouped_variances {
27
31struct Options {
36 bool skip_nan = false;
37
42 int num_threads = 1;
43};
44
48namespace internal {
49
50template<typename Index_, typename Output_>
51void finish_means(std::size_t num_groups, const Index_* group_size, Output_* output_means) {
52 for (decltype(num_groups) b = 0; b < num_groups; ++b) {
53 if (group_size[b]) {
54 output_means[b] /= group_size[b];
55 } else {
56 output_means[b] = std::numeric_limits<Output_>::quiet_NaN();
57 }
58 }
59}
60
61template<typename Index_, typename Output_>
62void finish_variances(std::size_t num_groups, const Index_* group_size, Output_* output_variances) {
63 for (decltype(num_groups) b = 0; b < num_groups; ++b) {
64 if (group_size[b] > 1) {
65 output_variances[b] /= group_size[b] - 1;
66 } else {
67 output_variances[b] = std::numeric_limits<Output_>::quiet_NaN();
68 }
69 }
70}
71
72}
105template<typename Value_, typename Index_, typename Group_, typename Output_>
107 const Value_* ptr,
108 Index_ num,
109 const Group_* group,
110 std::size_t num_groups,
111 const Index_* group_size,
112 Output_* output_means,
113 Output_* output_variances,
114 bool skip_nan,
115 Index_* valid_group_size)
116{
117 std::fill_n(output_means, num_groups, 0);
118 std::fill_n(output_variances, num_groups, 0);
119
120 ::tatami_stats::internal::nanable_ifelse<Value_>(
121 skip_nan,
122 [&]() -> void {
123 std::fill_n(valid_group_size, num_groups, 0);
124
125 for (Index_ j = 0; j < num; ++j) {
126 auto x = ptr[j];
127 if (!std::isnan(x)) {
128 auto b = group[j];
129 output_means[b] += x;
130 ++valid_group_size[b];
131 }
132 }
133 internal::finish_means(num_groups, valid_group_size, output_means);
134
135 for (Index_ j = 0; j < num; ++j) {
136 auto x = ptr[j];
137 if (!std::isnan(x)) {
138 auto b = group[j];
139 auto delta = x - output_means[b];
140 output_variances[b] += delta * delta;
141 }
142 }
143 internal::finish_variances(num_groups, valid_group_size, output_variances);
144 },
145 [&]() -> void {
146 for (Index_ j = 0; j < num; ++j) {
147 output_means[group[j]] += ptr[j];
148 }
149 internal::finish_means(num_groups, group_size, output_means);
150
151 for (Index_ j = 0; j < num; ++j) {
152 auto b = group[j];
153 auto delta = ptr[j] - output_means[b];
154 output_variances[b] += delta * delta;
155 }
156 internal::finish_variances(num_groups, group_size, output_variances);
157 }
158 );
159}
160
193template<typename Value_, typename Index_, typename Group_, typename Output_>
195 const Value_* value,
196 const Index_* index,
197 Index_ num_nonzero,
198 const Group_* group,
199 std::size_t num_groups,
200 const Index_* group_size,
201 Output_* output_means,
202 Output_* output_variances,
203 Index_* output_nonzero,
204 bool skip_nan,
205 Index_* valid_group_size)
206{
207 std::fill_n(output_means, num_groups, 0);
208 std::fill_n(output_nonzero, num_groups, 0);
209 std::fill_n(output_variances, num_groups, 0);
210
211 ::tatami_stats::internal::nanable_ifelse<Value_>(
212 skip_nan,
213 [&]() -> void {
214 std::copy_n(group_size, num_groups, valid_group_size);
215
216 for (Index_ j = 0; j < num_nonzero; ++j) {
217 auto x = value[j];
218 auto b = group[index[j]];
219 if (!std::isnan(x)) {
220 output_means[b] += x;
221 ++(output_nonzero[b]);
222 } else {
223 --(valid_group_size[b]);
224 }
225 }
226 internal::finish_means(num_groups, valid_group_size, output_means);
227
228 for (Index_ j = 0; j < num_nonzero; ++j) {
229 auto x = value[j];
230 if (!std::isnan(x)) {
231 auto b = group[index[j]];
232 auto delta = x - output_means[b];
233 output_variances[b] += delta * delta;
234 }
235 }
236 for (decltype(num_groups) b = 0; b < num_groups; ++b) {
237 output_variances[b] += output_means[b] * output_means[b] * (valid_group_size[b] - output_nonzero[b]);
238 }
239 internal::finish_variances(num_groups, valid_group_size, output_variances);
240 },
241 [&]() -> void {
242 for (Index_ j = 0; j < num_nonzero; ++j) {
243 auto b = group[index[j]];
244 output_means[b] += value[j];
245 ++output_nonzero[b];
246 }
247 internal::finish_means(num_groups, group_size, output_means);
248
249 for (Index_ j = 0; j < num_nonzero; ++j) {
250 auto b = group[index[j]];
251 auto delta = value[j] - output_means[b];
252 output_variances[b] += delta * delta;
253 }
254 for (decltype(num_groups) b = 0; b < num_groups; ++b) {
255 output_variances[b] += output_means[b] * output_means[b] * (group_size[b] - output_nonzero[b]);
256 }
257 internal::finish_variances(num_groups, group_size, output_variances);
258 }
259 );
260}
261
284template<typename Value_, typename Index_, typename Group_, typename Output_>
285void apply(bool row, const tatami::Matrix<Value_, Index_>& mat, const Group_* group, std::size_t num_groups, const Index_* group_size, Output_** output, const Options& sopt) {
286 Index_ dim = (row ? mat.nrow() : mat.ncol());
287 Index_ otherdim = (row ? mat.ncol() : mat.nrow());
288
289 if (mat.sparse()) {
290 if (mat.prefer_rows() == row) {
291 tatami::parallelize([&](int, Index_ start, Index_ len) -> void {
292 auto ext = tatami::consecutive_extractor<true>(mat, row, start, len);
295
296 auto tmp_means = sanisizer::create<std::vector<Output_> >(num_groups);
297 auto output_variances = sanisizer::create<std::vector<Output_> >(num_groups);
298 auto tmp_nonzero = sanisizer::create<std::vector<Index_> >(num_groups);
299 auto valid_group_size = sanisizer::create<std::vector<Index_> >(sopt.skip_nan ? num_groups : 0);
300
301 for (Index_ i = 0; i < len; ++i) {
302 auto range = ext->fetch(xbuffer.data(), ibuffer.data());
303 direct(
304 range.value,
305 range.index,
306 range.number,
307 group,
308 num_groups,
309 group_size,
310 tmp_means.data(),
311 output_variances.data(),
312 tmp_nonzero.data(),
313 sopt.skip_nan,
314 valid_group_size.data()
315 );
316
317 for (decltype(num_groups) g = 0; g < num_groups; ++g) {
318 output[g][i + start] = output_variances[g];
319 }
320 }
321 }, dim, sopt.num_threads);
322
323 } else {
324 // Order within each observed vector doesn't affect numerical
325 // precision of the outcome, as addition order for each objective
326 // vector is already well-defined for a running calculation.
327 tatami::Options opt;
328 opt.sparse_ordered_index = false;
329
330 tatami::parallelize([&](int thread, Index_ start, Index_ len) -> void {
331 std::vector<variances::RunningSparse<Output_, Value_, Index_> > runners;
332 runners.reserve(num_groups);
333 std::vector<LocalOutputBuffer<Output_> > local_var_output;
334 local_var_output.reserve(num_groups);
335 std::vector<std::vector<Output_> > local_mean_output;
336 local_mean_output.reserve(num_groups);
337
338 for (decltype(num_groups) g = 0; g < num_groups; ++g) {
339 local_var_output.emplace_back(thread, start, len, output[g]);
340 local_mean_output.emplace_back(len);
341 runners.emplace_back(len, local_mean_output.back().data(), local_var_output.back().data(), sopt.skip_nan, start);
342 }
343
344 auto ext = tatami::consecutive_extractor<true>(mat, !row, static_cast<Index_>(0), otherdim, start, len, opt);
347
348 for (int i = 0; i < otherdim; ++i) {
349 auto range = ext->fetch(xbuffer.data(), ibuffer.data());
350 runners[group[i]].add(range.value, range.index, range.number);
351 }
352
353 for (decltype(num_groups) g = 0; g < num_groups; ++g) {
354 runners[g].finish();
355 local_var_output[g].transfer();
356 }
357 }, dim, sopt.num_threads);
358 }
359
360 } else {
361 if (mat.prefer_rows() == row) {
362 tatami::parallelize([&](int, Index_ start, Index_ len) -> void {
363 auto ext = tatami::consecutive_extractor<false>(mat, row, start, len);
365
366 auto tmp_means = sanisizer::create<std::vector<Output_> >(num_groups);
367 auto output_variances = sanisizer::create<std::vector<Output_> >(num_groups);
368 auto valid_group_size = sanisizer::create<std::vector<Index_> >(sopt.skip_nan ? num_groups : 0);
369
370 for (Index_ i = 0; i < len; ++i) {
371 auto ptr = ext->fetch(xbuffer.data());
372 direct(
373 ptr,
374 otherdim,
375 group,
376 num_groups,
377 group_size,
378 tmp_means.data(),
379 output_variances.data(),
380 sopt.skip_nan,
381 valid_group_size.data()
382 );
383
384 for (decltype(num_groups) g = 0; g < num_groups; ++g) {
385 output[g][i + start] = output_variances[g];
386 }
387 }
388 }, dim, sopt.num_threads);
389
390 } else {
391 tatami::parallelize([&](int thread, Index_ start, Index_ len) -> void {
392 std::vector<variances::RunningDense<Output_, Value_, Index_> > runners;
393 runners.reserve(num_groups);
394 std::vector<LocalOutputBuffer<Output_> > local_var_output;
395 local_var_output.reserve(num_groups);
396 std::vector<std::vector<Output_> > local_mean_output;
397 local_mean_output.reserve(num_groups);
398
399 for (decltype(num_groups) g = 0; g < num_groups; ++g) {
400 local_var_output.emplace_back(thread, start, len, output[g]);
401 local_mean_output.emplace_back(len);
402 runners.emplace_back(len, local_mean_output.back().data(), local_var_output.back().data(), sopt.skip_nan);
403 }
404
406 auto ext = tatami::consecutive_extractor<false>(mat, !row, static_cast<Index_>(0), otherdim, start, len);
407
408 for (Index_ i = 0; i < otherdim; ++i) {
409 auto ptr = ext->fetch(xbuffer.data());
410 runners[group[i]].add(ptr);
411 }
412
413 for (decltype(num_groups) g = 0; g < num_groups; ++g) {
414 runners[g].finish();
415 local_var_output[g].transfer();
416 }
417 }, dim, sopt.num_threads);
418 }
419 }
420}
421
425// Back-compatibility.
426template<typename Value_, typename Index_, typename Group_, typename Output_>
427void apply(bool row, const tatami::Matrix<Value_, Index_>* p, const Group_* group, std::size_t num_groups, const Index_* group_size, Output_** output, const Options& sopt) {
428 apply(row, *p, group, num_groups, group_size, output, sopt);
429}
451template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
452std::vector<std::vector<Output_> > by_row(const tatami::Matrix<Value_, Index_>& mat, const Group_* group, const Options& sopt) {
453 auto mydim = mat.nrow();
454 auto group_size = tabulate_groups(group, mat.ncol());
455 auto ngroup = group_size.size();
456
457 auto output = sanisizer::create<std::vector<std::vector<Output_> > >(ngroup);
458 std::vector<Output_*> ptrs;
459 ptrs.reserve(output.size());
460 for (auto& o : output) {
461 o.resize(mydim);
462 ptrs.push_back(o.data());
463 }
464
465 apply(true, mat, group, ngroup, group_size.data(), ptrs.data(), sopt);
466 return output;
467}
468
472// Back-compatibility.
473template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
474std::vector<std::vector<Output_> > by_row(const tatami::Matrix<Value_, Index_>* p, const Group_* group, const Options& sopt) {
475 return by_row<Output_>(*p, group, sopt);
476}
477
478template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
479std::vector<std::vector<Output_> > by_row(const tatami::Matrix<Value_, Index_>& mat, const Group_* group) {
480 return by_row<Output_>(mat, group, Options());
481}
482
483template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
484std::vector<std::vector<Output_> > by_row(const tatami::Matrix<Value_, Index_>* p, const Group_* group) {
485 return by_row<Output_>(*p, group);
486}
508template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
509std::vector<std::vector<Output_> > by_column(const tatami::Matrix<Value_, Index_>& mat, const Group_* group, const Options& sopt) {
510 auto mydim = mat.ncol();
511 auto group_size = tabulate_groups(group, mat.nrow());
512 auto ngroup = group_size.size();
513
514 auto output = sanisizer::create<std::vector<std::vector<Output_> > >(ngroup);
515 std::vector<Output_*> ptrs;
516 ptrs.reserve(output.size());
517 for (auto& o : output) {
518 o.resize(mydim);
519 ptrs.push_back(o.data());
520 }
521
522 apply(false, mat, group, ngroup, group_size.data(), ptrs.data(), sopt);
523 return output;
524}
525
529// Back-compatibility.
530template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
531std::vector<std::vector<Output_> > by_column(const tatami::Matrix<Value_, Index_>* p, const Group_* group, const Options& sopt) {
532 return by_column<Output_>(*p, group, sopt);
533}
534
535template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
536std::vector<std::vector<Output_> > by_column(const tatami::Matrix<Value_, Index_>& mat, const Group_* group) {
537 return by_column<Output_>(mat, group, Options());
538}
539
540template<typename Output_ = double, typename Value_, typename Index_, typename Group_>
541std::vector<std::vector<Output_> > by_column(const tatami::Matrix<Value_, Index_>* p, const Group_* group) {
542 return by_column<Output_>(*p, group);
543}
548}
549
550}
551
552#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
void direct(const Value_ *ptr, Index_ num, const Group_ *group, std::size_t num_groups, const Index_ *group_size, Output_ *output_means, Output_ *output_variances, bool skip_nan, Index_ *valid_group_size)
Definition grouped_variances.hpp:106
void apply(bool row, const tatami::Matrix< Value_, Index_ > &mat, const Group_ *group, std::size_t num_groups, const Index_ *group_size, Output_ **output, const Options &sopt)
Definition grouped_variances.hpp:285
std::vector< std::vector< Output_ > > by_column(const tatami::Matrix< Value_, Index_ > &mat, const Group_ *group, const Options &sopt)
Definition grouped_variances.hpp:509
std::vector< std::vector< Output_ > > by_row(const tatami::Matrix< Value_, Index_ > &mat, const Group_ *group, const Options &sopt)
Definition grouped_variances.hpp:452
Functions to compute statistics from a tatami::Matrix.
Definition counts.hpp:18
std::vector< Size_ > tabulate_groups(const Group_ *group, Size_ n)
Definition utils.hpp:53
void parallelize(Function_ fun, Index_ tasks, int threads)
Container_ create_container_of_Index_size(Index_ x, Args_ &&... args)
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, bool row, Index_ iter_start, Index_ iter_length, Args_ &&... args)
bool sparse_ordered_index
Grouped summation options.
Definition grouped_variances.hpp:31
bool skip_nan
Definition grouped_variances.hpp:36
int num_threads
Definition grouped_variances.hpp:42
Utilities for computing matrix statistics.
Compute row and column variances from a tatami::Matrix.