tatami_stats
Matrix statistics for tatami
Loading...
Searching...
No Matches
medians.hpp
Go to the documentation of this file.
1#ifndef TATAMI_STATS__MEDIANS_HPP
2#define TATAMI_STATS__MEDIANS_HPP
3
4#include "tatami/tatami.hpp"
5#include "utils.hpp"
6
7#include <cmath>
8#include <vector>
9#include <algorithm>
10#include <limits>
11#include <iostream>
12
19namespace tatami_stats {
20
25namespace medians {
26
30struct Options {
35 bool skip_nan = false;
36
41 int num_threads = 1;
42};
43
47namespace internal {
48
49template<typename Value_, typename Index_>
50Index_ translocate_nans(Value_* ptr, Index_& num) {
51 Index_ pos = 0;
52 for (Index_ i = 0; i < num; ++i) {
53 if (std::isnan(ptr[i])) {
54 std::swap(ptr[i], ptr[pos]);
55 ++pos;
56 }
57 }
58 return pos;
59}
60
61}
81template<typename Output_ = double, typename Value_, typename Index_>
82Output_ direct(Value_* ptr, Index_ num, bool skip_nan) {
83 ::tatami_stats::internal::nanable_ifelse<Value_>(
84 skip_nan,
85 [&]() {
86 auto lost = internal::translocate_nans(ptr, num);
87 ptr += lost;
88 num -= lost;
89 },
90 [&]() {}
91 );
92
93 if (num == 0) {
94 return std::numeric_limits<Output_>::quiet_NaN();
95 }
96
97 size_t halfway = num / 2;
98 bool is_even = (num % 2 == 0);
99
100 std::nth_element(ptr, ptr + halfway, ptr + num);
101 Output_ medtmp = *(ptr + halfway);
102 if (!is_even) {
103 return medtmp;
104 }
105
106 // 'nth_element()' reorganizes 'ptr' so that everything below 'halfway' is
107 // less than or equal to 'ptr[halfway]', while everything above 'halfway'
108 // is greater than or equal to 'ptr[halfway]'. Thus, to get the element
109 // immediately before 'halfway' in the sort order, we just need to find the
110 // maximum from '[0, halfway)'.
111 Output_ other = *std::max_element(ptr, ptr + halfway);
112
113 if (medtmp == other) {
114 return medtmp; // Preserve exactness, respect infinities of the same sign.
115 } else {
116 return medtmp + (other - medtmp) / 2; // Avoid FP overflow.
117 }
118}
119
137template<typename Output_ = double, typename Value_, typename Index_>
138Output_ direct(Value_* value, Index_ num_nonzero, Index_ num_all, bool skip_nan) {
139 // Fallback to the dense code if there are no structural zeros. This is not
140 // just for efficiency as the downstream averaging code assumes that there
141 // is at least one structural zero when considering its scenarios.
142 if (num_nonzero == num_all) {
143 return direct<Output_>(value, num_all, skip_nan);
144 }
145
146 ::tatami_stats::internal::nanable_ifelse<Value_>(
147 skip_nan,
148 [&]() {
149 auto lost = internal::translocate_nans(value, num_nonzero);
150 value += lost;
151 num_nonzero -= lost;
152 num_all -= lost;
153 },
154 [&]() {}
155 );
156
157 // Is the number of non-zeros less than the number of zeros?
158 // If so, the median must be zero. Note that we calculate it
159 // in this way to avoid overflow from 'num_nonzero * 2'.
160 if (num_nonzero < num_all - num_nonzero) {
161 return 0;
162 }
163
164 size_t halfway = num_all / 2;
165 bool is_even = (num_all % 2 == 0);
166
167 size_t num_zero = num_all - num_nonzero;
168 size_t num_negative = 0;
169 for (Index_ i = 0; i < num_nonzero; ++i) {
170 num_negative += (value[i] < 0);
171 }
172
173 if (!is_even) {
174 if (num_negative > halfway) {
175 std::nth_element(value, value + halfway, value + num_nonzero);
176 return value[halfway];
177
178 } else if (halfway >= num_negative + num_zero) {
179 size_t skip_zeros = halfway - num_zero;
180 std::nth_element(value, value + skip_zeros, value + num_nonzero);
181 return value[skip_zeros];
182
183 } else {
184 return 0;
185 }
186 }
187
188 Output_ baseline = 0, other = 0;
189 if (num_negative > halfway) { // both halves of the median are negative.
190 std::nth_element(value, value + halfway, value + num_nonzero);
191 baseline = value[halfway];
192 other = *(std::max_element(value, value + halfway)); // max_element gets the sorted value at halfway - 1, see explanation for the dense case.
193
194 } else if (num_negative == halfway) { // the upper half is guaranteed to be zero.
195 size_t below_halfway = halfway - 1;
196 std::nth_element(value, value + below_halfway, value + num_nonzero);
197 other = value[below_halfway]; // set to other so that addition/subtraction of a zero baseline has no effect on precision.
198
199 } else if (num_negative < halfway && num_negative + num_zero > halfway) { // both halves are zero, so zero is the median.
200 ;
201
202 } else if (num_negative + num_zero == halfway) { // the lower half is guaranteed to be zero.
203 size_t skip_zeros = halfway - num_zero;
204 std::nth_element(value, value + skip_zeros, value + num_nonzero);
205 other = value[skip_zeros]; // set to other so that addition/subtraction of a zero baseline has no effect on precision.
206
207 } else { // both halves of the median are non-negative.
208 size_t skip_zeros = halfway - num_zero;
209 std::nth_element(value, value + skip_zeros, value + num_nonzero);
210 baseline = value[skip_zeros];
211 other = *(std::max_element(value, value + skip_zeros)); // max_element gets the sorted value at skip_zeros - 1, see explanation for the dense case.
212 }
213
214 if (baseline == other) {
215 return baseline; // Preserve exactness, respect infinities of the same sign.
216 } else {
217 return baseline + (other - baseline) / 2; // Avoid FP overflow.
218 }
219}
220
236template<typename Value_, typename Index_, typename Output_>
237void apply(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output, const medians::Options& mopt) {
238 auto dim = (row ? p->nrow() : p->ncol());
239 auto otherdim = (row ? p->ncol() : p->nrow());
240
241 if (p->sparse()) {
242 tatami::Options opt;
243 opt.sparse_extract_index = false;
244 opt.sparse_ordered_index = false; // we'll be sorting by value anyway.
245
246 tatami::parallelize([&](int, Index_ s, Index_ l) -> void {
247 auto ext = tatami::consecutive_extractor<true>(p, row, s, l, opt);
248 std::vector<Value_> buffer(otherdim);
249 auto vbuffer = buffer.data();
250 for (Index_ x = 0; x < l; ++x) {
251 auto range = ext->fetch(vbuffer, NULL);
252 tatami::copy_n(range.value, range.number, vbuffer);
253 output[x + s] = medians::direct<Output_>(vbuffer, range.number, otherdim, mopt.skip_nan);
254 }
255 }, dim, mopt.num_threads);
256
257 } else {
258 tatami::parallelize([&](int, Index_ s, Index_ l) -> void {
259 std::vector<Value_> buffer(otherdim);
260 auto ext = tatami::consecutive_extractor<false>(p, row, s, l);
261 for (Index_ x = 0; x < l; ++x) {
262 auto ptr = ext->fetch(buffer.data());
263 tatami::copy_n(ptr, otherdim, buffer.data());
264 output[x + s] = medians::direct<Output_>(buffer.data(), otherdim, mopt.skip_nan);
265 }
266 }, dim, mopt.num_threads);
267 }
268}
269
283template<typename Output_ = double, typename Value_, typename Index_>
284std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p, const Options& mopt) {
285 std::vector<Output_> output(p->ncol());
286 apply(false, p, output.data(), mopt);
287 return output;
288}
289
302template<typename Output_ = double, typename Value_, typename Index_>
303std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p) {
304 return by_column(p, Options());
305}
306
320template<typename Output_ = double, typename Value_, typename Index_>
321std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p, const Options& mopt) {
322 std::vector<Output_> output(p->nrow());
323 apply(true, p, output.data(), mopt);
324 return output;
325}
326
339template<typename Output_ = double, typename Value_, typename Index_>
340std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p) {
341 return by_row(p, Options());
342}
343
344}
345
346}
347
348#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
Output_ direct(Value_ *ptr, Index_ num, bool skip_nan)
Definition medians.hpp:82
void apply(bool row, const tatami::Matrix< Value_, Index_ > *p, Output_ *output, const medians::Options &mopt)
Definition medians.hpp:237
std::vector< Output_ > by_row(const tatami::Matrix< Value_, Index_ > *p, const Options &mopt)
Definition medians.hpp:321
std::vector< Output_ > by_column(const tatami::Matrix< Value_, Index_ > *p, const Options &mopt)
Definition medians.hpp:284
Functions to compute statistics from a tatami::Matrix.
Definition counts.hpp:18
void parallelize(Function_ fun, Index_ tasks, int threads)
Value_ * copy_n(const Value_ *input, Size_ n, Value_ *output)
auto consecutive_extractor(const Matrix< Value_, Index_ > *mat, bool row, Index_ iter_start, Index_ iter_length, Args_ &&... args)
bool sparse_extract_index
bool sparse_ordered_index
Median calculation options.
Definition medians.hpp:30
int num_threads
Definition medians.hpp:41
bool skip_nan
Definition medians.hpp:35
Utilities for computing matrix statistics.