tatami_stats
Matrix statistics for tatami
Loading...
Searching...
No Matches
medians.hpp
Go to the documentation of this file.
1#ifndef TATAMI_STATS__MEDIANS_HPP
2#define TATAMI_STATS__MEDIANS_HPP
3
4#include "utils.hpp"
5
6#include <cmath>
7#include <vector>
8#include <algorithm>
9#include <limits>
10
11#include "tatami/tatami.hpp"
12#include "sanisizer/sanisizer.hpp"
13
20namespace tatami_stats {
21
26namespace medians {
27
31struct Options {
36 bool skip_nan = false;
37
42 int num_threads = 1;
43};
44
48namespace internal {
49
50template<typename Value_, typename Index_>
51Index_ translocate_nans(Value_* ptr, Index_& num) {
52 Index_ pos = 0;
53 for (Index_ i = 0; i < num; ++i) {
54 if (std::isnan(ptr[i])) {
55 std::swap(ptr[i], ptr[pos]);
56 ++pos;
57 }
58 }
59 return pos;
60}
61
62}
82template<typename Output_ = double, typename Value_, typename Index_>
83Output_ direct(Value_* ptr, Index_ num, bool skip_nan) {
84 ::tatami_stats::internal::nanable_ifelse<Value_>(
85 skip_nan,
86 [&]() -> void {
87 auto lost = internal::translocate_nans(ptr, num);
88 ptr += lost;
89 num -= lost;
90 },
91 []() -> void {}
92 );
93
94 if (num == 0) {
95 return std::numeric_limits<Output_>::quiet_NaN();
96 }
97
98 Index_ halfway = num / 2;
99 bool is_even = (num % 2 == 0);
100
101 std::nth_element(ptr, ptr + halfway, ptr + num);
102 Output_ medtmp = *(ptr + halfway);
103 if (!is_even) {
104 return medtmp;
105 }
106
107 // 'nth_element()' reorganizes 'ptr' so that everything below 'halfway' is
108 // less than or equal to 'ptr[halfway]', while everything above 'halfway'
109 // is greater than or equal to 'ptr[halfway]'. Thus, to get the element
110 // immediately before 'halfway' in the sort order, we just need to find the
111 // maximum from '[0, halfway)'.
112 Output_ other = *std::max_element(ptr, ptr + halfway);
113
114 if (medtmp == other) {
115 return medtmp; // Preserve exactness, respect infinities of the same sign.
116 } else {
117 return medtmp + (other - medtmp) / 2; // Avoid FP overflow.
118 }
119}
120
138template<typename Output_ = double, typename Value_, typename Index_>
139Output_ direct(Value_* value, Index_ num_nonzero, Index_ num_all, bool skip_nan) {
140 // Fallback to the dense code if there are no structural zeros. This is not
141 // just for efficiency as the downstream averaging code assumes that there
142 // is at least one structural zero when considering its scenarios.
143 if (num_nonzero == num_all) {
144 return direct<Output_>(value, num_all, skip_nan);
145 }
146
147 ::tatami_stats::internal::nanable_ifelse<Value_>(
148 skip_nan,
149 [&]() -> void {
150 auto lost = internal::translocate_nans(value, num_nonzero);
151 value += lost;
152 num_nonzero -= lost;
153 num_all -= lost;
154 },
155 []() -> void {}
156 );
157
158 // Is the number of non-zeros less than the number of zeros?
159 // If so, the median must be zero. Note that we calculate it
160 // in this way to avoid overflow from 'num_nonzero * 2'.
161 if (num_nonzero < num_all - num_nonzero) {
162 return 0;
163 }
164
165 Index_ halfway = num_all / 2;
166 bool is_even = (num_all % 2 == 0);
167
168 Index_ num_zero = num_all - num_nonzero;
169 Index_ num_negative = 0;
170 for (Index_ i = 0; i < num_nonzero; ++i) {
171 num_negative += (value[i] < 0);
172 }
173
174 if (!is_even) {
175 if (num_negative > halfway) {
176 std::nth_element(value, value + halfway, value + num_nonzero);
177 return value[halfway];
178
179 } else if (halfway >= num_negative + num_zero) {
180 Index_ skip_zeros = halfway - num_zero;
181 std::nth_element(value, value + skip_zeros, value + num_nonzero);
182 return value[skip_zeros];
183
184 } else {
185 return 0;
186 }
187 }
188
189 Output_ baseline = 0, other = 0;
190 if (num_negative > halfway) { // both halves of the median are negative.
191 std::nth_element(value, value + halfway, value + num_nonzero);
192 baseline = value[halfway];
193 other = *(std::max_element(value, value + halfway)); // max_element gets the sorted value at halfway - 1, see explanation for the dense case.
194
195 } else if (num_negative == halfway) { // the upper half is guaranteed to be zero.
196 Index_ below_halfway = halfway - 1;
197 std::nth_element(value, value + below_halfway, value + num_nonzero);
198 other = value[below_halfway]; // set to other so that addition/subtraction of a zero baseline has no effect on precision.
199
200 } else if (num_negative < halfway && num_negative + num_zero > halfway) { // both halves are zero, so zero is the median.
201 ;
202
203 } else if (num_negative + num_zero == halfway) { // the lower half is guaranteed to be zero.
204 Index_ skip_zeros = halfway - num_zero;
205 std::nth_element(value, value + skip_zeros, value + num_nonzero);
206 other = value[skip_zeros]; // set to other so that addition/subtraction of a zero baseline has no effect on precision.
207
208 } else { // both halves of the median are non-negative.
209 Index_ skip_zeros = halfway - num_zero;
210 std::nth_element(value, value + skip_zeros, value + num_nonzero);
211 baseline = value[skip_zeros];
212 other = *(std::max_element(value, value + skip_zeros)); // max_element gets the sorted value at skip_zeros - 1, see explanation for the dense case.
213 }
214
215 if (baseline == other) {
216 return baseline; // Preserve exactness, respect infinities of the same sign.
217 } else {
218 return baseline + (other - baseline) / 2; // Avoid FP overflow.
219 }
220}
221
237template<typename Value_, typename Index_, typename Output_>
238void apply(bool row, const tatami::Matrix<Value_, Index_>& mat, Output_* output, const medians::Options& mopt) {
239 auto dim = (row ? mat.nrow() : mat.ncol());
240 auto otherdim = (row ? mat.ncol() : mat.nrow());
241
242 if (mat.sparse()) {
243 tatami::Options opt;
244 opt.sparse_extract_index = false;
245 opt.sparse_ordered_index = false; // we'll be sorting by value anyway.
246
247 tatami::parallelize([&](int, Index_ s, Index_ l) -> void {
248 auto ext = tatami::consecutive_extractor<true>(mat, row, s, l, opt);
250 auto vbuffer = buffer.data();
251 for (Index_ x = 0; x < l; ++x) {
252 auto range = ext->fetch(vbuffer, NULL);
253 tatami::copy_n(range.value, range.number, vbuffer);
254 output[x + s] = medians::direct<Output_>(vbuffer, range.number, otherdim, mopt.skip_nan);
255 }
256 }, dim, mopt.num_threads);
257
258 } else {
259 tatami::parallelize([&](int, Index_ s, Index_ l) -> void {
261 auto ext = tatami::consecutive_extractor<false>(mat, row, s, l);
262 for (Index_ x = 0; x < l; ++x) {
263 auto ptr = ext->fetch(buffer.data());
264 tatami::copy_n(ptr, otherdim, buffer.data());
265 output[x + s] = medians::direct<Output_>(buffer.data(), otherdim, mopt.skip_nan);
266 }
267 }, dim, mopt.num_threads);
268 }
269}
270
274// Back-compatibility.
275template<typename Value_, typename Index_, typename Output_>
276void apply(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output, const medians::Options& mopt) {
277 apply(row, *p, output, mopt);
278}
296template<typename Output_ = double, typename Value_, typename Index_>
297std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>& mat, const Options& mopt) {
299 apply(false, mat, output.data(), mopt);
300 return output;
301}
302
306// Back-compatibility.
307template<typename Output_ = double, typename Value_, typename Index_>
308std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p, const Options& mopt) {
309 return by_column<Output_>(*p, mopt);
310}
311
312template<typename Output_ = double, typename Value_, typename Index_>
313std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>& mat) {
314 return by_column<Output_>(mat, Options());
315}
316
317template<typename Output_ = double, typename Value_, typename Index_>
318std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p) {
319 return by_column<Output_>(*p);
320}
338template<typename Output_ = double, typename Value_, typename Index_>
339std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>& mat, const Options& mopt) {
341 apply(true, mat, output.data(), mopt);
342 return output;
343}
344
348// Back-compatibility.
349template<typename Output_ = double, typename Value_, typename Index_>
350std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p, const Options& mopt) {
351 return by_row<Output_>(*p, mopt);
352}
353
354template<typename Output_ = double, typename Value_, typename Index_>
355std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>& mat) {
356 return by_row<Output_>(mat, Options());
357}
358
359template<typename Output_ = double, typename Value_, typename Index_>
360std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p) {
361 return by_row<Output_>(*p);
362}
367}
368
369}
370
371#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
std::vector< Output_ > by_row(const tatami::Matrix< Value_, Index_ > &mat, const Options &mopt)
Definition medians.hpp:339
Output_ direct(Value_ *ptr, Index_ num, bool skip_nan)
Definition medians.hpp:83
void apply(bool row, const tatami::Matrix< Value_, Index_ > &mat, Output_ *output, const medians::Options &mopt)
Definition medians.hpp:238
std::vector< Output_ > by_column(const tatami::Matrix< Value_, Index_ > &mat, const Options &mopt)
Definition medians.hpp:297
Functions to compute statistics from a tatami::Matrix.
Definition counts.hpp:18
void parallelize(Function_ fun, Index_ tasks, int threads)
Container_ create_container_of_Index_size(Index_ x, Args_ &&... args)
Value_ * copy_n(const Value_ *input, Size_ n, Value_ *output)
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, bool row, Index_ iter_start, Index_ iter_length, Args_ &&... args)
bool sparse_extract_index
bool sparse_ordered_index
Median calculation options.
Definition medians.hpp:31
int num_threads
Definition medians.hpp:42
bool skip_nan
Definition medians.hpp:36
Utilities for computing matrix statistics.