tatami_stats
Matrix statistics for tatami
Loading...
Searching...
No Matches
medians.hpp
Go to the documentation of this file.
1#ifndef TATAMI_STATS__MEDIANS_HPP
2#define TATAMI_STATS__MEDIANS_HPP
3
4#include "tatami/tatami.hpp"
5#include "utils.hpp"
6
7#include <cmath>
8#include <vector>
9#include <algorithm>
10#include <limits>
11#include <iostream>
12
19namespace tatami_stats {
20
25namespace medians {
26
30struct Options {
35 bool skip_nan = false;
36
41 int num_threads = 1;
42};
43
47namespace internal {
48
49template<typename Value_, typename Index_>
50Index_ translocate_nans(Value_* ptr, Index_& num) {
51 Index_ pos = 0;
52 for (Index_ i = 0; i < num; ++i) {
53 if (std::isnan(ptr[i])) {
54 std::swap(ptr[i], ptr[pos]);
55 ++pos;
56 }
57 }
58 return pos;
59}
60
61}
81template<typename Output_ = double, typename Value_, typename Index_>
82Output_ direct(Value_* ptr, Index_ num, bool skip_nan) {
83 ::tatami_stats::internal::nanable_ifelse<Value_>(
84 skip_nan,
85 [&]() {
86 auto lost = internal::translocate_nans(ptr, num);
87 ptr += lost;
88 num -= lost;
89 },
90 [&]() {}
91 );
92
93 if (num == 0) {
94 return std::numeric_limits<Output_>::quiet_NaN();
95 }
96
97 size_t halfway = num / 2;
98 bool is_even = (num % 2 == 0);
99
100 std::nth_element(ptr, ptr + halfway, ptr + num);
101 Output_ medtmp = *(ptr + halfway);
102 if (!is_even) {
103 return medtmp;
104 }
105
106 // 'nth_element()' reorganizes 'ptr' so that everything below 'halfway' is
107 // less than or equal to 'ptr[halfway]', while everything above 'halfway'
108 // is greater than or equal to 'ptr[halfway]'. Thus, to get the element
109 // immediately before 'halfway' in the sort order, we just need to find the
110 // maximum from '[0, halfway)'.
111 Output_ other = *std::max_element(ptr, ptr + halfway);
112
113 // Avoid FP overflow, preserve exactness of the median if medtmp == other.
114 return medtmp + (other - medtmp)/2;
115}
116
134template<typename Output_ = double, typename Value_, typename Index_>
135Output_ direct(Value_* value, Index_ num_nonzero, Index_ num_all, bool skip_nan) {
136 // Fallback to the dense code if there are no structural zeros. This is not
137 // just for efficiency as the downstream averaging code assumes that there
138 // is at least one structural zero when considering its scenarios.
139 if (num_nonzero == num_all) {
140 return direct<Output_>(value, num_all, skip_nan);
141 }
142
143 ::tatami_stats::internal::nanable_ifelse<Value_>(
144 skip_nan,
145 [&]() {
146 auto lost = internal::translocate_nans(value, num_nonzero);
147 value += lost;
148 num_nonzero -= lost;
149 num_all -= lost;
150 },
151 [&]() {}
152 );
153
154 // Is the number of non-zeros less than the number of zeros?
155 // If so, the median must be zero. Note that we calculate it
156 // in this way to avoid overflow from 'num_nonzero * 2'.
157 if (num_nonzero < num_all - num_nonzero) {
158 return 0;
159 }
160
161 size_t halfway = num_all / 2;
162 bool is_even = (num_all % 2 == 0);
163
164 size_t num_zero = num_all - num_nonzero;
165 size_t num_negative = 0;
166 for (Index_ i = 0; i < num_nonzero; ++i) {
167 num_negative += (value[i] < 0);
168 }
169
170 if (!is_even) {
171 if (num_negative > halfway) {
172 std::nth_element(value, value + halfway, value + num_nonzero);
173 return value[halfway];
174
175 } else if (halfway >= num_negative + num_zero) {
176 size_t skip_zeros = halfway - num_zero;
177 std::nth_element(value, value + skip_zeros, value + num_nonzero);
178 return value[skip_zeros];
179
180 } else {
181 return 0;
182 }
183 }
184
185 Output_ baseline = 0, other = 0;
186 if (num_negative > halfway) { // both halves of the median are negative.
187 std::nth_element(value, value + halfway, value + num_nonzero);
188 baseline = value[halfway];
189 other = *(std::max_element(value, value + halfway)); // max_element gets the sorted value at halfway - 1, see explanation for the dense case.
190
191 } else if (num_negative == halfway) { // the upper half is guaranteed to be zero.
192 size_t below_halfway = halfway - 1;
193 std::nth_element(value, value + below_halfway, value + num_nonzero);
194 other = value[below_halfway]; // set to other so that addition/subtraction of a zero baseline has no effect on precision.
195
196 } else if (num_negative < halfway && num_negative + num_zero > halfway) { // both halves are zero, so zero is the median.
197 ;
198
199 } else if (num_negative + num_zero == halfway) { // the lower half is guaranteed to be zero.
200 size_t skip_zeros = halfway - num_zero;
201 std::nth_element(value, value + skip_zeros, value + num_nonzero);
202 other = value[skip_zeros]; // set to other so that addition/subtraction of a zero baseline has no effect on precision.
203
204 } else { // both halves of the median are non-negative.
205 size_t skip_zeros = halfway - num_zero;
206 std::nth_element(value, value + skip_zeros, value + num_nonzero);
207 baseline = value[skip_zeros];
208 other = *(std::max_element(value, value + skip_zeros)); // max_element gets the sorted value at skip_zeros - 1, see explanation for the dense case.
209 }
210
211 // Avoid FP overflow, preserve exactness of the median if baseline == other.
212 return baseline + (other - baseline) / 2;
213}
214
230template<typename Value_, typename Index_, typename Output_>
231void apply(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output, const medians::Options& mopt) {
232 auto dim = (row ? p->nrow() : p->ncol());
233 auto otherdim = (row ? p->ncol() : p->nrow());
234
235 if (p->sparse()) {
236 tatami::Options opt;
237 opt.sparse_extract_index = false;
238 opt.sparse_ordered_index = false; // we'll be sorting by value anyway.
239
240 tatami::parallelize([&](int, Index_ s, Index_ l) -> void {
241 auto ext = tatami::consecutive_extractor<true>(p, row, s, l, opt);
242 std::vector<Value_> buffer(otherdim);
243 auto vbuffer = buffer.data();
244 for (Index_ x = 0; x < l; ++x) {
245 auto range = ext->fetch(vbuffer, NULL);
246 tatami::copy_n(range.value, range.number, vbuffer);
247 output[x + s] = medians::direct<Output_>(vbuffer, range.number, otherdim, mopt.skip_nan);
248 }
249 }, dim, mopt.num_threads);
250
251 } else {
252 tatami::parallelize([&](int, Index_ s, Index_ l) -> void {
253 std::vector<Value_> buffer(otherdim);
254 auto ext = tatami::consecutive_extractor<false>(p, row, s, l);
255 for (Index_ x = 0; x < l; ++x) {
256 auto ptr = ext->fetch(buffer.data());
257 tatami::copy_n(ptr, otherdim, buffer.data());
258 output[x + s] = medians::direct<Output_>(buffer.data(), otherdim, mopt.skip_nan);
259 }
260 }, dim, mopt.num_threads);
261 }
262}
263
277template<typename Output_ = double, typename Value_, typename Index_>
278std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p, const Options& mopt) {
279 std::vector<Output_> output(p->ncol());
280 apply(false, p, output.data(), mopt);
281 return output;
282}
283
296template<typename Output_ = double, typename Value_, typename Index_>
297std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p) {
298 return by_column(p, Options());
299}
300
314template<typename Output_ = double, typename Value_, typename Index_>
315std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p, const Options& mopt) {
316 std::vector<Output_> output(p->nrow());
317 apply(true, p, output.data(), mopt);
318 return output;
319}
320
333template<typename Output_ = double, typename Value_, typename Index_>
334std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p) {
335 return by_row(p, Options());
336}
337
338}
339
340}
341
342#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
Output_ direct(Value_ *ptr, Index_ num, bool skip_nan)
Definition medians.hpp:82
void apply(bool row, const tatami::Matrix< Value_, Index_ > *p, Output_ *output, const medians::Options &mopt)
Definition medians.hpp:231
std::vector< Output_ > by_row(const tatami::Matrix< Value_, Index_ > *p, const Options &mopt)
Definition medians.hpp:315
std::vector< Output_ > by_column(const tatami::Matrix< Value_, Index_ > *p, const Options &mopt)
Definition medians.hpp:278
Functions to compute statistics from a tatami::Matrix.
Definition counts.hpp:18
void parallelize(Function_ fun, Index_ tasks, int threads)
Value_ * copy_n(const Value_ *input, Size_ n, Value_ *output)
auto consecutive_extractor(const Matrix< Value_, Index_ > *mat, bool row, Index_ iter_start, Index_ iter_length, Args_ &&... args)
bool sparse_extract_index
bool sparse_ordered_index
Median calculation options.
Definition medians.hpp:30
int num_threads
Definition medians.hpp:41
bool skip_nan
Definition medians.hpp:35
Utilities for computing matrix statistics.