tatami_stats/medians_8hpp_source.html

#ifndef TATAMI_STATS__MEDIANS_HPP

#define TATAMI_STATS__MEDIANS_HPP


#include "utils.hpp"


#include <cmath>

#include <vector>

#include <algorithm>

#include <limits>


#include "tatami/tatami.hpp"

#include "sanisizer/sanisizer.hpp"


namespace tatami_stats {


namespace medians {


/**
 * @brief Median calculation options.
 */
struct Options {
    /**
     * Forwarded as the `skip_nan` argument of `direct()` by `apply()`.
     * When true, NaN values are excluded before the median is computed.
     */
    bool skip_nan{false};

    /**
     * Thread count handed to `tatami::parallelize()` by `apply()`.
     */
    int num_threads{1};
};


namespace internal {


/**
 * Move every NaN in `ptr[0, num)` to the front of the array by swapping.
 *
 * @param ptr Array of values; reordered in place.
 * @param num Number of elements in `ptr`. Taken by reference but never modified here.
 * @return Number of NaNs found, i.e., the length of the leading all-NaN stretch.
 */
template<typename Value_, typename Index_>
Index_ translocate_nans(Value_* ptr, Index_& num) {
    Index_ num_nan = 0;
    for (Index_ i = 0; i < num; ++i) {
        if (!std::isnan(ptr[i])) {
            continue;
        }
        // Everything before 'num_nan' is already NaN, so swap this one into the next free slot.
        std::swap(ptr[i], ptr[num_nan]);
        ++num_nan;
    }
    return num_nan;
}


}

/**
 * Compute the median of a dense array of values.
 *
 * @tparam Output_ Type of the returned median.
 * @tparam Value_ Type of the input values.
 * @tparam Index_ Integer type for the array length.
 *
 * @param ptr Array of length `num`. Its contents are reordered by this function
 * (NaN translocation and `std::nth_element`).
 * @param num Number of elements in `ptr`.
 * @param skip_nan Whether NaNs should be excluded before computing the median.
 *
 * @return The median of the (non-skipped) values, or the quiet NaN of `Output_`
 * if no values remain.
 */
template<typename Output_ = double, typename Value_, typename Index_>
Output_ direct(Value_* ptr, Index_ num, bool skip_nan) {
    // NOTE(review): nanable_ifelse() lives in utils.hpp; presumably it invokes the
    // first lambda only when 'skip_nan' is set and 'Value_' can represent NaN.
    ::tatami_stats::internal::nanable_ifelse<Value_>(
        skip_nan,
        [&]() -> void {
            // NaNs are shuffled to the front, so we just step over them.
            const auto num_nan = internal::translocate_nans(ptr, num);
            ptr += num_nan;
            num -= num_nan;
        },
        []() -> void {}
    );

    if (num == 0) {
        return std::numeric_limits<Output_>::quiet_NaN();
    }

    const Index_ mid = num / 2;
    auto mid_ptr = ptr + mid;
    std::nth_element(ptr, mid_ptr, ptr + num);
    const Output_ upper = *mid_ptr;

    // Odd length: the middle element is the median.
    if (num % 2 != 0) {
        return upper;
    }

    // After nth_element(), everything in '[ptr, mid_ptr)' is no greater than
    // '*mid_ptr', so the largest of those is the element that would sit
    // immediately before the midpoint in a full sort.
    const Output_ lower = *std::max_element(ptr, mid_ptr);

    if (upper == lower) {
        return upper; // Preserve exactness, respect infinities of the same sign.
    }
    return upper + (lower - upper) / 2; // Avoid FP overflow.
}


/**
 * Compute the median of a sparse vector, given only its non-zero values and
 * the total number of elements (the remainder being structural zeros).
 *
 * @tparam Output_ Type of the returned median.
 * @tparam Value_ Type of the input values.
 * @tparam Index_ Integer type for the counts.
 *
 * @param value Array of length `num_nonzero` holding the non-zero values.
 * Its contents are reordered by this function (NaN translocation and
 * `std::nth_element`).
 * @param num_nonzero Number of entries in `value`.
 * @param num_all Total length of the vector, including structural zeros.
 * @param skip_nan Whether NaNs in `value` should be excluded before computing
 * the median.
 *
 * @return The median of the (non-skipped) values and the structural zeros.
 * When `num_nonzero == num_all`, the result is that of the dense overload.
 */
template<typename Output_ = double, typename Value_, typename Index_>


Output_ direct(Value_* value, Index_ num_nonzero, Index_ num_all, bool skip_nan) {

    // Fallback to the dense code if there are no structural zeros. This is not

    // just for efficiency as the downstream averaging code assumes that there

    // is at least one structural zero when considering its scenarios.

    if (num_nonzero == num_all) {

        return direct<Output_>(value, num_all, skip_nan);

    }


    // NOTE(review): nanable_ifelse() lives in utils.hpp; presumably it invokes the
    // first lambda only when 'skip_nan' is set and 'Value_' can represent NaN.
    ::tatami_stats::internal::nanable_ifelse<Value_>(

        skip_nan,

        [&]() -> void {

            // NaNs are moved to the front and then dropped from both counts,
            // leaving the number of structural zeros unchanged.
            auto lost = internal::translocate_nans(value, num_nonzero);

            value += lost;

            num_nonzero -= lost;

            num_all -= lost;

        },

        []() -> void {}

    );


    // Is the number of non-zeros less than the number of zeros?

    // If so, the median must be zero. Note that we calculate it

    // in this way to avoid overflow from 'num_nonzero * 2'.

    if (num_nonzero < num_all - num_nonzero) {

        return 0;

    }


    // 'halfway' is a position in the conceptual fully sorted vector, i.e.,
    // negatives, then zeros, then the remaining non-zero values.
    Index_ halfway = num_all / 2;

    bool is_even = (num_all % 2 == 0);


    Index_ num_zero = num_all - num_nonzero;

    Index_ num_negative = 0;

    for (Index_ i = 0; i < num_nonzero; ++i) {

        num_negative += (value[i] < 0);

    }


    // Odd length: the median is the single element at sorted position 'halfway'.
    if (!is_even) {

        if (num_negative > halfway) {

            // The midpoint falls among the negative values, so positions in
            // 'value' match positions in the full sorted vector.
            std::nth_element(value, value + halfway, value + num_nonzero);

            return value[halfway];


        } else if (halfway >= num_negative + num_zero) {

            // The midpoint falls after the run of zeros; subtract the zeros to
            // get the corresponding position within 'value'.
            Index_ skip_zeros = halfway - num_zero;

            std::nth_element(value, value + skip_zeros, value + num_nonzero);

            return value[skip_zeros];


        } else {

            // The midpoint falls inside the run of structural zeros.
            return 0;

        }

    }


    // Even length: the median is the mean of the elements at sorted positions
    // 'halfway - 1' and 'halfway'. Either may be left at zero if it falls
    // inside the run of structural zeros.
    Output_ baseline = 0, other = 0;

    if (num_negative > halfway) { // both halves of the median are negative.

        std::nth_element(value, value + halfway, value + num_nonzero);

        baseline = value[halfway];

        other = *(std::max_element(value, value + halfway)); // max_element gets the sorted value at halfway - 1, see explanation for the dense case.


    } else if (num_negative == halfway) { // the upper half is guaranteed to be zero.

        Index_ below_halfway = halfway - 1;

        std::nth_element(value, value + below_halfway, value + num_nonzero);

        other = value[below_halfway]; // set to other so that addition/subtraction of a zero baseline has no effect on precision.


    } else if (num_negative < halfway && num_negative + num_zero > halfway) { // both halves are zero, so zero is the median.

        ;


    } else if (num_negative + num_zero == halfway) { // the lower half is guaranteed to be zero.

        Index_ skip_zeros = halfway - num_zero;

        std::nth_element(value, value + skip_zeros, value + num_nonzero);

        other = value[skip_zeros]; // set to other so that addition/subtraction of a zero baseline has no effect on precision.


    } else { // both halves of the median are non-negative.

        Index_ skip_zeros = halfway - num_zero;

        std::nth_element(value, value + skip_zeros, value + num_nonzero);

        baseline = value[skip_zeros];

        other = *(std::max_element(value, value + skip_zeros)); // max_element gets the sorted value at skip_zeros - 1, see explanation for the dense case.

    }


    if (baseline == other) {

        return baseline; // Preserve exactness, respect infinities of the same sign.

    } else {

        return baseline + (other - baseline) / 2; // Avoid FP overflow.

    }

}


/**
 * Compute the median of each row or column of a matrix.
 *
 * @tparam Value_ Type of the matrix values.
 * @tparam Index_ Integer type for the row/column indices.
 * @tparam Output_ Type of the output medians.
 *
 * @param row Whether to compute the median for each row (true) or each column (false).
 * @param mat The input matrix.
 * @param[out] output Array with one entry per row (if `row = true`) or per column
 * (otherwise). On return, entry `i` holds the median of the `i`-th row/column.
 * @param mopt Median calculation options; `skip_nan` is forwarded to `direct()` and
 * `num_threads` to `tatami::parallelize()`.
 */
template<typename Value_, typename Index_, typename Output_>
void apply(bool row, const tatami::Matrix<Value_, Index_>& mat, Output_* output, const medians::Options& mopt) {
    auto dim = (row ? mat.nrow() : mat.ncol());
    auto otherdim = (row ? mat.ncol() : mat.nrow());

    if (mat.sparse()) {
        tatami::Options opt;
        opt.sparse_extract_index = false;
        opt.sparse_ordered_index = false; // we'll be sorting by value anyway.

        tatami::parallelize([&](int, Index_ s, Index_ l) -> void {
            auto ext = tatami::consecutive_extractor<true>(mat, row, s, l, opt);
            auto buffer = tatami::create_container_of_Index_size<std::vector<Value_> >(otherdim);
            auto vbuffer = buffer.data();
            for (Index_ x = 0; x < l; ++x) {
                // No index buffer is supplied, as index extraction is disabled above.
                // 'nullptr' is used instead of the 'NULL' macro, which needs a header
                // (e.g., <cstddef>) that this file does not include directly.
                auto range = ext->fetch(vbuffer, nullptr);

                // direct() reorders its input, so make sure the values sit in our own buffer.
                tatami::copy_n(range.value, range.number, vbuffer);
                output[x + s] = medians::direct<Output_>(vbuffer, range.number, otherdim, mopt.skip_nan);
            }
        }, dim, mopt.num_threads);

    } else {
        tatami::parallelize([&](int, Index_ s, Index_ l) -> void {
            auto buffer = tatami::create_container_of_Index_size<std::vector<Value_> >(otherdim);
            auto ext = tatami::consecutive_extractor<false>(mat, row, s, l);
            for (Index_ x = 0; x < l; ++x) {
                auto ptr = ext->fetch(buffer.data());

                // direct() reorders its input, so make sure the values sit in our own buffer.
                tatami::copy_n(ptr, otherdim, buffer.data());
                output[x + s] = medians::direct<Output_>(buffer.data(), otherdim, mopt.skip_nan);
            }
        }, dim, mopt.num_threads);
    }
}


// Back-compatibility: pointer-accepting overload that dereferences 'p'
// and forwards to the reference-accepting apply().
template<typename Value_, typename Index_, typename Output_>
void apply(bool row, const tatami::Matrix<Value_, Index_>* p, Output_* output, const medians::Options& mopt) {
    const auto& mat = *p;
    apply(row, mat, output, mopt);
}

/**
 * Compute the median of each column of a matrix.
 *
 * @tparam Output_ Type of the output medians.
 * @tparam Value_ Type of the matrix values.
 * @tparam Index_ Integer type for the row/column indices.
 *
 * @param mat The input matrix.
 * @param mopt Median calculation options.
 *
 * @return Vector of length `mat.ncol()`, filled by `apply()` with `row = false`.
 */
template<typename Output_ = double, typename Value_, typename Index_>
std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>& mat, const Options& mopt) {
    const bool row = false;
    auto col_medians = tatami::create_container_of_Index_size<std::vector<Output_> >(mat.ncol());
    apply(row, mat, col_medians.data(), mopt);
    return col_medians;
}


// Back-compatibility: pointer-accepting overload that dereferences 'p'
// and forwards to the reference-accepting by_column().
template<typename Output_ = double, typename Value_, typename Index_>
std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p, const Options& mopt) {
    const auto& mat = *p;
    return by_column<Output_>(mat, mopt);
}


// Overload of by_column() that uses default-constructed Options.
template<typename Output_ = double, typename Value_, typename Index_>
std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>& mat) {
    const Options defaults{};
    return by_column<Output_>(mat, defaults);
}


// Pointer-accepting overload of by_column() without options; dereferences 'p'
// and forwards to the reference-accepting overload.
template<typename Output_ = double, typename Value_, typename Index_>
std::vector<Output_> by_column(const tatami::Matrix<Value_, Index_>* p) {
    const auto& mat = *p;
    return by_column<Output_>(mat);
}

/**
 * Compute the median of each row of a matrix.
 *
 * @tparam Output_ Type of the output medians.
 * @tparam Value_ Type of the matrix values.
 * @tparam Index_ Integer type for the row/column indices.
 *
 * @param mat The input matrix.
 * @param mopt Median calculation options.
 *
 * @return Vector of length `mat.nrow()`, filled by `apply()` with `row = true`.
 */
template<typename Output_ = double, typename Value_, typename Index_>
std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>& mat, const Options& mopt) {
    const bool row = true;
    auto row_medians = tatami::create_container_of_Index_size<std::vector<Output_> >(mat.nrow());
    apply(row, mat, row_medians.data(), mopt);
    return row_medians;
}


// Back-compatibility: pointer-accepting overload that dereferences 'p'
// and forwards to the reference-accepting by_row().
template<typename Output_ = double, typename Value_, typename Index_>
std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p, const Options& mopt) {
    const auto& mat = *p;
    return by_row<Output_>(mat, mopt);
}


// Overload of by_row() that uses default-constructed Options.
template<typename Output_ = double, typename Value_, typename Index_>
std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>& mat) {
    const Options defaults{};
    return by_row<Output_>(mat, defaults);
}


// Pointer-accepting overload of by_row() without options; dereferences 'p'
// and forwards to the reference-accepting overload.
template<typename Output_ = double, typename Value_, typename Index_>
std::vector<Output_> by_row(const tatami::Matrix<Value_, Index_>* p) {
    const auto& mat = *p;
    return by_row<Output_>(mat);
}

}


}


#endif

tatami::Matrix

tatami::Matrix::ncol
virtual Index_ ncol() const=0

tatami::Matrix::nrow
virtual Index_ nrow() const=0

tatami::Matrix::sparse
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0

tatami_stats::medians::by_row
std::vector< Output_ > by_row(const tatami::Matrix< Value_, Index_ > &mat, const Options &mopt)
Definition medians.hpp:339

tatami_stats::medians::direct
Output_ direct(Value_ *ptr, Index_ num, bool skip_nan)
Definition medians.hpp:83

tatami_stats::medians::apply
void apply(bool row, const tatami::Matrix< Value_, Index_ > &mat, Output_ *output, const medians::Options &mopt)
Definition medians.hpp:238

tatami_stats::medians::by_column
std::vector< Output_ > by_column(const tatami::Matrix< Value_, Index_ > &mat, const Options &mopt)
Definition medians.hpp:297

tatami_stats
Functions to compute statistics from a tatami::Matrix.
Definition counts.hpp:18

tatami::parallelize
void parallelize(Function_ fun, Index_ tasks, int threads)

tatami::create_container_of_Index_size
Container_ create_container_of_Index_size(Index_ x, Args_ &&... args)

tatami::copy_n
Value_ * copy_n(const Value_ *input, Size_ n, Value_ *output)

tatami::consecutive_extractor
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, bool row, Index_ iter_start, Index_ iter_length, Args_ &&... args)

tatami::Options

tatami::Options::sparse_extract_index
bool sparse_extract_index

tatami::Options::sparse_ordered_index
bool sparse_ordered_index

tatami_stats::medians::Options
Median calculation options.
Definition medians.hpp:31

tatami_stats::medians::Options::num_threads
int num_threads
Definition medians.hpp:42

tatami_stats::medians::Options::skip_nan
bool skip_nan
Definition medians.hpp:36

tatami.hpp

utils.hpp
Utilities for computing matrix statistics.