38void transpose(
const Input_* 
const input, 
const std::size_t nrow, 
const std::size_t ncol, 
const std::size_t input_stride, Output_* 
const output, 
const std::size_t output_stride) {
 
   39    if ((nrow == 1 && output_stride == 1) || (ncol == 1 && input_stride == 1)) {
 
   40        std::copy_n(input, nrow * ncol, output);
 
   46    constexpr std::size_t block = 16;
 
   47    std::size_t col_start = 0;
 
   48    while (col_start < ncol) {
 
   49        std::size_t col_end = col_start + std::min(block, ncol - col_start);
 
   51        std::size_t row_start = 0;
 
   52        while (row_start < nrow) {
 
   53            std::size_t row_end = row_start + std::min(block, nrow - row_start);
 
   54            for (std::size_t c = col_start; c < col_end; ++c) {
 
   55                for (std::size_t r = row_start; r < row_end; ++r) {
 
   56                    output[c * output_stride + r] = input[r * input_stride + c];
 
 
void transpose(const Input_ *const input, const std::size_t nrow, const std::size_t ncol, const std::size_t input_stride, Output_ *const output, const std::size_t output_stride)
Definition transpose.hpp:38