From 7f11225c061482ec5a2f0afb1e3f60bdeaaab6c8 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Fri, 7 Oct 2022 18:35:46 +0200 Subject: [PATCH 01/25] Integrate accumulate_into_selected into raft prims --- cpp/bench/CMakeLists.txt | 1 + cpp/bench/common/benchmark.hpp | 3 +- cpp/bench/linalg/reduce_rows_by_key.cu | 88 ++++++ .../raft/linalg/detail/reduce_rows_by_key.cuh | 280 ++++++++---------- .../raft/linalg/reduce_rows_by_key.cuh | 65 ++-- .../knn/detail/ann_kmeans_balanced.cuh | 103 ++++++- .../raft/spatial/knn/detail/ann_utils.cuh | 6 +- .../spatial/knn/detail/ivf_flat_build.cuh | 3 +- cpp/test/linalg/reduce_rows_by_key.cu | 2 +- 9 files changed, 349 insertions(+), 202 deletions(-) create mode 100644 cpp/bench/linalg/reduce_rows_by_key.cu diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt index 51170e4265..7551bff830 100644 --- a/cpp/bench/CMakeLists.txt +++ b/cpp/bench/CMakeLists.txt @@ -96,6 +96,7 @@ if(BUILD_BENCH) bench/linalg/add.cu bench/linalg/map_then_reduce.cu bench/linalg/matrix_vector_op.cu + bench/linalg/reduce_rows_by_key.cu bench/linalg/reduce.cu bench/main.cpp ) diff --git a/cpp/bench/common/benchmark.hpp b/cpp/bench/common/benchmark.hpp index adfe5218e2..13ca40a033 100644 --- a/cpp/bench/common/benchmark.hpp +++ b/cpp/bench/common/benchmark.hpp @@ -53,8 +53,9 @@ struct using_pool_memory_res { rmm::mr::set_current_device_resource(&pool_res_); } - using_pool_memory_res() : using_pool_memory_res(size_t(1) << size_t(30), size_t(16) << size_t(30)) + using_pool_memory_res() : orig_res_(rmm::mr::get_current_device_resource()), pool_res_(&cuda_res_) { + rmm::mr::set_current_device_resource(&pool_res_); } ~using_pool_memory_res() { rmm::mr::set_current_device_resource(orig_res_); } diff --git a/cpp/bench/linalg/reduce_rows_by_key.cu b/cpp/bench/linalg/reduce_rows_by_key.cu new file mode 100644 index 0000000000..075bc7c8c4 --- /dev/null +++ b/cpp/bench/linalg/reduce_rows_by_key.cu @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <common/benchmark.hpp>
+#include <raft/linalg/reduce_rows_by_key.cuh>
+#include <raft/random/rng.cuh>
+
+#include <rmm/device_uvector.hpp>
+
+namespace raft::bench::linalg {
+
+struct rrbk_params {
+  int64_t rows, cols;
+  int64_t keys;
+};
+
+template <typename T, typename KeyT>
+struct reduce_rows_by_key : public fixture {
+  reduce_rows_by_key(const rrbk_params& p)
+    : params(p),
+      in(p.rows * p.cols, stream),
+      out(p.keys * p.cols, stream),
+      keys(p.rows, stream),
+      workspace(p.rows, stream)
+  {
+    raft::random::RngState rng{42};
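+    // Keys are drawn uniformly from [0, keys), so every output row accumulates
+    // roughly rows/keys input rows on average.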
+    raft::random::uniformInt(rng, keys.data(), p.rows, (KeyT)0, (KeyT)p.keys, stream);
+  }
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    loop_on_state(state, [this]() {
+      raft::linalg::reduce_rows_by_key(in.data(),
+                                       params.cols,
+                                       keys.data(),
+                                       workspace.data(),
+                                       params.rows,
+                                       params.cols,
+                                       params.keys,
+                                       out.data(),
+                                       stream,
+                                       false);
+    });
+  }
+
+ protected:
+  rrbk_params params;
+  rmm::device_uvector<T> in, out;
+  rmm::device_uvector<KeyT> keys;
+  rmm::device_uvector<char> workspace;
+};  // struct reduce_rows_by_key
+
+const std::vector<rrbk_params> kInputSizes{
+  {10000, 128, 64},
+  {100000, 128, 64},
+  {1000000, 128, 64},
+  {10000000, 128, 64},
+  {10000, 128, 256},
+  {100000, 128, 256},
+  {1000000, 128, 256},
+  {10000000, 128, 256},
+  {10000, 128, 1024},
+  {100000, 128, 1024},
+  {1000000, 128, 1024},
+  {10000000, 128, 1024},
+  {10000, 128, 4096},
+  {100000, 128, 4096},
+  {1000000, 128, 4096},
+  {10000000, 128, 4096},
+};
+
+RAFT_BENCH_REGISTER((reduce_rows_by_key<float, uint32_t>), "", kInputSizes);
+RAFT_BENCH_REGISTER((reduce_rows_by_key<double, uint32_t>), "", kInputSizes);
+
+}  // namespace raft::bench::linalg
diff --git a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
index 9ddcbae20b..56c1142aff 100644
--- a/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
+++ b/cpp/include/raft/linalg/detail/reduce_rows_by_key.cuh
@@ -92,48 +92,48 @@ struct quadSum {
 #define SUM_ROWS_SMALL_K_DIMX 256
 #define SUM_ROWS_BY_KEY_SMALL_K_MAX_K 4
 
-template <typename DataIteratorT, typename WeightT>
+template <typename DataIteratorT, typename WeightT, typename SumsT, typename IdxT>
 __launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4)
-  __global__ void sum_rows_by_key_small_nkeys_kernel(const DataIteratorT* d_A,
-                                                     int lda,
+  __global__ void sum_rows_by_key_small_nkeys_kernel(const DataIteratorT d_A,
+                                                     IdxT lda,
                                                      const char* d_keys,
                                                      const WeightT* d_weights,
-                                                     int nrows,
-                                                     int ncols,
-                                                     int nkeys,
-                                                     DataIteratorT* d_sums)
+                                                     IdxT nrows,
+                                                     IdxT ncols,
+                                                     IdxT nkeys,
+                                                     SumsT* d_sums)
 {
-  typedef typename std::iterator_traits<DataIteratorT*>::value_type DataType;
-  typedef cub::BlockReduce<quad<DataType>, SUM_ROWS_SMALL_K_DIMX> BlockReduce;
+  typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
+  typedef cub::BlockReduce<quad<SumsT>, SUM_ROWS_SMALL_K_DIMX> BlockReduce;
   __shared__ typename BlockReduce::TempStorage temp_storage;
 
-  for (int idim = static_cast<int>(blockIdx.y); idim < ncols; idim += gridDim.y) {
-    if (idim != static_cast<int>(blockIdx.y)) __syncthreads();  // we're reusing temp_storage
+  for (IdxT idim = static_cast<IdxT>(blockIdx.y); idim < ncols; idim += gridDim.y) {
+    if (idim != static_cast<IdxT>(blockIdx.y)) __syncthreads();  // we're reusing temp_storage
 
     // threadIdx.x stores partial sum for current dim and key=threadIdx.x in this reg
-    quad<DataType> thread_sums;
+    quad<SumsT> thread_sums;
     thread_sums.x = 0.0;
     thread_sums.y = 0.0;
     thread_sums.z = 0.0;
     thread_sums.w = 0.0;
 
     // May use vectorized load - not necessary for doubles
-    for (int block_offset_irow = blockIdx.x * blockDim.x;
+    for (IdxT block_offset_irow = blockIdx.x * blockDim.x;
          block_offset_irow < nrows;  // we will syncthreads() inside the loop, no CTA divergence
          block_offset_irow += blockDim.x * gridDim.x) {
-      int irow     = block_offset_irow + threadIdx.x;
+      IdxT irow    = block_offset_irow + threadIdx.x;
       DataType val = (irow < nrows) ? d_A[irow * lda + idim] : 0.0;
       if (d_weights && irow < nrows) { val = val * d_weights[irow]; }
       // we are not reusing the keys - after profiling
       // d_keys is mainly loaded from L2, and this kernel is DRAM BW bounded
       // (experimentation gave a 10% speed up - not worth the many code lines added)
-      int row_key = (irow < nrows) ? d_keys[irow] : -1;
+      IdxT row_key = (irow < nrows) ? d_keys[irow] : -1;
 
-      thread_sums.x += (row_key == 0) ? val : 0.0;
-      thread_sums.y += (row_key == 1) ? val : 0.0;
-      thread_sums.z += (row_key == 2) ? val : 0.0;
-      thread_sums.w += (row_key == 3) ? val : 0.0;
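+      // Accumulate into the one quad slot matching this row's key (at most
+      // SUM_ROWS_BY_KEY_SMALL_K_MAX_K = 4 keys on this path); slots whose key does
+      // not match, and padding rows with row_key == -1, contribute 0.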
+      thread_sums.x += (row_key == 0) ? static_cast<SumsT>(val) : 0.0;
+      thread_sums.y += (row_key == 1) ? static_cast<SumsT>(val) : 0.0;
+      thread_sums.z += (row_key == 2) ? static_cast<SumsT>(val) : 0.0;
+      thread_sums.w += (row_key == 3) ? static_cast<SumsT>(val) : 0.0;
     }
 
     // End of column
@@ -142,12 +142,12 @@ __launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4)
     // Strided access
 
     // Reducing by key
-    thread_sums = BlockReduce(temp_storage).Reduce(thread_sums, quadSum<DataType>());
+    thread_sums = BlockReduce(temp_storage).Reduce(thread_sums, quadSum<SumsT>());
 
     if (threadIdx.x < 32) {
       // We only need 4
       thread_sums = cub::ShuffleIndex<32>(thread_sums, 0, 0xffffffff);
-      if (static_cast<int>(threadIdx.x) < nkeys) {
+      if (static_cast<IdxT>(threadIdx.x) < nkeys) {
         if (threadIdx.x == 0) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.x);
         if (threadIdx.x == 1) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.y);
         if (threadIdx.x == 2) raft::myAtomicAdd(&d_sums[threadIdx.x * ncols + idim], thread_sums.z);
@@ -157,22 +157,22 @@ __launch_bounds__(SUM_ROWS_SMALL_K_DIMX, 4)
   }
 }
 
-template <typename DataIteratorT, typename WeightT>
-void sum_rows_by_key_small_nkeys(const DataIteratorT* d_A,
-                                 int lda,
+template <typename DataIteratorT, typename WeightT, typename SumsT, typename IdxT>
+void sum_rows_by_key_small_nkeys(const DataIteratorT d_A,
+                                 IdxT lda,
                                  const char* d_keys,
                                  const WeightT* d_weights,
-                                 int nrows,
-                                 int ncols,
-                                 int nkeys,
-                                 DataIteratorT* d_sums,
+                                 IdxT nrows,
+                                 IdxT ncols,
+                                 IdxT nkeys,
+                                 SumsT* d_sums,
                                  cudaStream_t st)
 {
   dim3 grid, block;
   block.x = SUM_ROWS_SMALL_K_DIMX;
   block.y = 1;  // Necessary
 
-  grid.x = raft::ceildiv(nrows, (int)block.x);
+  grid.x = raft::ceildiv(nrows, (IdxT)block.x);
   grid.x = std::min(grid.x, 32u);
   grid.y = ncols;
   grid.y = std::min(grid.y, MAX_BLOCKS);
@@ -188,45 +188,49 @@ void sum_rows_by_key_small_nkeys(const DataIteratorT* d_A,
 
 #define SUM_ROWS_BY_KEY_LARGE_K_MAX_K 1024
 
-template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
-__global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT* d_A,
-                                                            int lda,
+template <typename DataIteratorT,
+          typename KeysIteratorT,
+          typename WeightT,
+          typename SumsT,
+          typename IdxT>
+__global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT d_A,
+                                                            IdxT lda,
                                                             KeysIteratorT d_keys,
                                                             const WeightT* d_weights,
-                                                            int nrows,
-                                                            int ncols,
+                                                            IdxT nrows,
+                                                            IdxT ncols,
                                                             int key_offset,
-                                                            int nkeys,
-                                                            DataIteratorT* d_sums)
+                                                            IdxT nkeys,
+                                                            SumsT* d_sums)
 {
   typedef typename std::iterator_traits<KeysIteratorT>::value_type KeyType;
-  typedef typename std::iterator_traits<DataIteratorT*>::value_type DataType;
-  __shared__ DataType local_sums[SUM_ROWS_BY_KEY_LARGE_K_MAX_K];
+  typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
+  __shared__ SumsT local_sums[SUM_ROWS_BY_KEY_LARGE_K_MAX_K];
 
-  for (int local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x)
+  for (IdxT local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x)
     local_sums[local_key] = 0.0;
 
-  for (int idim = blockIdx.y; idim < ncols; idim += gridDim.y) {
+  for (IdxT idim = blockIdx.y; idim < ncols; idim += gridDim.y) {
     __syncthreads();  // local_sums
 
     // At this point local_sums is full of zeros
 
-    for (int irow = blockIdx.x * blockDim.x + threadIdx.x; irow < nrows;
+    for (IdxT irow = blockIdx.x * blockDim.x + threadIdx.x; irow < nrows;
          irow += blockDim.x * gridDim.x) {
       // Branch div in this loop - not an issue with current code
       DataType val = d_A[idim * lda + irow];
       if (d_weights) val = val * d_weights[irow];
 
-      int local_key = d_keys[irow] - key_offset;
+      IdxT local_key = d_keys[irow] - key_offset;
 
       // We could load next val here
-      raft::myAtomicAdd(&local_sums[local_key], val);
+      raft::myAtomicAdd(&local_sums[local_key], static_cast<SumsT>(val));
     }
 
     __syncthreads();  // local_sums
 
-    for (int local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x) {
-      DataType local_sum = local_sums[local_key];
+    for (IdxT local_key = threadIdx.x; local_key < nkeys; local_key += blockDim.x) {
+      SumsT local_sum = local_sums[local_key];
 
       if (local_sum != 0.0) {
         KeyType global_key = key_offset + local_key;
@@ -237,22 +241,22 @@ __global__ void sum_rows_by_key_large_nkeys_kernel_colmajor(const DataIteratorT*
     }
   }
 }
 
-template <typename DataIteratorT, typename KeysIteratorT>
-void sum_rows_by_key_large_nkeys_colmajor(const DataIteratorT* d_A,
-                                          int lda,
+template <typename DataIteratorT, typename KeysIteratorT, typename SumsT, typename IdxT>
+void sum_rows_by_key_large_nkeys_colmajor(const DataIteratorT d_A,
+                                          IdxT lda,
                                           KeysIteratorT d_keys,
-                                          int nrows,
-                                          int ncols,
+                                          IdxT nrows,
+                                          IdxT ncols,
                                           int key_offset,
-                                          int nkeys,
-                                          DataIteratorT* d_sums,
+                                          IdxT nkeys,
+                                          SumsT* d_sums,
                                           cudaStream_t st)
 {
   dim3 grid, block;
   block.x = SUM_ROWS_SMALL_K_DIMX;
   block.y = 1;  // Necessary
 
-  grid.x = raft::ceildiv(nrows, (int)block.x);
+  grid.x = raft::ceildiv(nrows, (IdxT)block.x);
   grid.x = std::min(grid.x, 32u);
   grid.y = ncols;
   grid.y = std::min(grid.y, MAX_BLOCKS);
@@ -260,91 +264,47 @@ void sum_rows_by_key_large_nkeys_colmajor(const DataIteratorT* d_A,
     d_A, lda, d_keys, nrows, ncols, key_offset, nkeys, d_sums);
 }
 
-#define RRBK_SHMEM_SZ 32
-
-//#define RRBK_SHMEM
-template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
-__global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT* d_A,
-                                                            int lda,
+template <typename DataIteratorT,
+          typename KeysIteratorT,
+          typename WeightT,
+          typename SumsT,
+          typename IdxT>
+__global__ void sum_rows_by_key_large_nkeys_kernel_rowmajor(const DataIteratorT d_A,
+                                                            IdxT lda,
                                                             const WeightT* d_weights,
                                                             KeysIteratorT d_keys,
-                                                            int nrows,
-                                                            int ncols,
-                                                            int key_offset,
-                                                            int nkeys,
-                                                            DataIteratorT* d_sums)
+                                                            IdxT nrows,
+                                                            IdxT ncols,
+                                                            SumsT* d_sums)
 {
-  typedef typename std::iterator_traits<KeysIteratorT>::value_type KeyType;
-  typedef typename std::iterator_traits<DataIteratorT*>::value_type DataType;
-
-#ifdef RRBK_SHMEM
-  __shared__ KeyType sh_keys[RRBK_SHMEM_SZ];
-#endif
-  int rows_per_partition = nrows / gridDim.z + 1;
-  int start_row          = blockIdx.z * rows_per_partition;
-  int end_row            = start_row + rows_per_partition;
-  end_row                = end_row > nrows ? nrows : end_row;
-
-  KeyType local_key = blockIdx.y;
-  if (local_key >= nkeys) return;
-  int this_col = threadIdx.x + blockIdx.x * blockDim.x;
-  if (this_col >= ncols) return;
-
-  DataType sum       = 0.0;
-  KeyType global_key = key_offset + local_key;
-#ifdef RRBK_SHMEM
-  int sh_key_inx = 0;
-#endif
-  for (int r = start_row; r < end_row; r++) {
-#ifdef RRBK_SHMEM
-    if (0 == sh_key_inx % RRBK_SHMEM_SZ) {
-      for (int x = threadIdx.x; x < RRBK_SHMEM_SZ; x += blockDim.x)
-        sh_keys[x] = d_keys[r + x];
-      __syncthreads();
-    }
-    if (sh_keys[sh_key_inx] != global_key) continue;  // No divergence since global_key is the
-                                                      // same for the whole block
-    sh_key_inx++;
-#else
-    if (d_keys[r] != global_key)
-      continue;  // No divergence since global_key is the
-                 // same for the whole block
-#endif
-    // if ((end_row-start_row) / (r-start_row) != global_key) continue;
-    DataType val = __ldcg(&d_A[r * lda + this_col]);
-    if (d_weights) { val = val * d_weights[r]; }
-    sum += val;
-  }
-
-  if (sum != 0.0) raft::myAtomicAdd(&d_sums[global_key * ncols + this_col], sum);
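+  // One thread per matrix element: recover (row i, column j) from the flat thread id,
+  // look up the row's key l, and atomically add the (optionally weighted) value into
+  // row l of the output.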
+  IdxT gid = threadIdx.x + (blockDim.x * static_cast<IdxT>(blockIdx.x));
+  IdxT j   = gid % ncols;
+  IdxT i   = gid / ncols;
+  if (i >= nrows) return;
+  IdxT l    = static_cast<IdxT>(d_keys[i]);
+  SumsT val = d_A[j + lda * i];
+  if (d_weights != nullptr) val *= d_weights[i];
+  raft::myAtomicAdd(&d_sums[j + ncols * l], val);
 }
 
-template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
-void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT* d_A,
-                                          int lda,
-                                          KeysIteratorT d_keys,
+template <typename DataIteratorT,
+          typename KeysIteratorT,
+          typename WeightT,
+          typename SumsT,
+          typename IdxT>
+void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT d_A,
+                                          IdxT lda,
+                                          const KeysIteratorT d_keys,
                                           const WeightT* d_weights,
-                                          int nrows,
-                                          int ncols,
-                                          int key_offset,
-                                          int nkeys,
-                                          DataIteratorT* d_sums,
+                                          IdxT nrows,
+                                          IdxT ncols,
+                                          SumsT* d_sums,
                                           cudaStream_t st)
 {
-  // x-dim refers to the column in the input data
-  // y-dim refers to the key
-  // z-dim refers to a partitioning of the rows among the threadblocks
-  dim3 grid, block;
-  block.x = 256;  // Adjust me!
-  block.y = 1;    // Don't adjust me!
-  grid.x  = raft::ceildiv(ncols, (int)block.x);
-  grid.y  = nkeys;
-  grid.z  = std::max(40960000 / nkeys / ncols, (int)1);  // Adjust me!
-  grid.z  = std::min(grid.z, (unsigned int)nrows);
-  grid.z  = std::min(grid.z, MAX_BLOCKS);
-
-  sum_rows_by_key_large_nkeys_kernel_rowmajor<<<grid, block, 0, st>>>(
-    d_A, lda, d_weights, d_keys, nrows, ncols, key_offset, nkeys, d_sums);
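+  // Flat 1D launch covering all nrows * ncols elements, one thread per element.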
+  uint32_t block_dim = 128;
+  auto grid_dim      = static_cast<uint32_t>(ceildiv(nrows * ncols, (IdxT)block_dim));
+  sum_rows_by_key_large_nkeys_kernel_rowmajor<<<grid_dim, block_dim, 0, st>>>(
+    d_A, lda, d_weights, d_keys, nrows, ncols, d_sums);
 }
 
 /**
@@ -354,6 +314,8 @@ void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT* d_A,
  *                     (may be a simple pointer type)
  * @tparam KeysIteratorT Random-access iterator type, for reading input keys
  *                     (may be a simple pointer type)
+ * @tparam SumsT Type of the output sums
+ * @tparam IdxT Index type
  *
 * @param[in]  d_A    Input data array (lda x nrows)
 * @param[in]  lda    Real row size for input data, d_A
@@ -365,26 +327,31 @@ void sum_rows_by_key_large_nkeys_rowmajor(const DataIteratorT* d_A,
 * @param[in]  nkeys  Number of unique keys in d_keys
 * @param[out] d_sums Row sums by key (ncols x d_keys)
 * @param[in]  stream CUDA stream
+ * @param[in]  reset_sums Whether to reset the output sums to zero before reducing
  */
-template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
-void reduce_rows_by_key(const DataIteratorT* d_A,
-                        int lda,
+template <typename DataIteratorT,
+          typename KeysIteratorT,
+          typename WeightT,
+          typename SumsT,
+          typename IdxT>
+void reduce_rows_by_key(const DataIteratorT d_A,
+                        IdxT lda,
                         KeysIteratorT d_keys,
                         const WeightT* d_weights,
                         char* d_keys_char,
-                        int nrows,
-                        int ncols,
-                        int nkeys,
-                        DataIteratorT* d_sums,
-                        cudaStream_t stream)
+                        IdxT nrows,
+                        IdxT ncols,
+                        IdxT nkeys,
+                        SumsT* d_sums,
+                        cudaStream_t stream,
+                        bool reset_sums)
 {
   typedef typename std::iterator_traits<KeysIteratorT>::value_type KeyType;
-  typedef typename std::iterator_traits<DataIteratorT*>::value_type DataType;
 
   // Following kernel needs memset
-  cudaMemsetAsync(d_sums, 0, ncols * nkeys * sizeof(DataType), stream);
+  if (reset_sums) { cudaMemsetAsync(d_sums, 0, ncols * nkeys * sizeof(SumsT), stream); }
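+  // Dispatch: the specialized 4-key kernel needs the d_keys_char scratch buffer to
+  // convert the keys; callers that pass nullptr always take the general row-major path.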
 
-  if (nkeys <= SUM_ROWS_BY_KEY_SMALL_K_MAX_K) {
+  if (d_keys_char != nullptr && nkeys <= SUM_ROWS_BY_KEY_SMALL_K_MAX_K) {
    // sum_rows_by_key_small_k is BW bounded. d_keys is loaded ncols times - avoiding wasting BW
    // with doubles we have ~20% speed up - with floats we can hope something around 2x
    // Converting d_keys to char
@@ -392,12 +359,7 @@ void reduce_rows_by_key(const DataIteratorT* d_A,
     sum_rows_by_key_small_nkeys(
       d_A, lda, d_keys_char, d_weights, nrows, ncols, nkeys, d_sums, stream);
   } else {
-    for (KeyType key_offset = 0; key_offset < static_cast<KeyType>(nkeys);
-         key_offset += SUM_ROWS_BY_KEY_LARGE_K_MAX_K) {
-      KeyType this_call_nkeys = std::min(SUM_ROWS_BY_KEY_LARGE_K_MAX_K, nkeys);
-      sum_rows_by_key_large_nkeys_rowmajor(
-        d_A, lda, d_keys, d_weights, nrows, ncols, key_offset, this_call_nkeys, d_sums, stream);
-    }
+    sum_rows_by_key_large_nkeys_rowmajor(d_A, lda, d_keys, d_weights, nrows, ncols, d_sums, stream);
   }
 }
 
@@ -407,6 +369,8 @@
  * @tparam DataIteratorT Random-access iterator type, for reading input data (may be a simple
  *                       pointer type)
  * @tparam KeysIteratorT Random-access iterator type, for reading input keys (may be a simple
  *                       pointer type)
+ * @tparam SumsT Type of the output sums
+ * @tparam IdxT Index type
 * @param[in]  d_A    Input data array (lda x nrows)
 * @param[in]  lda    Real row size for input data, d_A
 * @param[in]  d_keys Keys for each row (1 x nrows)
 * @param[out] d_keys_char Scratch memory for conversion of keys to char
 * @param[in]  nrows  Number of rows in d_A
 * @param[in]  ncols  Number of data columns in d_A
 * @param[in]  nkeys  Number of unique keys in d_keys
 * @param[out] d_sums Row sums by key (ncols x d_keys)
 * @param[in]  stream CUDA stream
  */
-template <typename DataIteratorT, typename KeysIteratorT>
-void reduce_rows_by_key(const DataIteratorT* d_A,
-                        int lda,
+template <typename DataIteratorT, typename KeysIteratorT, typename SumsT, typename IdxT>
+void reduce_rows_by_key(const DataIteratorT d_A,
+                        IdxT lda,
                         KeysIteratorT d_keys,
                         char* d_keys_char,
-                        int nrows,
-                        int ncols,
-                        int nkeys,
-                        DataIteratorT* d_sums,
-                        cudaStream_t stream)
+                        IdxT nrows,
+                        IdxT ncols,
+                        IdxT nkeys,
+                        SumsT* d_sums,
+                        cudaStream_t stream,
+                        bool reset_sums)
 {
-  typedef typename std::iterator_traits<DataIteratorT*>::value_type DataType;
+  typedef typename std::iterator_traits<DataIteratorT>::value_type DataType;
   reduce_rows_by_key(d_A,
                      lda,
                      d_keys,
@@ -438,7 +403,8 @@ void reduce_rows_by_key(const DataIteratorT* d_A,
                      ncols,
                      nkeys,
                      d_sums,
-                     stream);
+                     stream,
+                     reset_sums);
 }
 
 };  // end namespace detail
diff --git a/cpp/include/raft/linalg/reduce_rows_by_key.cuh b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
index 39c54e8b0c..1dabd92087 100644
--- a/cpp/include/raft/linalg/reduce_rows_by_key.cuh
+++ b/cpp/include/raft/linalg/reduce_rows_by_key.cuh
@@ -43,6 +43,8 @@ void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st)
  *                     (may be a simple pointer type)
  * @tparam KeysIteratorT Random-access iterator type, for reading input keys
  *                     (may be a simple pointer type)
+ * @tparam SumsT Type of the output sums
+ * @tparam IdxT Index type
  *
 * @param[in]  d_A    Input data array (lda x nrows)
 * @param[in]  lda    Real row size for input data, d_A
@@ -54,21 +56,27 @@ void convert_array(IteratorT1 dst, IteratorT2 src, int n, cudaStream_t st)
 * @param[in]  nkeys  Number of unique keys in d_keys
 * @param[out] d_sums Row sums by key (ncols x d_keys)
 * @param[in]  stream CUDA stream
+ * @param[in]  reset_sums Whether to reset the output sums to zero before reducing
  */
-template <typename DataIteratorT, typename KeysIteratorT, typename WeightT>
-void reduce_rows_by_key(const DataIteratorT* d_A,
-                        int lda,
+template <typename DataIteratorT,
+          typename KeysIteratorT,
+          typename WeightT,
+          typename SumsT,
+          typename IdxT>
+void reduce_rows_by_key(const DataIteratorT d_A,
+                        IdxT lda,
                         const KeysIteratorT d_keys,
                         const WeightT* d_weights,
                         char* d_keys_char,
-                        int nrows,
-                        int ncols,
-                        int nkeys,
-                        DataIteratorT* d_sums,
-                        cudaStream_t stream)
+                        IdxT nrows,
+                        IdxT ncols,
+                        IdxT nkeys,
+                        SumsT* d_sums,
+                        cudaStream_t stream,
+                        bool reset_sums = true)
 {
   detail::reduce_rows_by_key(
-    d_A, lda, d_keys, d_weights, d_keys_char, nrows, ncols, nkeys, d_sums, stream);
+    d_A, lda,
d_keys, d_weights, d_keys_char, nrows, ncols, nkeys, d_sums, stream, reset_sums); } /** @@ -77,6 +85,8 @@ void reduce_rows_by_key(const DataIteratorT* d_A, * pointer type) * @tparam KeysIteratorT Random-access iterator type, for reading input keys (may be a simple * pointer type) + * @tparam SumsT Type of the output sums + * @tparam IdxT Index type * @param[in] d_A Input data array (lda x nrows) * @param[in] lda Real row size for input data, d_A * @param[in] d_keys Keys for each row (1 x nrows) @@ -86,19 +96,21 @@ void reduce_rows_by_key(const DataIteratorT* d_A, * @param[in] nkeys Number of unique keys in d_keys * @param[out] d_sums Row sums by key (ncols x d_keys) * @param[in] stream CUDA stream + * @param[in] reset_sums Whether to reset the output sums to zero before reducing */ -template -void reduce_rows_by_key(const DataIteratorT* d_A, - int lda, - KeysIteratorT d_keys, +template +void reduce_rows_by_key(const DataIteratorT d_A, + IdxT lda, + const KeysIteratorT d_keys, char* d_keys_char, - int nrows, - int ncols, - int nkeys, - DataIteratorT* d_sums, - cudaStream_t stream) + IdxT nrows, + IdxT ncols, + IdxT nkeys, + SumsT* d_sums, + cudaStream_t stream, + bool reset_sums = true) { - typedef typename std::iterator_traits::value_type DataType; + typedef typename std::iterator_traits::value_type DataType; reduce_rows_by_key(d_A, lda, d_keys, @@ -108,7 +120,8 @@ void reduce_rows_by_key(const DataIteratorT* d_A, ncols, nkeys, d_sums, - stream); + stream, + reset_sums); } /** @@ -128,9 +141,10 @@ void reduce_rows_by_key(const DataIteratorT* d_A, * @param[in] d_keys Keys for each row raft::device_vector_view (1 x nrows) * @param[out] d_sums Row sums by key raft::device_matrix_view (ncols x d_keys) * @param[in] n_unique_keys Number of unique keys in d_keys + * @param[out] d_keys_char Scratch memory for conversion of keys to char, raft::device_vector_view * @param[in] d_weights Weights for each observation in d_A raft::device_vector_view optional (1 * x nrows) - * @param[out] d_keys_char Scratch memory for conversion of keys to char, raft::device_vector_view + * @param[in] reset_sums Whether to reset the output sums to zero before reducing */ template void reduce_rows_by_key( @@ -140,7 +154,8 @@ void reduce_rows_by_key( raft::device_matrix_view d_sums, IndexType n_unique_keys, raft::device_vector_view d_keys_char, - std::optional> d_weights = std::nullopt) + std::optional> d_weights = std::nullopt, + bool reset_sums = true) { RAFT_EXPECTS(d_A.extent(0) == d_A.extent(0) && d_sums.extent(1) == n_unique_keys, "Output is not of size ncols * n_unique_keys"); @@ -158,7 +173,8 @@ void reduce_rows_by_key( d_A.extent(0), n_unique_keys, d_sums.data_handle(), - handle.get_stream()); + handle.get_stream(), + reset_sums); } else { reduce_rows_by_key(d_A.data_handle(), d_A.extent(0), @@ -168,7 +184,8 @@ void reduce_rows_by_key( d_A.extent(0), n_unique_keys, d_sums.data_handle(), - handle.get_stream()); + handle.get_stream(), + reset_sums); } } diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh index bf0df065b2..5ddc10542b 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh @@ -21,13 +21,16 @@ #include #include +#include #include #include #include #include #include #include +#include #include +#include #include #include #include @@ -231,9 +234,11 @@ constexpr inline auto calc_minibatch_size(uint32_t n_clusters, * When set to `false`, this 
function may be used to update existing centers and sizes using * the weighted average principle. * @param stream + * @param mr (optional) memory resource to use for temporary allocations */ template -void calc_centers_and_sizes(float* centers, +void calc_centers_and_sizes(const handle_t& handle, + float* centers, uint32_t* cluster_sizes, uint32_t n_clusters, uint32_t dim, @@ -241,12 +246,12 @@ void calc_centers_and_sizes(float* centers, IdxT n_rows, const LabelT* labels, bool reset_counters, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = nullptr) { - if (reset_counters) { - utils::memzero(centers, n_clusters * dim, stream); - utils::memzero(cluster_sizes, n_clusters, stream); - } else { + if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); } + + if (!reset_counters) { utils::map_along_rows( n_clusters, dim, @@ -255,13 +260,68 @@ void calc_centers_and_sizes(float* centers, [] __device__(float c, uint32_t s) -> float { return c * s; }, stream); } - utils::accumulate_into_selected(n_rows, dim, centers, cluster_sizes, dataset, labels, stream); - utils::map_along_rows( - n_clusters, - dim, + + rmm::device_uvector workspace(0, stream, mr); + rmm::device_uvector cluster_sizes_f(n_clusters, stream, mr); + float* sizes_f = cluster_sizes_f.data(); + + // If we reset the counters, we can compute directly the new sizes in cluster_sizes. + // If we don't reset, we compute in a temporary buffer and add in a separate step. + rmm::device_uvector temp_cluster_sizes(0, stream, mr); + uint32_t* temp_sizes = cluster_sizes; + if (!reset_counters) { + temp_cluster_sizes.resize(n_clusters, stream); + temp_sizes = temp_cluster_sizes.data(); + } + + utils::mapping mapping_op; + cub::TransformInputIterator, const T*> mapping_itr(dataset, + mapping_op); + + // todo(lsugy): use iterator from KV output of fusedL2NN + raft::linalg::reduce_rows_by_key(mapping_itr, + (int64_t)dim, + labels, + nullptr, + (int64_t)n_rows, + (int64_t)dim, + (int64_t)n_clusters, + centers, + stream, + reset_counters); + + // Compute weight of each cluster + // todo(lsugy): larger index type? + raft::cluster::detail::countLabels( + handle, labels, temp_sizes, (uint32_t)n_rows, (uint32_t)n_clusters, workspace); + + // Add previous sizes if necessary and cast to float + auto counting = thrust::make_counting_iterator(0); + thrust::for_each( + handle.get_thrust_policy(), counting, counting + n_clusters, [=] __device__(int idx) { + uint32_t temp_size = temp_sizes[idx]; + if (!reset_counters) { + temp_size += cluster_sizes[idx]; + cluster_sizes[idx] = temp_size; + } + sizes_f[idx] = static_cast(temp_size); + }); + + // todo(lsugy): custom iterator for float conversion? + raft::linalg::matrixVectorOp( + centers, centers, - cluster_sizes, - [] __device__(float c, uint32_t s) -> float { return s == 0 ? 
0.0f : c / float(s); }, + sizes_f, + (int64_t)dim, + (int64_t)n_clusters, + true, + false, + [=] __device__(float mat, float vec) { + if (vec == 0.0f) + return 0.0f; + else + return mat / vec; + }, stream); } @@ -627,7 +687,8 @@ void balancing_em_iters(const handle_t& handle, device_memory, dataset_norm); // M: Maximization step - calculate optimal cluster centers - calc_centers_and_sizes(cluster_centers, + calc_centers_and_sizes(handle, + cluster_centers, cluster_sizes, n_clusters, dim, @@ -635,7 +696,8 @@ void balancing_em_iters(const handle_t& handle, n_rows, cluster_labels, true, - stream); + stream, + device_memory); } } @@ -666,8 +728,17 @@ void build_clusters(const handle_t& handle, linalg::writeOnlyUnaryOp(cluster_labels, n_rows, f, stream); // update centers to match the initialized labels. - calc_centers_and_sizes( - cluster_centers, cluster_sizes, n_clusters, dim, dataset, n_rows, cluster_labels, true, stream); + calc_centers_and_sizes(handle, + cluster_centers, + cluster_sizes, + n_clusters, + dim, + dataset, + n_rows, + cluster_labels, + true, + stream, + device_memory); // run EM balancing_em_iters(handle, diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh index 8dda574314..11313b6ed5 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh @@ -112,13 +112,13 @@ struct mapping { * @{ */ template - HDI auto operator()(const S& x) -> std::enable_if_t, T> + HDI auto operator()(const S& x) const -> std::enable_if_t, T> { return x; }; template - HDI auto operator()(const S& x) -> std::enable_if_t, T> + HDI auto operator()(const S& x) const -> std::enable_if_t, T> { constexpr double kMult = config::kDivisor / config::kDivisor; if constexpr (std::is_floating_point_v) { return static_cast(x * static_cast(kMult)); } @@ -383,6 +383,8 @@ __global__ void map_along_rows_kernel( * @brief Map a binary function over a matrix and a vector element-wise, broadcasting the vector * values along rows: `m[i, j] = op(m[i,j], v[i])` * + * @todo(lsugy): replace with matrix_vector_op + * * NB: device-only function * * @tparam IdxT index type diff --git a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh index af1cb97d36..14f5ae4516 100644 --- a/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh +++ b/cpp/include/raft/spatial/knn/detail/ivf_flat_build.cuh @@ -144,7 +144,8 @@ inline auto extend(const handle_t& handle, raft::copy( list_sizes_ptr, orig_index.list_sizes().data_handle(), ext_index.list_sizes().size(), stream); - kmeans::calc_centers_and_sizes(centers_ptr, + kmeans::calc_centers_and_sizes(handle, + centers_ptr, list_sizes_ptr, n_lists, dim, diff --git a/cpp/test/linalg/reduce_rows_by_key.cu b/cpp/test/linalg/reduce_rows_by_key.cu index e575f37dd6..7b124cb7bb 100644 --- a/cpp/test/linalg/reduce_rows_by_key.cu +++ b/cpp/test/linalg/reduce_rows_by_key.cu @@ -103,7 +103,7 @@ class ReduceRowTest : public ::testing::TestWithParam> { raft::random::RngState r(params.seed); raft::random::RngState r_int(params.seed); - int nobs = params.nobs; + uint32_t nobs = params.nobs; uint32_t cols = params.cols; uint32_t nkeys = params.nkeys; uniform(handle, r, in.data(), nobs * cols, T(0.0), T(2.0 / nobs)); From 529931dc10c5d8303f6eaeff85b2f149b05b2173 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Mon, 10 Oct 2022 11:57:28 +0200 Subject: [PATCH 02/25] Remove accumulate_into_selected --- 
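Notes: `utils::accumulate_into_selected` lost its only caller in patch 01, which rewrote
`calc_centers_and_sizes` on top of the shared prims. As a sketch, the replacement in that
function boils down to (types as in patch 01: float sums, uint32_t counters):

    // keyed reduction of the (mapped) dataset rows into the cluster centers
    raft::linalg::reduce_rows_by_key(mapping_itr, (int64_t)dim, labels, nullptr,
                                     (int64_t)n_rows, (int64_t)dim, (int64_t)n_clusters,
                                     centers, stream, reset_counters);
    // histogram of the labels gives the cluster sizes
    raft::cluster::detail::countLabels(
      handle, labels, temp_sizes, (uint32_t)n_rows, (uint32_t)n_clusters, workspace);

i.e. one keyed reduction for the sums plus one histogram for the counts, so the fused
accumulate-and-count kernel is dead code and can be dropped.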
.../knn/detail/ann_kmeans_balanced.cuh | 2 - .../raft/spatial/knn/detail/ann_utils.cuh | 66 ------------------- 2 files changed, 68 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh index 5ddc10542b..d322be0c0a 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh @@ -291,7 +291,6 @@ void calc_centers_and_sizes(const handle_t& handle, reset_counters); // Compute weight of each cluster - // todo(lsugy): larger index type? raft::cluster::detail::countLabels( handle, labels, temp_sizes, (uint32_t)n_rows, (uint32_t)n_clusters, workspace); @@ -307,7 +306,6 @@ void calc_centers_and_sizes(const handle_t& handle, sizes_f[idx] = static_cast(temp_size); }); - // todo(lsugy): custom iterator for float conversion? raft::linalg::matrixVectorOp( centers, centers, diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh index 11313b6ed5..f302b3d4a0 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh @@ -259,72 +259,6 @@ inline void dots_along_rows( */ } -template -__global__ void accumulate_into_selected_kernel(IdxT n_rows, - uint32_t n_cols, - float* output, - uint32_t* selection_counters, - const T* input, - const LabelT* row_ids) -{ - IdxT gid = threadIdx.x + (blockDim.x * static_cast(blockIdx.x)); - IdxT j = gid % n_cols; - IdxT i = gid / n_cols; - if (i >= n_rows) return; - IdxT l = static_cast(row_ids[i]); - if (j == 0) { atomicAdd(&(selection_counters[l]), 1); } - atomicAdd(&(output[j + n_cols * l]), mapping{}(input[gid])); -} - -/** - * @brief Add all rows of input matrix into a selection of rows in the output matrix - * (cast and possibly scale the data input type). Count the number of times every output - * row was selected along the way. 
- * - * @tparam T element type - * @tparam IdxT index type - * @tparam LabelT label type - * - * @param n_cols number of columns in all matrices - * @param[out] output output matrix [..., n_cols] - * @param[inout] selection_counters number of occurrences of each row id in row_ids [..., n_cols] - * @param n_rows number of rows in the input - * @param[in] input row-major input matrix [n_rows, n_cols] - * @param[in] row_ids row indices in the output matrix [n_rows] - */ -template -void accumulate_into_selected(IdxT n_rows, - uint32_t n_cols, - float* output, - uint32_t* selection_counters, - const T* input, - const LabelT* row_ids, - rmm::cuda_stream_view stream) -{ - switch (check_pointer_residency(output, input, selection_counters, row_ids)) { - case pointer_residency::host_and_device: - case pointer_residency::device_only: { - uint32_t block_dim = 128; - auto grid_dim = - static_cast(ceildiv(n_rows * static_cast(n_cols), block_dim)); - accumulate_into_selected_kernel<<>>( - n_rows, n_cols, output, selection_counters, input, row_ids); - } break; - case pointer_residency::host_only: { - stream.synchronize(); - for (IdxT i = 0; i < n_rows; i++) { - IdxT l = static_cast(row_ids[i]); - selection_counters[l]++; - for (IdxT j = 0; j < n_cols; j++) { - output[j + n_cols * l] += mapping{}(input[j + n_cols * i]); - } - } - stream.synchronize(); - } break; - default: RAFT_FAIL("All pointers must reside on the same side, host or device."); - } -} - template __global__ void normalize_rows_kernel(IdxT n_rows, IdxT n_cols, float* a) { From 6a61537d08b65ec8be122227d63e8353007ce57f Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Mon, 10 Oct 2022 19:20:59 +0200 Subject: [PATCH 03/25] Replace map_along_rows with matrixVectorOp --- .../raft/linalg/detail/matrix_vector_op.cuh | 30 +++++++--- cpp/include/raft/linalg/matrix_vector_op.cuh | 39 +++++++++---- .../raft/matrix/detail/linewise_op.cuh | 58 ++++++++++--------- cpp/include/raft/matrix/matrix.cuh | 2 +- .../knn/detail/ann_kmeans_balanced.cuh | 21 +++---- .../raft/spatial/knn/detail/ann_utils.cuh | 41 ------------- 6 files changed, 93 insertions(+), 98 deletions(-) diff --git a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh index 4cfccdcaa3..d00014e56f 100644 --- a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh @@ -74,6 +74,7 @@ __global__ void matrixVectorOpKernel(Type* out, mat.store(out, idx); } +// todo(lsugy): remove this function? It isn't used. 
template void matrixVectorOpImpl(Type* out, const Type* matrix, @@ -92,10 +93,15 @@ void matrixVectorOpImpl(Type* out, RAFT_CUDA_TRY(cudaPeekAtLastError()); } -template -void matrixVectorOp(Type* out, - const Type* matrix, - const Type* vec, +template +void matrixVectorOp(OutT* out, + const MatT* matrix, + const VecT* vec, IdxType D, IdxType N, bool rowMajor, @@ -109,11 +115,17 @@ void matrixVectorOp(Type* out, out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec); } -template -void matrixVectorOp(Type* out, - const Type* matrix, - const Type* vec1, - const Type* vec2, +template +void matrixVectorOp(OutT* out, + const MatT* matrix, + const Vec1T* vec1, + const Vec2T* vec2, IdxType D, IdxType N, bool rowMajor, diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 1438a09bd3..f9e9f370ab 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -35,8 +35,10 @@ namespace linalg { * Note : the function will also check that the size of the window of accesses * is a multiple of the number of elements processed by a thread in order to * enable faster processing - * @tparam Type the matrix/vector type + * @tparam OutT the output type * @tparam Lambda a device function which represents a binary operator + * @tparam MatT the input matrix type + * @tparam VecT the input vector type * @tparam IdxType Integer type used to for addressing * @tparam TPB threads per block of the cuda kernel launched * @param out the output matrix (passing out = matrix makes it in-place) @@ -50,10 +52,15 @@ namespace linalg { * @param op the mathematical operation * @param stream cuda stream where to launch work */ -template -void matrixVectorOp(Type* out, - const Type* matrix, - const Type* vec, +template +void matrixVectorOp(OutT* out, + const MatT* matrix, + const VecT* vec, IdxType D, IdxType N, bool rowMajor, @@ -72,7 +79,11 @@ void matrixVectorOp(Type* out, * Note : the function will also check that the size of the window of accesses * is a multiple of the number of elements processed by a thread in order to * enable faster processing - * @tparam Type the matrix/vector type + * @tparam OutT the output type + * @tparam Lambda a device function which represents a binary operator + * @tparam MatT the input matrix type + * @tparam Vec1T the first input vector type + * @tparam Vec2T the second input vector type * @tparam Lambda a device function which represents a binary operator * @tparam IdxType Integer type used to for addressing * @tparam TPB threads per block of the cuda kernel launched @@ -88,11 +99,17 @@ void matrixVectorOp(Type* out, * @param op the mathematical operation * @param stream cuda stream where to launch work */ -template -void matrixVectorOp(Type* out, - const Type* matrix, - const Type* vec1, - const Type* vec2, +template +void matrixVectorOp(OutT* out, + const MatT* matrix, + const Vec1T* vec1, + const Vec2T* vec2, IdxType D, IdxType N, bool rowMajor, diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh index 15f5204382..1be5dbc91c 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -76,7 +76,7 @@ struct Linewise { IdxType rowDiv, IdxType rowMod, Lambda op, - Vecs... vecs) noexcept + const Vecs*... 
vecs) noexcept { constexpr IdxType warpPad = (AlignWarp::Value - 1) * VecElems; Type args[sizeof...(Vecs)]; @@ -157,11 +157,14 @@ struct Linewise { * @param [in] rowLen the length of a vector. * @return a contiguous chunk of a vector, suitable for `vectorRows`. */ - static __device__ __forceinline__ Vec loadVec(Type* shm, - const Type* p, - const IdxType blockOffset, - const IdxType rowLen) noexcept + template + static __device__ __forceinline__ raft::TxN_t loadVec( + VecT* shm, const VecT* p, const IdxType blockOffset, const IdxType rowLen) noexcept { + static_assert( + sizeof(Type) == sizeof(VecT), + "linewiseOp currently requires that the size of the matrix and vector types be the same"); + IdxType j = blockOffset + threadIdx.x; #pragma unroll VecElems for (int k = threadIdx.x; k < VecElems * BlockSize; k += BlockSize, j += BlockSize) { @@ -171,8 +174,9 @@ struct Linewise { } __syncthreads(); { - Vec out; - *out.vectorized_data() = reinterpret_cast(shm)[threadIdx.x]; + raft::TxN_t out; + *out.vectorized_data() = + reinterpret_cast::io_t*>(shm)[threadIdx.x]; return out; } } @@ -209,7 +213,7 @@ __global__ void __launch_bounds__(BlockSize) const IdxType len, const IdxType elemsPerThread, Lambda op, - Vecs... vecs) + const Vecs*... vecs) { typedef Linewise L; @@ -253,7 +257,7 @@ __global__ void __launch_bounds__(MaxOffset, 2) const IdxType rowLen, const IdxType len, Lambda op, - Vecs... vecs) + const Vecs*... vecs) { // Note, L::VecElems == 1 typedef Linewise L; @@ -309,7 +313,7 @@ __global__ void __launch_bounds__(BlockSize) const IdxType rowLen, const IdxType len, Lambda op, - Vecs... vecs) + const Vecs*... vecs) { typedef Linewise L; constexpr uint workSize = L::VecElems * BlockSize; @@ -322,7 +326,7 @@ __global__ void __launch_bounds__(BlockSize) reinterpret_cast(in), L::AlignElems::div(len), op, - (workOffset ^= workSize, L::loadVec(shm + workOffset, vecs, blockOffset, rowLen))...); + (workOffset ^= workSize, L::loadVec((Vecs*)(shm + workOffset), vecs, blockOffset, rowLen))...); } /** @@ -351,7 +355,7 @@ __global__ void __launch_bounds__(MaxOffset, 2) const IdxType rowLen, const IdxType len, Lambda op, - Vecs... vecs) + const Vecs*... vecs) { // Note, L::VecElems == 1 constexpr uint workSize = MaxOffset; @@ -360,20 +364,21 @@ __global__ void __launch_bounds__(MaxOffset, 2) typedef Linewise L; if (blockIdx.x == 0) { // first block: offset = 0, length = arrOffset - L::vectorRows(reinterpret_cast(out), - reinterpret_cast(in), - arrOffset, - op, - (workOffset ^= workSize, L::loadVec(shm + workOffset, vecs, 0, rowLen))...); + L::vectorRows( + reinterpret_cast(out), + reinterpret_cast(in), + arrOffset, + op, + (workOffset ^= workSize, L::loadVec((Vecs*)(shm + workOffset), vecs, 0, rowLen))...); } else { // second block: offset = arrTail, length = len - arrTail // NB: I substract MaxOffset (= blockDim.x) to get the correct indexing for block 1 - L::vectorRows( - reinterpret_cast(out + arrTail - MaxOffset), - reinterpret_cast(in + arrTail - MaxOffset), - len - arrTail + MaxOffset, - op, - (workOffset ^= workSize, L::loadVec(shm + workOffset, vecs, arrTail % rowLen, rowLen))...); + L::vectorRows(reinterpret_cast(out + arrTail - MaxOffset), + reinterpret_cast(in + arrTail - MaxOffset), + len - arrTail + MaxOffset, + op, + (workOffset ^= workSize, + L::loadVec((Vecs*)(shm + workOffset), vecs, arrTail % rowLen, rowLen))...); } } @@ -409,7 +414,7 @@ void matrixLinewiseVecCols(Type* out, const IdxType nRows, Lambda op, cudaStream_t stream, - Vecs... vecs) + const Vecs*... 
vecs) { typedef raft::Pow2 AlignBytes; constexpr std::size_t VecElems = VecBytes / sizeof(Type); @@ -456,7 +461,7 @@ void matrixLinewiseVecRows(Type* out, const IdxType nRows, Lambda op, cudaStream_t stream, - Vecs... vecs) + const Vecs*... vecs) { typedef raft::Pow2 AlignBytes; constexpr std::size_t VecElems = VecBytes / sizeof(Type); @@ -527,8 +532,9 @@ struct MatrixLinewiseOp { const bool alongLines, Lambda op, cudaStream_t stream, - Vecs... vecs) + const Vecs*... vecs) { + // todo(lsugy): add test with different vector types if constexpr (VecBytes > sizeof(Type)) { if (!raft::Pow2::areSameAlignOffsets(in, out)) return MatrixLinewiseOp> 1), sizeof(Type)), BlockSize>::run( diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.cuh index 3a7e0dad47..cd6c4fa219 100644 --- a/cpp/include/raft/matrix/matrix.cuh +++ b/cpp/include/raft/matrix/matrix.cuh @@ -289,7 +289,7 @@ void linewiseOp(m_t* out, const bool alongLines, Lambda op, cudaStream_t stream, - Vecs... vecs) + const Vecs*... vecs) { common::nvtx::range fun_scope("linewiseOp-%c-%zu (%zu, %zu)", alongLines ? 'l' : 'x', diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh index d322be0c0a..8167827dfb 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh @@ -252,18 +252,19 @@ void calc_centers_and_sizes(const handle_t& handle, if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); } if (!reset_counters) { - utils::map_along_rows( - n_clusters, - dim, + raft::linalg::matrixVectorOp( + centers, centers, cluster_sizes, - [] __device__(float c, uint32_t s) -> float { return c * s; }, + (int64_t)dim, + (int64_t)n_clusters, + true, + false, + [=] __device__(float c, uint32_t s) -> float { return c * s; }, stream); } rmm::device_uvector workspace(0, stream, mr); - rmm::device_uvector cluster_sizes_f(n_clusters, stream, mr); - float* sizes_f = cluster_sizes_f.data(); // If we reset the counters, we can compute directly the new sizes in cluster_sizes. // If we don't reset, we compute in a temporary buffer and add in a separate step. 
@@ -295,6 +296,7 @@ void calc_centers_and_sizes(const handle_t& handle, handle, labels, temp_sizes, (uint32_t)n_rows, (uint32_t)n_clusters, workspace); // Add previous sizes if necessary and cast to float + // todo(lsugy): replace with add wrapped in if auto counting = thrust::make_counting_iterator(0); thrust::for_each( handle.get_thrust_policy(), counting, counting + n_clusters, [=] __device__(int idx) { @@ -303,19 +305,18 @@ void calc_centers_and_sizes(const handle_t& handle, temp_size += cluster_sizes[idx]; cluster_sizes[idx] = temp_size; } - sizes_f[idx] = static_cast(temp_size); }); raft::linalg::matrixVectorOp( centers, centers, - sizes_f, + cluster_sizes, (int64_t)dim, (int64_t)n_clusters, true, false, - [=] __device__(float mat, float vec) { - if (vec == 0.0f) + [=] __device__(float mat, uint32_t vec) { + if (vec == 0u) return 0.0f; else return mat / vec; diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh index f302b3d4a0..7b26ccfb42 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh @@ -302,47 +302,6 @@ inline void normalize_rows(IdxT n_rows, IdxT n_cols, float* a, rmm::cuda_stream_ normalize_rows_kernel<<>>(n_rows, n_cols, a); } -template -__global__ void map_along_rows_kernel( - IdxT n_rows, uint32_t n_cols, float* a, const uint32_t* d, Lambda map) -{ - IdxT gid = threadIdx.x + blockDim.x * static_cast(blockIdx.x); - IdxT i = gid / n_cols; - if (i >= n_rows) return; - float& x = a[gid]; - x = map(x, d[i]); -} - -/** - * @brief Map a binary function over a matrix and a vector element-wise, broadcasting the vector - * values along rows: `m[i, j] = op(m[i,j], v[i])` - * - * @todo(lsugy): replace with matrix_vector_op - * - * NB: device-only function - * - * @tparam IdxT index type - * @tparam Lambda - * - * @param n_rows - * @param n_cols - * @param[inout] m device pointer to a row-major matrix [n_rows, n_cols] - * @param[in] v device pointer to a vector [n_rows] - * @param op the binary operation to apply on every element of matrix rows and of the vector - */ -template -inline void map_along_rows(IdxT n_rows, - uint32_t n_cols, - float* m, - const uint32_t* v, - Lambda op, - rmm::cuda_stream_view stream) -{ - dim3 threads(128, 1, 1); - dim3 blocks(ceildiv(n_rows * n_cols, threads.x), 1, 1); - map_along_rows_kernel<<>>(n_rows, n_cols, m, v, op); -} - template __global__ void outer_add_kernel(const T* a, IdxT len_a, const T* b, IdxT len_b, T* c) { From 0a98482936f2153918df98c7740fedfc91e28a0d Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Tue, 18 Oct 2022 14:56:37 +0200 Subject: [PATCH 04/25] Start adding support for arbitrary types in linewiseOp --- .../raft/matrix/detail/linewise_op.cuh | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh index 1be5dbc91c..9b9a7d2967 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -284,6 +284,15 @@ __global__ void __launch_bounds__(MaxOffset, 2) vecs...); } +/** Helper function to get the largest type from a variadic list of types */ +template +constexpr size_t maxSizeOf() +{ + size_t maxSize = 0; + ((maxSize = std::max(maxSize, sizeof(Types))), ...); + return maxSize; +} + /** * This kernel prepares the inputs for the `vectorRows` function where the most of the * work happens; see `vectorRows` for details. 
@@ -316,10 +325,12 @@ __global__ void __launch_bounds__(BlockSize) const Vecs*... vecs) { typedef Linewise L; - constexpr uint workSize = L::VecElems * BlockSize; - uint workOffset = workSize; - __shared__ __align__(sizeof(Type) * L::VecElems) - Type shm[workSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)]; + constexpr uint workSize = L::VecElems * BlockSize; + constexpr size_t maxVecItemSize = maxSizeOf(); + uint workOffset = workSize * maxVecItemSize; + __shared__ __align__( + maxVecItemSize * + L::VecElems) char shm[workSize * maxVecItemSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)]; const IdxType blockOffset = (arrOffset + BlockSize * L::VecElems * blockIdx.x) % rowLen; return L::vectorRows( reinterpret_cast(out), @@ -358,9 +369,10 @@ __global__ void __launch_bounds__(MaxOffset, 2) const Vecs*... vecs) { // Note, L::VecElems == 1 - constexpr uint workSize = MaxOffset; - uint workOffset = workSize; - __shared__ Type shm[workSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)]; + constexpr uint workSize = MaxOffset; + constexpr size_t maxVecItemSize = maxSizeOf(); + uint workOffset = workSize * maxVecItemSize; + __shared__ char shm[workSize * maxVecItemSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)]; typedef Linewise L; if (blockIdx.x == 0) { // first block: offset = 0, length = arrOffset From eafa61e542cd3af97de8a10eb56bca0c55f1c0fa Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Tue, 18 Oct 2022 14:57:33 +0200 Subject: [PATCH 05/25] Allow different types for output, matrix and vector(s) in mdspan-based API for matrixVectorOp --- cpp/include/raft/linalg/matrix_vector_op.cuh | 24 ++++++++++++-------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index f9e9f370ab..41e00b568b 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -133,7 +133,8 @@ void matrixVectorOp(OutT* out, * Note : the function will also check that the size of the window of accesses * is a multiple of the number of elements processed by a thread in order to * enable faster processing - * @tparam InValueType the data-type of the input matrices and vectors + * @tparam MatValueType the data-type of the input matrix + * @tparam VecValueType the data-type of the input vector * @tparam LayoutPolicy the layout of input and output (raft::row_major or raft::col_major) * @tparam Lambda a device function which represents a binary operator * @tparam OutElementType the data-type of the output raft::matrix_view @@ -147,15 +148,16 @@ void matrixVectorOp(OutT* out, * the rows of the matrix or columns using enum class raft::linalg::Apply * @param[in] op the mathematical operation */ -template void matrix_vector_op(const raft::handle_t& handle, - raft::device_matrix_view matrix, - raft::device_vector_view vec, + raft::device_matrix_view matrix, + raft::device_vector_view vec, raft::device_matrix_view out, Apply apply, Lambda op) @@ -194,7 +196,9 @@ void matrix_vector_op(const raft::handle_t& handle, * Note : the function will also check that the size of the window of accesses * is a multiple of the number of elements processed by a thread in order to * enable faster processing - * @tparam InValueType the data-type of the input matrices and vectors + * @tparam MatValueType the data-type of the input matrix + * @tparam Vec1ValueType the data-type of the first input vector + * @tparam Vec2ValueType the data-type of the second input vector * @tparam LayoutPolicy the layout of input and output (raft::row_major or 
raft::col_major) * @tparam Lambda a device function which represents a binary operator * @tparam OutElementType the data-type of the output raft::matrix_view @@ -209,16 +213,18 @@ void matrix_vector_op(const raft::handle_t& handle, * the rows of the matrix or columns using enum class raft::linalg::Apply * @param op the mathematical operation */ -template void matrix_vector_op(const raft::handle_t& handle, - raft::device_matrix_view matrix, - raft::device_vector_view vec1, - raft::device_vector_view vec2, + raft::device_matrix_view matrix, + raft::device_vector_view vec1, + raft::device_vector_view vec2, raft::device_matrix_view out, Apply apply, Lambda op) From 74309ed9702020adbc21b3506230ee620e156b1b Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Tue, 18 Oct 2022 14:58:35 +0200 Subject: [PATCH 06/25] Call cub histogram with signed type to avoid a warning breaking compilation --- cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh index 8167827dfb..b429d2a052 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh @@ -293,7 +293,7 @@ void calc_centers_and_sizes(const handle_t& handle, // Compute weight of each cluster raft::cluster::detail::countLabels( - handle, labels, temp_sizes, (uint32_t)n_rows, (uint32_t)n_clusters, workspace); + handle, labels, temp_sizes, (int64_t)n_rows, (int64_t)n_clusters, workspace); // Add previous sizes if necessary and cast to float // todo(lsugy): replace with add wrapped in if From df7cfb5599ac433c5408957c2dab21101e26559d Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Tue, 18 Oct 2022 14:59:39 +0200 Subject: [PATCH 07/25] Start adding support for different output/matrix/vector types in MatVecOpTest + remove redundant inputs in LinewiseTest --- cpp/test/linalg/matrix_vector_op.cu | 243 +++++++++++++--------------- cpp/test/matrix/linewise_op.cu | 1 - 2 files changed, 110 insertions(+), 134 deletions(-) diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index 2023ce4121..9fb4647eb9 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -27,7 +27,7 @@ template struct MatVecOpInputs { T tolerance; IdxType rows, cols; - bool rowMajor, bcastAlongRows, useTwoVectors; + bool rowMajor, bcastAlongRows; unsigned long long int seed; }; @@ -40,68 +40,58 @@ template // Or else, we get the following compilation error // for an extended __device__ lambda cannot have private or protected access // within its class -template +template void matrixVectorOpLaunch(const raft::handle_t& handle, - T* out, - const T* in, - const T* vec1, - const T* vec2, + OutT* out, + const MatT* in, + const Vec1T* vec1, + const Vec2T* vec2, IdxType D, IdxType N, bool rowMajor, - bool bcastAlongRows, - bool useTwoVectors) + bool bcastAlongRows) { - auto out_row_major = raft::make_device_matrix_view(out, N, D); - auto in_row_major = raft::make_device_matrix_view(in, N, D); + auto out_row_major = raft::make_device_matrix_view(out, N, D); + auto in_row_major = raft::make_device_matrix_view(in, N, D); - auto out_col_major = raft::make_device_matrix_view(out, N, D); - auto in_col_major = raft::make_device_matrix_view(in, N, D); + auto out_col_major = raft::make_device_matrix_view(out, N, D); + auto in_col_major = raft::make_device_matrix_view(in, N, 
D); auto apply = bcastAlongRows ? Apply::ALONG_ROWS : Apply::ALONG_COLUMNS; auto len = bcastAlongRows ? D : N; - auto vec1_view = raft::make_device_vector_view(vec1, len); - auto vec2_view = raft::make_device_vector_view(vec2, len); - - if (useTwoVectors) { + auto vec1_view = raft::make_device_vector_view(vec1, len); + + if constexpr (OpT::useTwoVectors) { + auto vec2_view = raft::make_device_vector_view(vec2, len); if (rowMajor) { - matrix_vector_op(handle, - in_row_major, - vec1_view, - vec2_view, - out_row_major, - apply, - [] __device__(T a, T b, T c) { return a + b + c; }); + matrix_vector_op(handle, in_row_major, vec1_view, vec2_view, out_row_major, apply, OpT{}); } else { - matrix_vector_op(handle, - in_col_major, - vec1_view, - vec2_view, - out_col_major, - - apply, - [] __device__(T a, T b, T c) { return a + b + c; }); + matrix_vector_op(handle, in_col_major, vec1_view, vec2_view, out_col_major, apply, OpT{}); } } else { if (rowMajor) { - matrix_vector_op( - handle, in_row_major, vec1_view, out_row_major, apply, [] __device__(T a, T b) { - return a + b; - }); + matrix_vector_op(handle, in_row_major, vec1_view, out_row_major, apply, OpT{}); } else { - matrix_vector_op( - handle, in_col_major, vec1_view, out_col_major, apply, [] __device__(T a, T b) { - return a + b; - }); + matrix_vector_op(handle, in_col_major, vec1_view, out_col_major, apply, OpT{}); } } } -template -class MatVecOpTest : public ::testing::TestWithParam> { +template +class MatVecOpTest : public ::testing::TestWithParam> { public: MatVecOpTest() - : params(::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), in(params.rows * params.cols, stream), out_ref(params.rows * params.cols, stream), @@ -118,10 +108,10 @@ class MatVecOpTest : public ::testing::TestWithParam> IdxType N = params.rows, D = params.cols; IdxType len = N * D; IdxType vecLen = params.bcastAlongRows ? 
D : N; - uniform(handle, r, in.data(), len, (T)-1.0, (T)1.0); - uniform(handle, r, vec1.data(), vecLen, (T)-1.0, (T)1.0); - uniform(handle, r, vec2.data(), vecLen, (T)-1.0, (T)1.0); - if (params.useTwoVectors) { + uniform(handle, r, in.data(), len, (MatT)-1.0, (MatT)1.0); + uniform(handle, r, vec1.data(), vecLen, (Vec1T)-1.0, (Vec1T)1.0); + uniform(handle, r, vec2.data(), vecLen, (Vec2T)-1.0, (Vec2T)1.0); + if constexpr (OpT::useTwoVectors) { naiveMatVec(out_ref.data(), in.data(), vec1.data(), @@ -130,7 +120,7 @@ class MatVecOpTest : public ::testing::TestWithParam> N, params.rowMajor, params.bcastAlongRows, - (T)1.0, + (OutT)1.0, stream); } else { naiveMatVec(out_ref.data(), @@ -140,19 +130,18 @@ class MatVecOpTest : public ::testing::TestWithParam> N, params.rowMajor, params.bcastAlongRows, - (T)1.0, + (OutT)1.0, stream); } - matrixVectorOpLaunch(handle, - out.data(), - in.data(), - vec1.data(), - vec2.data(), - D, - N, - params.rowMajor, - params.bcastAlongRows, - params.useTwoVectors); + matrixVectorOpLaunch(handle, + out.data(), + in.data(), + vec1.data(), + vec2.data(), + D, + N, + params.rowMajor, + params.bcastAlongRows); handle.sync_stream(); } @@ -160,87 +149,75 @@ class MatVecOpTest : public ::testing::TestWithParam> raft::handle_t handle; cudaStream_t stream; - MatVecOpInputs params; - rmm::device_uvector in, out, out_ref, vec1, vec2; + MatVecOpInputs params; + rmm::device_uvector in; + rmm::device_uvector out; + rmm::device_uvector out_ref; + rmm::device_uvector vec1; + rmm::device_uvector vec2; }; -const std::vector> inputsf_i32 = { - {0.00001f, 1024, 32, true, true, false, 1234ULL}, - {0.00001f, 1024, 64, true, true, false, 1234ULL}, - {0.00001f, 1024, 32, true, false, false, 1234ULL}, - {0.00001f, 1024, 64, true, false, false, 1234ULL}, - {0.00001f, 1024, 32, false, true, false, 1234ULL}, - {0.00001f, 1024, 64, false, true, false, 1234ULL}, - {0.00001f, 1024, 32, false, false, false, 1234ULL}, - {0.00001f, 1024, 64, false, false, false, 1234ULL}, - - {0.00001f, 1024, 32, true, true, true, 1234ULL}, - {0.00001f, 1024, 64, true, true, true, 1234ULL}, - {0.00001f, 1024, 32, true, false, true, 1234ULL}, - {0.00001f, 1024, 64, true, false, true, 1234ULL}, - {0.00001f, 1024, 32, false, true, true, 1234ULL}, - {0.00001f, 1024, 64, false, true, true, 1234ULL}, - {0.00001f, 1024, 32, false, false, true, 1234ULL}, - {0.00001f, 1024, 64, false, false, true, 1234ULL}}; -typedef MatVecOpTest MatVecOpTestF_i32; -TEST_P(MatVecOpTestF_i32, Result) -{ - ASSERT_TRUE(devArrMatch( - out_ref.data(), out.data(), params.rows * params.cols, CompareApprox(params.tolerance))); -} -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i32, ::testing::ValuesIn(inputsf_i32)); +#define MVTEST(TestClass, inputs) \ + TEST_P(TestClass, Result) \ + { \ + ASSERT_TRUE(devArrMatch(out_ref.data(), \ + out.data(), \ + params.rows* params.cols, \ + CompareApprox(params.tolerance))); \ + } \ + INSTANTIATE_TEST_SUITE_P(MatVecOpTests, TestClass, ::testing::ValuesIn(inputs)) +const std::vector> inputsf_i32 = { + {0.00001f, 1024, 32, true, true, 1234ULL}, + {0.00001f, 1024, 64, true, true, 1234ULL}, + {0.00001f, 1024, 32, true, false, 1234ULL}, + {0.00001f, 1024, 64, true, false, 1234ULL}, + {0.00001f, 1024, 32, false, true, 1234ULL}, + {0.00001f, 1024, 64, false, true, 1234ULL}, + {0.00001f, 1024, 32, false, false, 1234ULL}, + {0.00001f, 1024, 64, false, false, 1234ULL}}; const std::vector> inputsf_i64 = { - {0.00001f, 2500, 250, false, false, false, 1234ULL}, - {0.00001f, 2500, 250, false, false, true, 1234ULL}}; 
-typedef MatVecOpTest MatVecOpTestF_i64; -TEST_P(MatVecOpTestF_i64, Result) -{ - ASSERT_TRUE(devArrMatch( - out_ref.data(), out.data(), params.rows * params.cols, CompareApprox(params.tolerance))); -} -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestF_i64, ::testing::ValuesIn(inputsf_i64)); - + {0.00001f, 2500, 250, false, false, 1234ULL}, {0.00001f, 2500, 250, false, false, 1234ULL}}; const std::vector> inputsd_i32 = { - {0.0000001, 1024, 32, true, true, false, 1234ULL}, - {0.0000001, 1024, 64, true, true, false, 1234ULL}, - {0.0000001, 1024, 32, true, false, false, 1234ULL}, - {0.0000001, 1024, 64, true, false, false, 1234ULL}, - {0.0000001, 1024, 32, false, true, false, 1234ULL}, - {0.0000001, 1024, 64, false, true, false, 1234ULL}, - {0.0000001, 1024, 32, false, false, false, 1234ULL}, - {0.0000001, 1024, 64, false, false, false, 1234ULL}, - - {0.0000001, 1024, 32, true, true, true, 1234ULL}, - {0.0000001, 1024, 64, true, true, true, 1234ULL}, - {0.0000001, 1024, 32, true, false, true, 1234ULL}, - {0.0000001, 1024, 64, true, false, true, 1234ULL}, - {0.0000001, 1024, 32, false, true, true, 1234ULL}, - {0.0000001, 1024, 64, false, true, true, 1234ULL}, - {0.0000001, 1024, 32, false, false, true, 1234ULL}, - {0.0000001, 1024, 64, false, false, true, 1234ULL}}; -typedef MatVecOpTest MatVecOpTestD_i32; -TEST_P(MatVecOpTestD_i32, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref.data(), - out.data(), - params.rows * params.cols, - CompareApprox(params.tolerance))); -} -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i32, ::testing::ValuesIn(inputsd_i32)); - + {0.0000001, 1024, 32, true, true, 1234ULL}, + {0.0000001, 1024, 64, true, true, 1234ULL}, + {0.0000001, 1024, 32, true, false, 1234ULL}, + {0.0000001, 1024, 64, true, false, 1234ULL}, + {0.0000001, 1024, 32, false, true, 1234ULL}, + {0.0000001, 1024, 64, false, true, 1234ULL}, + {0.0000001, 1024, 32, false, false, 1234ULL}, + {0.0000001, 1024, 64, false, false, 1234ULL}}; const std::vector> inputsd_i64 = { - {0.0000001, 2500, 250, false, false, false, 1234ULL}, - {0.0000001, 2500, 250, false, false, true, 1234ULL}}; -typedef MatVecOpTest MatVecOpTestD_i64; -TEST_P(MatVecOpTestD_i64, Result) -{ - ASSERT_TRUE(devArrMatch(out_ref.data(), - out.data(), - params.rows * params.cols, - CompareApprox(params.tolerance))); -} -INSTANTIATE_TEST_SUITE_P(MatVecOpTests, MatVecOpTestD_i64, ::testing::ValuesIn(inputsd_i64)); + {0.0000001, 2500, 250, false, false, 1234ULL}, {0.0000001, 2500, 250, false, false, 1234ULL}}; + +template +struct Add1Vec { + static constexpr bool useTwoVectors = false; + HDI float operator()(T a, T b) const { return a + b; }; +}; +template +struct Add2Vec { + static constexpr bool useTwoVectors = true; + HDI float operator()(T a, T b, T c) const { return a + b + c; }; +}; + +typedef MatVecOpTest, float, int> MatVecOpTestF_i32_add1vec; +typedef MatVecOpTest, float, int> MatVecOpTestF_i32_add2vec; +typedef MatVecOpTest, float, size_t> MatVecOpTestF_i64_add1vec; +typedef MatVecOpTest, float, size_t> MatVecOpTestF_i64_add2vec; +typedef MatVecOpTest, double, int> MatVecOpTestD_i32_add1vec; +typedef MatVecOpTest, double, int> MatVecOpTestD_i32_add2vec; +typedef MatVecOpTest, double, size_t> MatVecOpTestD_i64_add1vec; +typedef MatVecOpTest, double, size_t> MatVecOpTestD_i64_add2vec; + +MVTEST(MatVecOpTestF_i32_add1vec, inputsf_i32); +MVTEST(MatVecOpTestF_i32_add2vec, inputsf_i32); +MVTEST(MatVecOpTestF_i64_add1vec, inputsf_i64); +MVTEST(MatVecOpTestF_i64_add2vec, inputsf_i64); +MVTEST(MatVecOpTestD_i32_add1vec, inputsd_i32); 
+MVTEST(MatVecOpTestD_i32_add2vec, inputsd_i32); +MVTEST(MatVecOpTestD_i64_add1vec, inputsd_i64); +MVTEST(MatVecOpTestD_i64_add2vec, inputsd_i64); } // end namespace linalg } // end namespace raft diff --git a/cpp/test/matrix/linewise_op.cu b/cpp/test/matrix/linewise_op.cu index 9d3d5af51e..8043dd05e7 100644 --- a/cpp/test/matrix/linewise_op.cu +++ b/cpp/test/matrix/linewise_op.cu @@ -216,7 +216,6 @@ struct LinewiseTest : public ::testing::TestWithParam> dims; for (auto m : sizes) { for (auto n : sizes) { - dims.push_back(std::make_tuple(n, m)); dims.push_back(std::make_tuple(m, n)); } } From 826446265c5617cbe6ad091e0554ad0e74f8c932 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Tue, 18 Oct 2022 15:41:48 +0200 Subject: [PATCH 08/25] Fix shared mem buffering offset in linewise kernels --- .../raft/matrix/detail/linewise_op.cuh | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh index 9b9a7d2967..707764fbe8 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -161,6 +161,7 @@ struct Linewise { static __device__ __forceinline__ raft::TxN_t loadVec( VecT* shm, const VecT* p, const IdxType blockOffset, const IdxType rowLen) noexcept { + // todo(lsugy): remove this check static_assert( sizeof(Type) == sizeof(VecT), "linewiseOp currently requires that the size of the matrix and vector types be the same"); @@ -324,6 +325,7 @@ __global__ void __launch_bounds__(BlockSize) Lambda op, const Vecs*... vecs) { + // todo(lsugy): use appropriate Linewise type per vec typedef Linewise L; constexpr uint workSize = L::VecElems * BlockSize; constexpr size_t maxVecItemSize = maxSizeOf(); @@ -332,12 +334,12 @@ __global__ void __launch_bounds__(BlockSize) maxVecItemSize * L::VecElems) char shm[workSize * maxVecItemSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)]; const IdxType blockOffset = (arrOffset + BlockSize * L::VecElems * blockIdx.x) % rowLen; - return L::vectorRows( - reinterpret_cast(out), - reinterpret_cast(in), - L::AlignElems::div(len), - op, - (workOffset ^= workSize, L::loadVec((Vecs*)(shm + workOffset), vecs, blockOffset, rowLen))...); + return L::vectorRows(reinterpret_cast(out), + reinterpret_cast(in), + L::AlignElems::div(len), + op, + (workOffset ^= workSize * maxVecItemSize, + L::loadVec((Vecs*)(shm + workOffset), vecs, blockOffset, rowLen))...); } /** @@ -371,17 +373,18 @@ __global__ void __launch_bounds__(MaxOffset, 2) // Note, L::VecElems == 1 constexpr uint workSize = MaxOffset; constexpr size_t maxVecItemSize = maxSizeOf(); - uint workOffset = workSize * maxVecItemSize; + uint workOffset = workSize * maxVecItemSize; __shared__ char shm[workSize * maxVecItemSize * ((sizeof...(Vecs)) > 1 ? 
2 : 1)]; + // todo(lsugy): use appropriate Linewise type per vec typedef Linewise L; if (blockIdx.x == 0) { // first block: offset = 0, length = arrOffset - L::vectorRows( - reinterpret_cast(out), - reinterpret_cast(in), - arrOffset, - op, - (workOffset ^= workSize, L::loadVec((Vecs*)(shm + workOffset), vecs, 0, rowLen))...); + L::vectorRows(reinterpret_cast(out), + reinterpret_cast(in), + arrOffset, + op, + (workOffset ^= workSize * maxVecItemSize, + L::loadVec((Vecs*)(shm + workOffset), vecs, 0, rowLen))...); } else { // second block: offset = arrTail, length = len - arrTail // NB: I substract MaxOffset (= blockDim.x) to get the correct indexing for block 1 @@ -389,7 +392,7 @@ __global__ void __launch_bounds__(MaxOffset, 2) reinterpret_cast(in + arrTail - MaxOffset), len - arrTail + MaxOffset, op, - (workOffset ^= workSize, + (workOffset ^= workSize * maxVecItemSize, L::loadVec((Vecs*)(shm + workOffset), vecs, arrTail % rowLen, rowLen))...); } } From e4a8b661adf110534f5e7b951b320b5fa6502a4c Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Tue, 18 Oct 2022 16:51:57 +0200 Subject: [PATCH 09/25] Pass custom op to naiveMat --- cpp/test/linalg/matrix_vector_op.cu | 6 +-- cpp/test/linalg/matrix_vector_op.cuh | 70 +++++++++++++++++++++++----- 2 files changed, 62 insertions(+), 14 deletions(-) diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index 9fb4647eb9..ae132dea8c 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -65,7 +65,7 @@ void matrixVectorOpLaunch(const raft::handle_t& handle, auto apply = bcastAlongRows ? Apply::ALONG_ROWS : Apply::ALONG_COLUMNS; auto len = bcastAlongRows ? D : N; auto vec1_view = raft::make_device_vector_view(vec1, len); - + if constexpr (OpT::useTwoVectors) { auto vec2_view = raft::make_device_vector_view(vec2, len); if (rowMajor) { @@ -120,7 +120,7 @@ class MatVecOpTest : public ::testing::TestWithParam(handle, diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/test/linalg/matrix_vector_op.cuh index f46d70eaa3..38c0eda984 100644 --- a/cpp/test/linalg/matrix_vector_op.cuh +++ b/cpp/test/linalg/matrix_vector_op.cuh @@ -21,7 +21,7 @@ namespace raft { namespace linalg { -template +template __global__ void naiveMatVecKernel(Type* out, const Type* mat, const Type* vec, @@ -29,7 +29,7 @@ __global__ void naiveMatVecKernel(Type* out, IdxType N, bool rowMajor, bool bcastAlongRows, - Type scalar) + Lambda op) { IdxType idx = threadIdx.x + blockIdx.x * blockDim.x; IdxType len = N * D; @@ -43,10 +43,10 @@ __global__ void naiveMatVecKernel(Type* out, } else { col = idx / N; } - if (idx < len) { out[idx] = mat[idx] + scalar * vec[col]; } + if (idx < len) { out[idx] = op(mat[idx], vec[col]); } } -template +template void naiveMatVec(Type* out, const Type* mat, const Type* vec, @@ -54,18 +54,41 @@ void naiveMatVec(Type* out, IdxType N, bool rowMajor, bool bcastAlongRows, - Type scalar, + Lambda op, cudaStream_t stream) { static const IdxType TPB = 64; IdxType len = N * D; IdxType nblks = raft::ceildiv(len, TPB); naiveMatVecKernel - <<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, scalar); + <<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, op); RAFT_CUDA_TRY(cudaPeekAtLastError()); } template +void naiveMatVec(Type* out, + const Type* mat, + const Type* vec1, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Type scalar, + cudaStream_t stream) +{ + naiveMatVec( + out, + mat, + vec1, + D, + N, + rowMajor, + bcastAlongRows, + [scalar] __device__(Type a, Type b) { return a + 
scalar * b; }, + stream); +} + +template __global__ void naiveMatVecKernel(Type* out, const Type* mat, const Type* vec1, @@ -74,7 +97,7 @@ __global__ void naiveMatVecKernel(Type* out, IdxType N, bool rowMajor, bool bcastAlongRows, - Type scalar) + Lambda op) { IdxType idx = threadIdx.x + blockIdx.x * blockDim.x; IdxType len = N * D; @@ -88,10 +111,10 @@ __global__ void naiveMatVecKernel(Type* out, } else { col = idx / N; } - if (idx < len) { out[idx] = mat[idx] + scalar * vec1[col] + vec2[col]; } + if (idx < len) { out[idx] = op(mat[idx], vec1[col], vec2[col]); } } -template +template void naiveMatVec(Type* out, const Type* mat, const Type* vec1, @@ -100,16 +123,41 @@ void naiveMatVec(Type* out, IdxType N, bool rowMajor, bool bcastAlongRows, - Type scalar, + Lambda op, cudaStream_t stream) { static const IdxType TPB = 64; IdxType len = N * D; IdxType nblks = raft::ceildiv(len, TPB); naiveMatVecKernel - <<>>(out, mat, vec1, vec2, D, N, rowMajor, bcastAlongRows, scalar); + <<>>(out, mat, vec1, vec2, D, N, rowMajor, bcastAlongRows, op); RAFT_CUDA_TRY(cudaPeekAtLastError()); } +template +void naiveMatVec(Type* out, + const Type* mat, + const Type* vec1, + const Type* vec2, + IdxType D, + IdxType N, + bool rowMajor, + bool bcastAlongRows, + Type scalar, + cudaStream_t stream) +{ + naiveMatVec( + out, + mat, + vec1, + vec2, + D, + N, + rowMajor, + bcastAlongRows, + [scalar] __device__(Type a, Type b, Type c) { return a + scalar * b + c; }, + stream); +} + } // end namespace linalg } // end namespace raft From 938b1801215446e230cf0c4662b72c741855e02d Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Tue, 18 Oct 2022 17:04:19 +0200 Subject: [PATCH 10/25] Support different output/matrix/vector(s) types in naiveMatVec --- cpp/test/linalg/matrix_vector_op.cuh | 81 +++++++++++++++------------- 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/cpp/test/linalg/matrix_vector_op.cuh b/cpp/test/linalg/matrix_vector_op.cuh index 38c0eda984..602d05d153 100644 --- a/cpp/test/linalg/matrix_vector_op.cuh +++ b/cpp/test/linalg/matrix_vector_op.cuh @@ -21,10 +21,10 @@ namespace raft { namespace linalg { -template -__global__ void naiveMatVecKernel(Type* out, - const Type* mat, - const Type* vec, +template +__global__ void naiveMatVecKernel(OutT* out, + const MatT* mat, + const VecT* vec, IdxType D, IdxType N, bool rowMajor, @@ -46,10 +46,10 @@ __global__ void naiveMatVecKernel(Type* out, if (idx < len) { out[idx] = op(mat[idx], vec[col]); } } -template -void naiveMatVec(Type* out, - const Type* mat, - const Type* vec, +template +void naiveMatVec(OutT* out, + const MatT* mat, + const VecT* vec, IdxType D, IdxType N, bool rowMajor, @@ -60,39 +60,43 @@ void naiveMatVec(Type* out, static const IdxType TPB = 64; IdxType len = N * D; IdxType nblks = raft::ceildiv(len, TPB); - naiveMatVecKernel - <<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, op); + naiveMatVecKernel<<>>(out, mat, vec, D, N, rowMajor, bcastAlongRows, op); RAFT_CUDA_TRY(cudaPeekAtLastError()); } -template -void naiveMatVec(Type* out, - const Type* mat, - const Type* vec1, +template +void naiveMatVec(OutT* out, + const MatT* mat, + const VecT* vec, IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows, - Type scalar, + OutT scalar, cudaStream_t stream) { naiveMatVec( out, mat, - vec1, + vec, D, N, rowMajor, bcastAlongRows, - [scalar] __device__(Type a, Type b) { return a + scalar * b; }, + [scalar] __device__(MatT a, VecT b) { return (OutT)(a + scalar * b); }, stream); } -template -__global__ void naiveMatVecKernel(Type* out, - const 
Type* mat, - const Type* vec1, - const Type* vec2, +template +__global__ void naiveMatVecKernel(OutT* out, + const MatT* mat, + const Vec1T* vec1, + const Vec2T* vec2, IdxType D, IdxType N, bool rowMajor, @@ -114,11 +118,16 @@ __global__ void naiveMatVecKernel(Type* out, if (idx < len) { out[idx] = op(mat[idx], vec1[col], vec2[col]); } } -template -void naiveMatVec(Type* out, - const Type* mat, - const Type* vec1, - const Type* vec2, +template +void naiveMatVec(OutT* out, + const MatT* mat, + const Vec1T* vec1, + const Vec2T* vec2, IdxType D, IdxType N, bool rowMajor, @@ -129,21 +138,21 @@ void naiveMatVec(Type* out, static const IdxType TPB = 64; IdxType len = N * D; IdxType nblks = raft::ceildiv(len, TPB); - naiveMatVecKernel - <<>>(out, mat, vec1, vec2, D, N, rowMajor, bcastAlongRows, op); + naiveMatVecKernel<<>>( + out, mat, vec1, vec2, D, N, rowMajor, bcastAlongRows, op); RAFT_CUDA_TRY(cudaPeekAtLastError()); } -template -void naiveMatVec(Type* out, - const Type* mat, - const Type* vec1, - const Type* vec2, +template +void naiveMatVec(OutT* out, + const MatT* mat, + const Vec1T* vec1, + const Vec2T* vec2, IdxType D, IdxType N, bool rowMajor, bool bcastAlongRows, - Type scalar, + OutT scalar, cudaStream_t stream) { naiveMatVec( @@ -155,7 +164,7 @@ void naiveMatVec(Type* out, N, rowMajor, bcastAlongRows, - [scalar] __device__(Type a, Type b, Type c) { return a + scalar * b + c; }, + [scalar] __device__(MatT a, Vec1T b, Vec2T c) { return (OutT)(a + scalar * b + c); }, stream); } From c85ee9bc3b29a5c0212299d57d2d458b3d7cc3a2 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Tue, 18 Oct 2022 18:15:23 +0200 Subject: [PATCH 11/25] Test matrix-vector op with different matrix / vector types --- cpp/include/raft/linalg/matrix_vector_op.cuh | 2 + .../raft/matrix/detail/linewise_op.cuh | 11 +- cpp/test/linalg/matrix_vector_op.cu | 136 +++++++++++------- 3 files changed, 92 insertions(+), 57 deletions(-) diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 41e00b568b..464591f593 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -68,6 +68,7 @@ void matrixVectorOp(OutT* out, Lambda op, cudaStream_t stream) { + // todo(lsugy): at the moment only OutT == MatT supported! detail::matrixVectorOp(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); } @@ -117,6 +118,7 @@ void matrixVectorOp(OutT* out, Lambda op, cudaStream_t stream) { + // todo(lsugy): at the moment only OutT == MatT supported! detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); } diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh index 707764fbe8..f807b64f99 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -325,7 +325,6 @@ __global__ void __launch_bounds__(BlockSize) Lambda op, const Vecs*... 
vecs) { - // todo(lsugy): use appropriate Linewise type per vec typedef Linewise L; constexpr uint workSize = L::VecElems * BlockSize; constexpr size_t maxVecItemSize = maxSizeOf(); @@ -339,7 +338,8 @@ __global__ void __launch_bounds__(BlockSize) L::AlignElems::div(len), op, (workOffset ^= workSize * maxVecItemSize, - L::loadVec((Vecs*)(shm + workOffset), vecs, blockOffset, rowLen))...); + Linewise::loadVec( + (Vecs*)(shm + workOffset), vecs, blockOffset, rowLen))...); } /** @@ -375,7 +375,6 @@ __global__ void __launch_bounds__(MaxOffset, 2) constexpr size_t maxVecItemSize = maxSizeOf(); uint workOffset = workSize * maxVecItemSize; __shared__ char shm[workSize * maxVecItemSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)]; - // todo(lsugy): use appropriate Linewise type per vec typedef Linewise L; if (blockIdx.x == 0) { // first block: offset = 0, length = arrOffset @@ -384,7 +383,7 @@ __global__ void __launch_bounds__(MaxOffset, 2) arrOffset, op, (workOffset ^= workSize * maxVecItemSize, - L::loadVec((Vecs*)(shm + workOffset), vecs, 0, rowLen))...); + Linewise::loadVec((Vecs*)(shm + workOffset), vecs, 0, rowLen))...); } else { // second block: offset = arrTail, length = len - arrTail // NB: I substract MaxOffset (= blockDim.x) to get the correct indexing for block 1 @@ -393,7 +392,7 @@ __global__ void __launch_bounds__(MaxOffset, 2) len - arrTail + MaxOffset, op, (workOffset ^= workSize * maxVecItemSize, - L::loadVec((Vecs*)(shm + workOffset), vecs, arrTail % rowLen, rowLen))...); + Linewise::loadVec((Vecs*)(shm + workOffset), vecs, arrTail % rowLen, rowLen))...); } } @@ -549,7 +548,7 @@ struct MatrixLinewiseOp { cudaStream_t stream, const Vecs*... vecs) { - // todo(lsugy): add test with different vector types + // todo(lsugy): vectors alignment? Where is it checked/enforced? 
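A note on the `workOffset ^= workSize * maxVecItemSize` expressions recurring in these kernels: the comma-operator pack expansion stages each vector argument into one of two halves of a single shared `char` buffer, and XOR-ing the offset against the byte size of one half is what alternates between them. The earlier "Fix shared mem buffering offset" commit scales the toggle by `maxVecItemSize` precisely so that the second half starts past the first now that the buffer is byte-addressed. The toggle in isolation, as a host-side sketch with hypothetical sizes:

#include <cstdio>

int main()
{
  const unsigned workSize       = 256;  // elements staged per vector chunk (assumed)
  const unsigned maxVecItemSize = 8;    // largest sizeof() among vector types (assumed)
  unsigned workOffset = workSize * maxVecItemSize;  // kernels start at the second half
  for (int vecIdx = 0; vecIdx < 4; vecIdx++) {
    // the same update the kernels apply once per vector in the pack expansion:
    workOffset ^= workSize * maxVecItemSize;
    std::printf("vector %d staged at byte offset %u\n", vecIdx, workOffset);
  }
  return 0;  // prints offsets 0, 2048, 0, 2048
}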
if constexpr (VecBytes > sizeof(Type)) { if (!raft::Pow2::areSameAlignOffsets(in, out)) return MatrixLinewiseOp> 1), sizeof(Type)), BlockSize>::run( diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index ae132dea8c..4bf29c016f 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -23,20 +23,29 @@ namespace raft { namespace linalg { -template +template struct MatVecOpInputs { - T tolerance; IdxType rows, cols; bool rowMajor, bcastAlongRows; unsigned long long int seed; }; -template -::std::ostream& operator<<(::std::ostream& os, const MatVecOpInputs& dims) +template +::std::ostream& operator<<(::std::ostream& os, const MatVecOpInputs& dims) { return os; } +template +inline void gen_uniform(const raft::handle_t& handle, raft::random::RngState& rng, T* ptr, LenT len) +{ + if constexpr (std::is_integral_v) { + raft::random::uniformInt(handle, rng, ptr, len, (T)0, (T)100); + } else { + raft::random::uniform(handle, rng, ptr, len, (T)-10.0, (T)10.0); + } +} + // Or else, we get the following compilation error // for an extended __device__ lambda cannot have private or protected access // within its class @@ -88,10 +97,10 @@ template -class MatVecOpTest : public ::testing::TestWithParam> { +class MatVecOpTest : public ::testing::TestWithParam> { public: MatVecOpTest() - : params(::testing::TestWithParam>::GetParam()), + : params(::testing::TestWithParam>::GetParam()), stream(handle.get_stream()), in(params.rows * params.cols, stream), out_ref(params.rows * params.cols, stream), @@ -108,9 +117,9 @@ class MatVecOpTest : public ::testing::TestWithParam(handle, r, in.data(), len); + gen_uniform(handle, r, vec1.data(), vecLen); + gen_uniform(handle, r, vec2.data(), vecLen); if constexpr (OpT::useTwoVectors) { naiveMatVec(out_ref.data(), in.data(), @@ -149,56 +158,49 @@ class MatVecOpTest : public ::testing::TestWithParam params; + MatVecOpInputs params; rmm::device_uvector in; rmm::device_uvector out; - rmm::device_uvector out_ref; + rmm::device_uvector out_ref; rmm::device_uvector vec1; rmm::device_uvector vec2; }; -#define MVTEST(TestClass, inputs) \ - TEST_P(TestClass, Result) \ - { \ - ASSERT_TRUE(devArrMatch(out_ref.data(), \ - out.data(), \ - params.rows* params.cols, \ - CompareApprox(params.tolerance))); \ - } \ +#define MVTEST(TestClass, inputs, tolerance) \ + TEST_P(TestClass, Result) \ + { \ + ASSERT_TRUE(devArrMatch( \ + out_ref.data(), out.data(), params.rows* params.cols, CompareApprox(tolerance))); \ + } \ INSTANTIATE_TEST_SUITE_P(MatVecOpTests, TestClass, ::testing::ValuesIn(inputs)) -const std::vector> inputsf_i32 = { - {0.00001f, 1024, 32, true, true, 1234ULL}, - {0.00001f, 1024, 64, true, true, 1234ULL}, - {0.00001f, 1024, 32, true, false, 1234ULL}, - {0.00001f, 1024, 64, true, false, 1234ULL}, - {0.00001f, 1024, 32, false, true, 1234ULL}, - {0.00001f, 1024, 64, false, true, 1234ULL}, - {0.00001f, 1024, 32, false, false, 1234ULL}, - {0.00001f, 1024, 64, false, false, 1234ULL}}; -const std::vector> inputsf_i64 = { - {0.00001f, 2500, 250, false, false, 1234ULL}, {0.00001f, 2500, 250, false, false, 1234ULL}}; -const std::vector> inputsd_i32 = { - {0.0000001, 1024, 32, true, true, 1234ULL}, - {0.0000001, 1024, 64, true, true, 1234ULL}, - {0.0000001, 1024, 32, true, false, 1234ULL}, - {0.0000001, 1024, 64, true, false, 1234ULL}, - {0.0000001, 1024, 32, false, true, 1234ULL}, - {0.0000001, 1024, 64, false, true, 1234ULL}, - {0.0000001, 1024, 32, false, false, 1234ULL}, - {0.0000001, 1024, 64, false, false, 
1234ULL}}; -const std::vector> inputsd_i64 = { - {0.0000001, 2500, 250, false, false, 1234ULL}, {0.0000001, 2500, 250, false, false, 1234ULL}}; +#define MV_EPS_F 0.00001f +#define MV_EPS_D 0.0000001 + +/* + * This set of tests covers cases where all the types are the same. + */ + +const std::vector> inputs_i32 = {{1024, 32, true, true, 1234ULL}, + {1024, 64, true, true, 1234ULL}, + {1024, 32, true, false, 1234ULL}, + {1024, 64, true, false, 1234ULL}, + {1024, 32, false, true, 1234ULL}, + {1024, 64, false, true, 1234ULL}, + {1024, 32, false, false, 1234ULL}, + {1024, 64, false, false, 1234ULL}}; +const std::vector> inputs_i64 = {{2500, 250, false, false, 1234ULL}, + {2500, 250, false, false, 1234ULL}}; template struct Add1Vec { static constexpr bool useTwoVectors = false; - HDI float operator()(T a, T b) const { return a + b; }; + HDI T operator()(T a, T b) const { return a + b; }; }; template struct Add2Vec { static constexpr bool useTwoVectors = true; - HDI float operator()(T a, T b, T c) const { return a + b + c; }; + HDI T operator()(T a, T b, T c) const { return a + b + c; }; }; typedef MatVecOpTest, float, int> MatVecOpTestF_i32_add1vec; @@ -210,14 +212,46 @@ typedef MatVecOpTest, double, int> MatVecOpTestD_i32_add2vec; typedef MatVecOpTest, double, size_t> MatVecOpTestD_i64_add1vec; typedef MatVecOpTest, double, size_t> MatVecOpTestD_i64_add2vec; -MVTEST(MatVecOpTestF_i32_add1vec, inputsf_i32); -MVTEST(MatVecOpTestF_i32_add2vec, inputsf_i32); -MVTEST(MatVecOpTestF_i64_add1vec, inputsf_i64); -MVTEST(MatVecOpTestF_i64_add2vec, inputsf_i64); -MVTEST(MatVecOpTestD_i32_add1vec, inputsd_i32); -MVTEST(MatVecOpTestD_i32_add2vec, inputsd_i32); -MVTEST(MatVecOpTestD_i64_add1vec, inputsd_i64); -MVTEST(MatVecOpTestD_i64_add2vec, inputsd_i64); +MVTEST(MatVecOpTestF_i32_add1vec, inputs_i32, MV_EPS_F); +MVTEST(MatVecOpTestF_i32_add2vec, inputs_i32, MV_EPS_F); +MVTEST(MatVecOpTestF_i64_add1vec, inputs_i64, MV_EPS_F); +MVTEST(MatVecOpTestF_i64_add2vec, inputs_i64, MV_EPS_F); +MVTEST(MatVecOpTestD_i32_add1vec, inputs_i32, MV_EPS_D); +MVTEST(MatVecOpTestD_i32_add2vec, inputs_i32, MV_EPS_D); +MVTEST(MatVecOpTestD_i64_add1vec, inputs_i64, MV_EPS_D); +MVTEST(MatVecOpTestD_i64_add2vec, inputs_i64, MV_EPS_D); + +/* + * This set of tests covers cases with different types. 
+ */ + +template +struct DivAndAdd { + static constexpr bool useTwoVectors = true; + HDI OutT operator()(MatT a, Vec1T b, Vec2T c) const { return a / b + c; }; +}; + +typedef MatVecOpTest, float, int, float, int32_t, float> + MatVecOpTestF_i32_DivAndAdd_f_i32_f; +// typedef MatVecOpTest, float, int, double, int32_t, +// float> +// MatVecOpTestF_i32_DivAndAdd_d_i32_f; +// typedef MatVecOpTest, double, int, float, int32_t, +// float> +// MatVecOpTestD_i32_DivAndAdd_f_i32_f; +typedef MatVecOpTest, float, int, float, int32_t, double> + MatVecOpTestF_i32_DivAndAdd_f_i32_d; +typedef MatVecOpTest, float, int, float, int64_t, float> + MatVecOpTestF_i32_DivAndAdd_f_i64_f; +typedef MatVecOpTest, double, int, double, int32_t, float> + MatVecOpTestD_i32_DivAndAdd_d_i32_f; + +MVTEST(MatVecOpTestF_i32_DivAndAdd_f_i32_f, inputs_i32, MV_EPS_F); +// MVTEST(MatVecOpTestF_i32_DivAndAdd_d_i32_f, inputs_i32, MV_EPS_F); +// MVTEST(MatVecOpTestD_i32_DivAndAdd_f_i32_f, inputs_i32, (double)MV_EPS_F); +MVTEST(MatVecOpTestF_i32_DivAndAdd_f_i32_d, inputs_i32, MV_EPS_F); +MVTEST(MatVecOpTestF_i32_DivAndAdd_f_i64_f, inputs_i32, MV_EPS_F); +MVTEST(MatVecOpTestD_i32_DivAndAdd_d_i32_f, inputs_i32, (double)MV_EPS_F); } // end namespace linalg } // end namespace raft From be142c6c453259f7aef682f381d4557a20fab5c7 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Wed, 19 Oct 2022 17:10:59 +0200 Subject: [PATCH 12/25] Fix linewiseOp VecRows kernels --- .../raft/matrix/detail/linewise_op.cuh | 35 +++++++++-------- cpp/test/linalg/matrix_vector_op.cu | 38 ++++++++----------- 2 files changed, 34 insertions(+), 39 deletions(-) diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh index f807b64f99..5a7e106951 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -26,6 +26,11 @@ namespace raft { namespace matrix { namespace detail { +template +struct VecArg { + Type val[VecElems]; +}; + template struct Linewise { static constexpr IdxType VecElems = VecBytes / sizeof(Type); @@ -79,6 +84,7 @@ struct Linewise { const Vecs*... vecs) noexcept { constexpr IdxType warpPad = (AlignWarp::Value - 1) * VecElems; + // todo(lsugy): don't use Type! Type args[sizeof...(Vecs)]; Vec v, w; bool update = true; @@ -141,7 +147,7 @@ struct Linewise { *v.vectorized_data() = __ldcv(in + i); #pragma unroll VecElems for (int k = 0; k < VecElems; k++) - v.val.data[k] = op(v.val.data[k], args.val.data[k]...); + v.val.data[k] = op(v.val.data[k], args.val[k]...); __stwt(out + i, *v.vectorized_data()); } } @@ -158,14 +164,11 @@ struct Linewise { * @return a contiguous chunk of a vector, suitable for `vectorRows`. 
*/ template - static __device__ __forceinline__ raft::TxN_t loadVec( - VecT* shm, const VecT* p, const IdxType blockOffset, const IdxType rowLen) noexcept + static __device__ __forceinline__ VecArg loadVec(VecT* shm, + const VecT* p, + const IdxType blockOffset, + const IdxType rowLen) noexcept { - // todo(lsugy): remove this check - static_assert( - sizeof(Type) == sizeof(VecT), - "linewiseOp currently requires that the size of the matrix and vector types be the same"); - IdxType j = blockOffset + threadIdx.x; #pragma unroll VecElems for (int k = threadIdx.x; k < VecElems * BlockSize; k += BlockSize, j += BlockSize) { @@ -175,9 +178,10 @@ struct Linewise { } __syncthreads(); { - raft::TxN_t out; - *out.vectorized_data() = - reinterpret_cast::io_t*>(shm)[threadIdx.x]; + VecArg out; +#pragma unroll VecElems + for (int i = 0; i < VecElems; i++) + out.val[i] = shm[threadIdx.x * VecElems + i]; return out; } } @@ -338,8 +342,7 @@ __global__ void __launch_bounds__(BlockSize) L::AlignElems::div(len), op, (workOffset ^= workSize * maxVecItemSize, - Linewise::loadVec( - (Vecs*)(shm + workOffset), vecs, blockOffset, rowLen))...); + L::loadVec((Vecs*)(shm + workOffset), vecs, blockOffset, rowLen))...); } /** @@ -383,7 +386,7 @@ __global__ void __launch_bounds__(MaxOffset, 2) arrOffset, op, (workOffset ^= workSize * maxVecItemSize, - Linewise::loadVec((Vecs*)(shm + workOffset), vecs, 0, rowLen))...); + L::loadVec((Vecs*)(shm + workOffset), vecs, 0, rowLen))...); } else { // second block: offset = arrTail, length = len - arrTail // NB: I substract MaxOffset (= blockDim.x) to get the correct indexing for block 1 @@ -392,7 +395,7 @@ __global__ void __launch_bounds__(MaxOffset, 2) len - arrTail + MaxOffset, op, (workOffset ^= workSize * maxVecItemSize, - Linewise::loadVec((Vecs*)(shm + workOffset), vecs, arrTail % rowLen, rowLen))...); + L::loadVec((Vecs*)(shm + workOffset), vecs, arrTail % rowLen, rowLen))...); } } @@ -503,6 +506,7 @@ void matrixLinewiseVecRows(Type* out, const uint expected_grid_size = rowLen / raft::gcd(block_work_size, uint(rowLen)); // Minimum size of the grid to make the device well occupied const uint occupy = getOptimalGridSize(); + // todo(lsugy): alignedLen instead of totalLen? const dim3 gs(std::min( // does not make sense to have more blocks than this raft::ceildiv(uint(totalLen), block_work_size), @@ -548,7 +552,6 @@ struct MatrixLinewiseOp { cudaStream_t stream, const Vecs*... vecs) { - // todo(lsugy): vectors alignment? Where is it checked/enforced? 
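The staging pattern that `loadVec` implements is worth spelling out, since the hunks above only show it as a diff. A simplified device-side sketch under assumed template parameters (`BlockSize` threads per block, `VecElems` values per thread; not a drop-in replacement for the real code, which returns the `VecArg` by value):

// A block cooperatively copies a wrapped, thread-striped chunk of the vector
// into shared memory, syncs, then each thread reads back its own contiguous
// VecElems-wide slice.
template <typename VecT, int BlockSize, int VecElems>
__device__ void stage_vector_chunk(
  VecT* shm, const VecT* vec, int blockOffset, int rowLen, VecT (&out)[VecElems])
{
  int j = blockOffset + threadIdx.x;
  for (int k = threadIdx.x; k < VecElems * BlockSize; k += BlockSize, j += BlockSize) {
    while (j >= rowLen) j -= rowLen;  // wrap at the row boundary
    shm[k] = vec[j];
  }
  __syncthreads();
  for (int i = 0; i < VecElems; i++) {
    out[i] = shm[threadIdx.x * VecElems + i];  // contiguous per thread after staging
  }
}

Returning a plain `VecArg` array instead of the old `raft::TxN_t` is what removes the `sizeof(Type) == sizeof(VecT)` restriction: the staged vector values no longer have to be reinterpreted through the matrix type's vectorized-load machinery.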
if constexpr (VecBytes > sizeof(Type)) { if (!raft::Pow2::areSameAlignOffsets(in, out)) return MatrixLinewiseOp> 1), sizeof(Type)), BlockSize>::run( diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index 4bf29c016f..ccce54390b 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -226,32 +226,24 @@ MVTEST(MatVecOpTestD_i64_add2vec, inputs_i64, MV_EPS_D); */ template -struct DivAndAdd { +struct MulAndAdd { static constexpr bool useTwoVectors = true; - HDI OutT operator()(MatT a, Vec1T b, Vec2T c) const { return a / b + c; }; + HDI OutT operator()(MatT a, Vec1T b, Vec2T c) const { return a * b + c; }; }; -typedef MatVecOpTest, float, int, float, int32_t, float> - MatVecOpTestF_i32_DivAndAdd_f_i32_f; -// typedef MatVecOpTest, float, int, double, int32_t, -// float> -// MatVecOpTestF_i32_DivAndAdd_d_i32_f; -// typedef MatVecOpTest, double, int, float, int32_t, -// float> -// MatVecOpTestD_i32_DivAndAdd_f_i32_f; -typedef MatVecOpTest, float, int, float, int32_t, double> - MatVecOpTestF_i32_DivAndAdd_f_i32_d; -typedef MatVecOpTest, float, int, float, int64_t, float> - MatVecOpTestF_i32_DivAndAdd_f_i64_f; -typedef MatVecOpTest, double, int, double, int32_t, float> - MatVecOpTestD_i32_DivAndAdd_d_i32_f; - -MVTEST(MatVecOpTestF_i32_DivAndAdd_f_i32_f, inputs_i32, MV_EPS_F); -// MVTEST(MatVecOpTestF_i32_DivAndAdd_d_i32_f, inputs_i32, MV_EPS_F); -// MVTEST(MatVecOpTestD_i32_DivAndAdd_f_i32_f, inputs_i32, (double)MV_EPS_F); -MVTEST(MatVecOpTestF_i32_DivAndAdd_f_i32_d, inputs_i32, MV_EPS_F); -MVTEST(MatVecOpTestF_i32_DivAndAdd_f_i64_f, inputs_i32, MV_EPS_F); -MVTEST(MatVecOpTestD_i32_DivAndAdd_d_i32_f, inputs_i32, (double)MV_EPS_F); +typedef MatVecOpTest, float, int, float, int32_t, float> + MatVecOpTestF_i32_MulAndAdd_f_i32_f; +typedef MatVecOpTest, float, int, float, int32_t, double> + MatVecOpTestF_i32_MulAndAdd_f_i32_d; +typedef MatVecOpTest, float, int, float, int64_t, float> + MatVecOpTestF_i32_MulAndAdd_f_i64_f; +typedef MatVecOpTest, double, int, double, int32_t, float> + MatVecOpTestD_i32_MulAndAdd_d_i32_f; + +MVTEST(MatVecOpTestF_i32_MulAndAdd_f_i32_f, inputs_i32, MV_EPS_F); +MVTEST(MatVecOpTestF_i32_MulAndAdd_f_i32_d, inputs_i32, MV_EPS_F); +MVTEST(MatVecOpTestF_i32_MulAndAdd_f_i64_f, inputs_i32, MV_EPS_F); +MVTEST(MatVecOpTestD_i32_MulAndAdd_d_i32_f, inputs_i32, (double)MV_EPS_F); } // end namespace linalg } // end namespace raft From a850d8525725df086384b9ab4d4a056236c5fd0d Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Thu, 20 Oct 2022 15:08:16 +0200 Subject: [PATCH 13/25] Fix linewiseOp VecCols kernels --- .../raft/matrix/detail/linewise_op.cuh | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh index 5a7e106951..9b40108289 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -21,16 +21,28 @@ #include #include +#include namespace raft { namespace matrix { namespace detail { +/** This type simplifies returning arrays and passing them as arguments */ template struct VecArg { Type val[VecElems]; }; +/** Executes the operation with the given matrix element and an arbitrary number of vector elements + * contained in the given tuple. The index_sequence is used here for compile-time indexing of the + * tuple in the fold expression. 
*/ +template +__device__ __forceinline__ MatT +RunMatVecOp(Lambda op, MatT mat, Tuple&& args, std::index_sequence) +{ + return op(mat, (thrust::get(args))...); +} + template struct Linewise { static constexpr IdxType VecElems = VecBytes / sizeof(Type); @@ -84,8 +96,10 @@ struct Linewise { const Vecs*... vecs) noexcept { constexpr IdxType warpPad = (AlignWarp::Value - 1) * VecElems; - // todo(lsugy): don't use Type! - Type args[sizeof...(Vecs)]; + constexpr auto index = std::index_sequence_for(); + // todo(lsugy): switch to cuda::std::tuple from libcudacxx if we add it as a required + // dependency. Note that thrust::tuple is limited to 10 elements. + thrust::tuple args; Vec v, w; bool update = true; for (; in < in_end; in += AlignWarp::Value, out += AlignWarp::Value, rowMod += warpPad) { @@ -96,8 +110,7 @@ struct Linewise { update = true; } if (update) { - int l = 0; - ((args[l] = vecs[rowDiv], l++), ...); + args = thrust::make_tuple((vecs[rowDiv])...); update = false; } #pragma unroll VecElems @@ -105,11 +118,9 @@ struct Linewise { if (rowMod == rowLen) { rowMod = 0; rowDiv++; - int l = 0; - ((args[l] = vecs[rowDiv], l++), ...); + args = thrust::make_tuple((vecs[rowDiv])...); } - int l = 0; - w.val.data[k] = op(v.val.data[k], (std::ignore = vecs, args[l++])...); + w.val.data[k] = RunMatVecOp(op, v.val.data[k], args, index); } *out = *w.vectorized_data(); } From e44bcc4348b92d9780bb0abf2499ecd1f787262f Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Thu, 20 Oct 2022 15:28:53 +0200 Subject: [PATCH 14/25] Merge OutT=MatT because linewiseOp only supports one input/output matrix type + cleanup old matrixVectorOp implementation --- .../raft/linalg/detail/matrix_vector_op.cuh | 81 +------------------ cpp/include/raft/linalg/matrix_vector_op.cuh | 33 +++----- cpp/test/linalg/matrix_vector_op.cu | 50 +++++------- 3 files changed, 35 insertions(+), 129 deletions(-) diff --git a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh index d00014e56f..807bacc470 100644 --- a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh @@ -22,84 +22,12 @@ namespace raft { namespace linalg { namespace detail { -namespace { -template -struct AlignedAccess { - template - static inline bool test(const T* matrix, size_t strideBytes) - { - return Pow2::isAligned(matrix) && Pow2::isAligned(strideBytes) && - Pow2::isAligned(VecBytes); - } -}; -}; // namespace - -template -__global__ void matrixVectorOpKernel(Type* out, - const Type* matrix, - const Type* vector, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op) -{ - typedef TxN_t VecType; - IdxType len = N * D; - IdxType idx = threadIdx.x; - idx += (IdxType)blockIdx.x * (IdxType)blockDim.x; - idx *= VecType::Ratio; - if (idx >= len) return; - IdxType vIdx; - VecType mat, vec; - ///@todo: yikes! use fast-int-div here. - ///@todo: shared mem for vector could help with perf - if (rowMajor && bcastAlongRows) { - vIdx = idx % D; - vec.load(vector, vIdx); - } else if (!rowMajor && !bcastAlongRows) { - vIdx = idx % N; - vec.load(vector, vIdx); - } else if (rowMajor && !bcastAlongRows) { - vIdx = idx / D; - vec.fill(vector[vIdx]); - } else { - vIdx = idx / N; - vec.fill(vector[vIdx]); - } - mat.load(matrix, idx); -#pragma unroll - for (int i = 0; i < VecType::Ratio; ++i) - mat.val.data[i] = op(mat.val.data[i], vec.val.data[i]); - mat.store(out, idx); -} - -// todo(lsugy): remove this function? It isn't used. 
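The `RunMatVecOp` helper introduced above is the standard index_sequence trick for calling an n-ary operator with elements pulled out of a tuple. A host-side sketch of the same mechanism, with `std::tuple` standing in for `thrust::tuple` so it runs off-device:

#include <cstdio>
#include <tuple>
#include <utility>

// The pack expansion over Is... unpacks one cached element per vector from the
// tuple into the operator call, all resolved at compile time.
template <typename Lambda, typename MatT, typename Tuple, std::size_t... Is>
MatT run_mat_vec_op(Lambda op, MatT mat, Tuple&& args, std::index_sequence<Is...>)
{
  return op(mat, std::get<Is>(args)...);
}

int main()
{
  // One matrix element plus cached elements of two vectors of distinct types.
  auto op   = [](float m, float v1, int v2) { return m * v1 + v2; };
  auto args = std::make_tuple(2.0f, 3);
  float r   = run_mat_vec_op(op, 10.0f, args, std::index_sequence_for<float, int>{});
  std::printf("%g\n", r);  // 10 * 2 + 3 = 23
  return 0;
}

The commit's own comment explains the tuple choice: `thrust::tuple` is used pending `cuda::std::tuple` from libcudacxx, at the cost of a 10-element cap.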
-template -void matrixVectorOpImpl(Type* out, - const Type* matrix, - const Type* vec, - IdxType D, - IdxType N, - bool rowMajor, - bool bcastAlongRows, - Lambda op, - cudaStream_t stream) -{ - IdxType len = N * D; - IdxType nblks = raft::ceildiv(veclen_ ? len / veclen_ : veclen_, (IdxType)TPB); - matrixVectorOpKernel - <<>>(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} - -template -void matrixVectorOp(OutT* out, +void matrixVectorOp(MatT* out, const MatT* matrix, const VecT* vec, IdxType D, @@ -115,14 +43,13 @@ void matrixVectorOp(OutT* out, out, matrix, stride, nLines, rowMajor == bcastAlongRows, op, stream, vec); } -template -void matrixVectorOp(OutT* out, +void matrixVectorOp(MatT* out, const MatT* matrix, const Vec1T* vec1, const Vec2T* vec2, diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh index 464591f593..d673769dc3 100644 --- a/cpp/include/raft/linalg/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/matrix_vector_op.cuh @@ -35,9 +35,8 @@ namespace linalg { * Note : the function will also check that the size of the window of accesses * is a multiple of the number of elements processed by a thread in order to * enable faster processing - * @tparam OutT the output type + * @tparam MatT the matrix type * @tparam Lambda a device function which represents a binary operator - * @tparam MatT the input matrix type * @tparam VecT the input vector type * @tparam IdxType Integer type used to for addressing * @tparam TPB threads per block of the cuda kernel launched @@ -52,13 +51,8 @@ namespace linalg { * @param op the mathematical operation * @param stream cuda stream where to launch work */ -template -void matrixVectorOp(OutT* out, +template +void matrixVectorOp(MatT* out, const MatT* matrix, const VecT* vec, IdxType D, @@ -68,7 +62,6 @@ void matrixVectorOp(OutT* out, Lambda op, cudaStream_t stream) { - // todo(lsugy): at the moment only OutT == MatT supported! detail::matrixVectorOp(out, matrix, vec, D, N, rowMajor, bcastAlongRows, op, stream); } @@ -80,12 +73,10 @@ void matrixVectorOp(OutT* out, * Note : the function will also check that the size of the window of accesses * is a multiple of the number of elements processed by a thread in order to * enable faster processing - * @tparam OutT the output type + * @tparam MatT the matrix type * @tparam Lambda a device function which represents a binary operator - * @tparam MatT the input matrix type * @tparam Vec1T the first input vector type * @tparam Vec2T the second input vector type - * @tparam Lambda a device function which represents a binary operator * @tparam IdxType Integer type used to for addressing * @tparam TPB threads per block of the cuda kernel launched * @param out the output matrix (passing out = matrix makes it in-place) @@ -100,14 +91,13 @@ void matrixVectorOp(OutT* out, * @param op the mathematical operation * @param stream cuda stream where to launch work */ -template -void matrixVectorOp(OutT* out, +void matrixVectorOp(MatT* out, const MatT* matrix, const Vec1T* vec1, const Vec2T* vec2, @@ -118,7 +108,6 @@ void matrixVectorOp(OutT* out, Lambda op, cudaStream_t stream) { - // todo(lsugy): at the moment only OutT == MatT supported! 
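With the old implementation gone, the consolidated prim's contract is: the output shares the matrix type (`MatT`), while the vector type(s) may differ. A hedged usage sketch assuming the RAFT headers are available and nvcc extended lambdas are enabled (the per-row division mirrors the k-means centers/cluster-sizes call a few commits below; `normalize_rows` is an illustrative name, not part of the patch):

#include <raft/linalg/matrix_vector_op.cuh>

// Divide each row of a row-major [n_rows, n_cols] float matrix by a per-row
// int32_t count; MatT (float) doubles as the output type, VecT is int32_t.
void normalize_rows(
  float* out, const float* mat, const int32_t* counts, int n_rows, int n_cols, cudaStream_t stream)
{
  raft::linalg::matrixVectorOp(
    out,
    mat,
    counts,
    n_cols,  // D
    n_rows,  // N
    true,    // rowMajor
    false,   // bcastAlongRows = false: one vector element per row
    [] __device__(float m, int32_t c) { return c > 0 ? m / c : m; },
    stream);
}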
detail::matrixVectorOp(out, matrix, vec1, vec2, D, N, rowMajor, bcastAlongRows, op, stream); } @@ -139,7 +128,6 @@ void matrixVectorOp(OutT* out, * @tparam VecValueType the data-type of the input vector * @tparam LayoutPolicy the layout of input and output (raft::row_major or raft::col_major) * @tparam Lambda a device function which represents a binary operator - * @tparam OutElementType the data-type of the output raft::matrix_view * @tparam IndexType Integer used for addressing * @tparam TPB threads per block of the cuda kernel launched * @param[in] handle raft::handle_t @@ -154,13 +142,12 @@ template void matrix_vector_op(const raft::handle_t& handle, raft::device_matrix_view matrix, raft::device_vector_view vec, - raft::device_matrix_view out, + raft::device_matrix_view out, Apply apply, Lambda op) { @@ -198,12 +185,11 @@ void matrix_vector_op(const raft::handle_t& handle, * Note : the function will also check that the size of the window of accesses * is a multiple of the number of elements processed by a thread in order to * enable faster processing - * @tparam MatValueType the data-type of the input matrix + * @tparam MatValueType the data-type of the input and output matrices * @tparam Vec1ValueType the data-type of the first input vector * @tparam Vec2ValueType the data-type of the second input vector * @tparam LayoutPolicy the layout of input and output (raft::row_major or raft::col_major) * @tparam Lambda a device function which represents a binary operator - * @tparam OutElementType the data-type of the output raft::matrix_view * @tparam IndexType Integer used for addressing * @tparam TPB threads per block of the cuda kernel launched * @param handle raft::handle_t @@ -220,14 +206,13 @@ template void matrix_vector_op(const raft::handle_t& handle, raft::device_matrix_view matrix, raft::device_vector_view vec1, raft::device_vector_view vec2, - raft::device_matrix_view out, + raft::device_matrix_view out, Apply apply, Lambda op) { diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index ccce54390b..0b2c54de88 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -49,14 +49,9 @@ inline void gen_uniform(const raft::handle_t& handle, raft::random::RngState& rn // Or else, we get the following compilation error // for an extended __device__ lambda cannot have private or protected access // within its class -template +template void matrixVectorOpLaunch(const raft::handle_t& handle, - OutT* out, + MatT* out, const MatT* in, const Vec1T* vec1, const Vec2T* vec2, @@ -65,10 +60,10 @@ void matrixVectorOpLaunch(const raft::handle_t& handle, bool rowMajor, bool bcastAlongRows) { - auto out_row_major = raft::make_device_matrix_view(out, N, D); + auto out_row_major = raft::make_device_matrix_view(out, N, D); auto in_row_major = raft::make_device_matrix_view(in, N, D); - auto out_col_major = raft::make_device_matrix_view(out, N, D); + auto out_col_major = raft::make_device_matrix_view(out, N, D); auto in_col_major = raft::make_device_matrix_view(in, N, D); auto apply = bcastAlongRows ? 
Apply::ALONG_ROWS : Apply::ALONG_COLUMNS; @@ -92,9 +87,8 @@ void matrixVectorOpLaunch(const raft::handle_t& handle, } template class MatVecOpTest : public ::testing::TestWithParam> { @@ -160,8 +154,8 @@ class MatVecOpTest : public ::testing::TestWithParam> { MatVecOpInputs params; rmm::device_uvector in; - rmm::device_uvector out; - rmm::device_uvector out_ref; + rmm::device_uvector out; + rmm::device_uvector out_ref; rmm::device_uvector vec1; rmm::device_uvector vec2; }; @@ -225,25 +219,25 @@ MVTEST(MatVecOpTestD_i64_add2vec, inputs_i64, MV_EPS_D); * This set of tests covers cases with different types. */ -template +template struct MulAndAdd { static constexpr bool useTwoVectors = true; - HDI OutT operator()(MatT a, Vec1T b, Vec2T c) const { return a * b + c; }; + HDI MatT operator()(MatT a, Vec1T b, Vec2T c) const { return a * b + c; }; }; -typedef MatVecOpTest, float, int, float, int32_t, float> - MatVecOpTestF_i32_MulAndAdd_f_i32_f; -typedef MatVecOpTest, float, int, float, int32_t, double> - MatVecOpTestF_i32_MulAndAdd_f_i32_d; -typedef MatVecOpTest, float, int, float, int64_t, float> - MatVecOpTestF_i32_MulAndAdd_f_i64_f; -typedef MatVecOpTest, double, int, double, int32_t, float> - MatVecOpTestD_i32_MulAndAdd_d_i32_f; - -MVTEST(MatVecOpTestF_i32_MulAndAdd_f_i32_f, inputs_i32, MV_EPS_F); -MVTEST(MatVecOpTestF_i32_MulAndAdd_f_i32_d, inputs_i32, MV_EPS_F); -MVTEST(MatVecOpTestF_i32_MulAndAdd_f_i64_f, inputs_i32, MV_EPS_F); -MVTEST(MatVecOpTestD_i32_MulAndAdd_d_i32_f, inputs_i32, (double)MV_EPS_F); +typedef MatVecOpTest, float, int, int32_t, float> + MatVecOpTestF_i32_MulAndAdd_i32_f; +typedef MatVecOpTest, float, int, int32_t, double> + MatVecOpTestF_i32_MulAndAdd_i32_d; +typedef MatVecOpTest, float, int, int64_t, float> + MatVecOpTestF_i32_MulAndAdd_i64_f; +typedef MatVecOpTest, double, int, int32_t, float> + MatVecOpTestD_i32_MulAndAdd_i32_f; + +MVTEST(MatVecOpTestF_i32_MulAndAdd_i32_f, inputs_i32, MV_EPS_F); +MVTEST(MatVecOpTestF_i32_MulAndAdd_i32_d, inputs_i32, MV_EPS_F); +MVTEST(MatVecOpTestF_i32_MulAndAdd_i64_f, inputs_i32, MV_EPS_F); +MVTEST(MatVecOpTestD_i32_MulAndAdd_i32_f, inputs_i32, (double)MV_EPS_F); } // end namespace linalg } // end namespace raft From dcfd96277632457763e00e8fdb3abca282d34023 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Thu, 20 Oct 2022 15:56:25 +0200 Subject: [PATCH 15/25] Replace for_each with linalg::add + fix syntax error --- .../knn/detail/ann_kmeans_balanced.cuh | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh index df822ef009..fd009b30af 100644 --- a/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh +++ b/cpp/include/raft/spatial/knn/detail/ann_kmeans_balanced.cuh @@ -292,24 +292,17 @@ void calc_centers_and_sizes(const handle_t& handle, static_cast(n_clusters), workspace); - // Add previous sizes if necessary and cast to float - // todo(lsugy): add wrapped in if - auto counting = thrust::make_counting_iterator(0); - thrust::for_each( - handle.get_thrust_policy(), counting, counting + n_clusters, [=] __device__(int idx) { - uint32_t temp_size = temp_sizes[idx]; - if (!reset_counters) { - temp_size += cluster_sizes[idx]; - cluster_sizes[idx] = temp_size; - } - }); + // Add previous sizes if necessary + if (!reset_counters) { + raft::linalg::add(cluster_sizes, cluster_sizes, temp_sizes, n_clusters, stream); + } raft::linalg::matrixVectorOp( centers, centers, cluster_sizes, - 
static_castdim, - static_castn_clusters, + static_cast(dim), + static_cast(n_clusters), true, false, [=] __device__(float mat, uint32_t vec) { From 4718e5bc73b0235c34581ed7f6f9c320d7e7b347 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Thu, 20 Oct 2022 16:49:10 +0200 Subject: [PATCH 16/25] Clang-format fix --- cpp/include/raft/linalg/detail/matrix_vector_op.cuh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh index 807bacc470..62ec9bb7a4 100644 --- a/cpp/include/raft/linalg/detail/matrix_vector_op.cuh +++ b/cpp/include/raft/linalg/detail/matrix_vector_op.cuh @@ -22,11 +22,7 @@ namespace raft { namespace linalg { namespace detail { -template +template void matrixVectorOp(MatT* out, const MatT* matrix, const VecT* vec, From fc55921b4a6ba44367c1e5185636dec5530e1a55 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Fri, 21 Oct 2022 14:58:43 +0200 Subject: [PATCH 17/25] used alignedLen instead of totalLen in max block number calculation --- cpp/include/raft/matrix/detail/linewise_op.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh index 9b40108289..67177d7d05 100644 --- a/cpp/include/raft/matrix/detail/linewise_op.cuh +++ b/cpp/include/raft/matrix/detail/linewise_op.cuh @@ -517,10 +517,9 @@ void matrixLinewiseVecRows(Type* out, const uint expected_grid_size = rowLen / raft::gcd(block_work_size, uint(rowLen)); // Minimum size of the grid to make the device well occupied const uint occupy = getOptimalGridSize(); - // todo(lsugy): alignedLen instead of totalLen? const dim3 gs(std::min( // does not make sense to have more blocks than this - raft::ceildiv(uint(totalLen), block_work_size), + raft::ceildiv(uint(alignedLen), block_work_size), // increase the grid size to be not less than `occupy` while // still being the multiple of `expected_grid_size` raft::ceildiv(occupy, expected_grid_size) * expected_grid_size), From 5421dda583e4f5246467dcba43746e3db3e58df7 Mon Sep 17 00:00:00 2001 From: Louis Sugy Date: Wed, 26 Oct 2022 12:28:39 +0200 Subject: [PATCH 18/25] Add misalignments to matrix-vector-op test --- cpp/include/raft/util/itertools.hpp | 54 +++++++++++++++++ cpp/test/linalg/matrix_vector_op.cu | 93 +++++++++++++++-------------- 2 files changed, 102 insertions(+), 45 deletions(-) create mode 100644 cpp/include/raft/util/itertools.hpp diff --git a/cpp/include/raft/util/itertools.hpp b/cpp/include/raft/util/itertools.hpp new file mode 100644 index 0000000000..025778659b --- /dev/null +++ b/cpp/include/raft/util/itertools.hpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +/** + * Helpers inspired by the Python itertools library + * + */ + +namespace raft { +namespace itertools { + +template +inline std::vector product(std::index_sequence index, const std::vector&... vecs) +{ + size_t len = 1; + ((len *= vecs.size()), ...); + std::vector out; + out.reserve(len); + for (size_t i = 0; i < len; i++) { + std::tuple tup; + size_t mod = len, new_mod; + ((new_mod = mod / vecs.size(), std::get(tup) = vecs[(i % mod) / new_mod], mod = new_mod), + ...); + out.push_back({std::get(tup)...}); + } + return out; +} + +template +std::vector product(std::initializer_list... lists) +{ + return product(std::index_sequence_for(), (std::vector(lists))...); +} + +}; // namespace itertools +}; // namespace raft diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu index 0b2c54de88..5a6320435f 100644 --- a/cpp/test/linalg/matrix_vector_op.cu +++ b/cpp/test/linalg/matrix_vector_op.cu @@ -19,6 +19,7 @@ #include #include #include +#include namespace raft { namespace linalg { @@ -27,6 +28,7 @@ template struct MatVecOpInputs { IdxType rows, cols; bool rowMajor, bcastAlongRows; + IdxType inAlignOffset, outAlignOffset; unsigned long long int seed; }; @@ -94,55 +96,58 @@ template > { public: MatVecOpTest() - : params(::testing::TestWithParam>::GetParam()), - stream(handle.get_stream()), - in(params.rows * params.cols, stream), - out_ref(params.rows * params.cols, stream), - out(params.rows * params.cols, stream), - vec1(params.bcastAlongRows ? params.cols : params.rows, stream), - vec2(params.bcastAlongRows ? params.cols : params.rows, stream) + : stream(handle.get_stream()), + params(::testing::TestWithParam>::GetParam()), + vec_size(params.bcastAlongRows ? params.cols : params.rows), + in(params.rows * params.cols + params.inAlignOffset, stream), + out_ref(params.rows * params.cols + params.outAlignOffset, stream), + out(params.rows * params.cols + params.outAlignOffset, stream), + vec1(vec_size, stream), + vec2(vec_size, stream) { } protected: void SetUp() override { + MatT* in_ptr = in.data() + params.inAlignOffset; + MatT* out_ptr = out.data() + params.outAlignOffset; + MatT* out_ref_ptr = out_ref.data() + params.outAlignOffset; + raft::random::RngState r(params.seed); - IdxType N = params.rows, D = params.cols; - IdxType len = N * D; - IdxType vecLen = params.bcastAlongRows ? 
D : N; - gen_uniform(handle, r, in.data(), len); - gen_uniform(handle, r, vec1.data(), vecLen); - gen_uniform(handle, r, vec2.data(), vecLen); + IdxType len = params.rows * params.cols; + gen_uniform(handle, r, in_ptr, len); + gen_uniform(handle, r, vec1.data(), vec_size); + gen_uniform(handle, r, vec2.data(), vec_size); if constexpr (OpT::useTwoVectors) { - naiveMatVec(out_ref.data(), - in.data(), + naiveMatVec(out_ref_ptr, + in_ptr, vec1.data(), vec2.data(), - D, - N, + params.cols, + params.rows, params.rowMajor, params.bcastAlongRows, OpT{}, stream); } else { - naiveMatVec(out_ref.data(), - in.data(), + naiveMatVec(out_ref_ptr, + in_ptr, vec1.data(), - D, - N, + params.cols, + params.rows, params.rowMajor, params.bcastAlongRows, OpT{}, stream); } matrixVectorOpLaunch(handle, - out.data(), - in.data(), + out_ptr, + in_ptr, vec1.data(), vec2.data(), - D, - N, + params.cols, + params.rows, params.rowMajor, params.bcastAlongRows); handle.sync_stream(); @@ -153,6 +158,7 @@ class MatVecOpTest : public ::testing::TestWithParam> { cudaStream_t stream; MatVecOpInputs params; + IdxType vec_size; rmm::device_uvector in; rmm::device_uvector out; rmm::device_uvector out_ref; @@ -160,12 +166,14 @@ class MatVecOpTest : public ::testing::TestWithParam> { rmm::device_uvector vec2; }; -#define MVTEST(TestClass, inputs, tolerance) \ - TEST_P(TestClass, Result) \ - { \ - ASSERT_TRUE(devArrMatch( \ - out_ref.data(), out.data(), params.rows* params.cols, CompareApprox(tolerance))); \ - } \ +#define MVTEST(TestClass, inputs, tolerance) \ + TEST_P(TestClass, Result) \ + { \ + ASSERT_TRUE(devArrMatch(out_ref.data() + params.outAlignOffset, \ + out.data() + params.outAlignOffset, \ + params.rows * params.cols, \ + CompareApprox(tolerance))); \ + } \ INSTANTIATE_TEST_SUITE_P(MatVecOpTests, TestClass, ::testing::ValuesIn(inputs)) #define MV_EPS_F 0.00001f @@ -175,16 +183,11 @@ class MatVecOpTest : public ::testing::TestWithParam> { * This set of tests covers cases where all the types are the same. 
 */
 
-const std::vector<MatVecOpInputs<int>> inputs_i32 = {{1024, 32, true, true, 1234ULL},
-                                                     {1024, 64, true, true, 1234ULL},
-                                                     {1024, 32, true, false, 1234ULL},
-                                                     {1024, 64, true, false, 1234ULL},
-                                                     {1024, 32, false, true, 1234ULL},
-                                                     {1024, 64, false, true, 1234ULL},
-                                                     {1024, 32, false, false, 1234ULL},
-                                                     {1024, 64, false, false, 1234ULL}};
-const std::vector<MatVecOpInputs<size_t>> inputs_i64 = {{2500, 250, false, false, 1234ULL},
-                                                        {2500, 250, false, false, 1234ULL}};
+const std::vector<MatVecOpInputs<int>> inputs_i32 = raft::itertools::product<MatVecOpInputs<int>>(
+  {1024}, {32, 64}, {true, false}, {true, false}, {0, 1, 2}, {0, 1, 2}, {1234ULL});
+const std::vector<MatVecOpInputs<int64_t>> inputs_i64 =
+  raft::itertools::product<MatVecOpInputs<int64_t>>(
+    {2500}, {250}, {false}, {false}, {0, 1}, {0, 1}, {1234ULL});
 
 template <typename T>
 struct Add1Vec {
@@ -199,12 +202,12 @@ struct Add2Vec {
 typedef MatVecOpTest<Add1Vec<float>, float, int> MatVecOpTestF_i32_add1vec;
 typedef MatVecOpTest<Add2Vec<float>, float, int> MatVecOpTestF_i32_add2vec;
-typedef MatVecOpTest<Add1Vec<float>, float, size_t> MatVecOpTestF_i64_add1vec;
-typedef MatVecOpTest<Add2Vec<float>, float, size_t> MatVecOpTestF_i64_add2vec;
+typedef MatVecOpTest<Add1Vec<float>, float, int64_t> MatVecOpTestF_i64_add1vec;
+typedef MatVecOpTest<Add2Vec<float>, float, int64_t> MatVecOpTestF_i64_add2vec;
 typedef MatVecOpTest<Add1Vec<double>, double, int> MatVecOpTestD_i32_add1vec;
 typedef MatVecOpTest<Add2Vec<double>, double, int> MatVecOpTestD_i32_add2vec;
-typedef MatVecOpTest<Add1Vec<double>, double, size_t> MatVecOpTestD_i64_add1vec;
-typedef MatVecOpTest<Add2Vec<double>, double, size_t> MatVecOpTestD_i64_add2vec;
+typedef MatVecOpTest<Add1Vec<double>, double, int64_t> MatVecOpTestD_i64_add1vec;
+typedef MatVecOpTest<Add2Vec<double>, double, int64_t> MatVecOpTestD_i64_add2vec;
 
 MVTEST(MatVecOpTestF_i32_add1vec, inputs_i32, MV_EPS_F);
 MVTEST(MatVecOpTestF_i32_add2vec, inputs_i32, MV_EPS_F);

From ccdc961d6a671f8f4a4384efb6b94db8b42656ec Mon Sep 17 00:00:00 2001
From: Louis Sugy
Date: Wed, 26 Oct 2022 17:06:18 +0200
Subject: [PATCH 19/25] Extend matrix-vector op benchmark

---
 cpp/bench/linalg/matrix_vector_op.cu | 140 +++++++++++++++++++++------
 1 file changed, 108 insertions(+), 32 deletions(-)

diff --git a/cpp/bench/linalg/matrix_vector_op.cu b/cpp/bench/linalg/matrix_vector_op.cu
index aa8f2667ed..aa388955da 100644
--- a/cpp/bench/linalg/matrix_vector_op.cu
+++ b/cpp/bench/linalg/matrix_vector_op.cu
@@ -20,61 +20,137 @@
 
 namespace raft::bench::linalg {
 
+template <typename IdxT>
 struct mat_vec_op_inputs {
-  int rows, cols;
+  IdxT rows, cols;
   bool rowMajor, bcastAlongRows;
+  IdxT inAlignOffset, outAlignOffset;
 };  // struct mat_vec_op_inputs
 
+template <typename IdxT>
+inline auto operator<<(std::ostream& os, const mat_vec_op_inputs<IdxT>& p) -> std::ostream&
+{
+  os << p.rows << "#" << p.cols << "#" << p.rowMajor << "#" << p.bcastAlongRows << "#"
+     << p.inAlignOffset << "#" << p.outAlignOffset;
+  return os;
+}
+
-template <typename T>
+template <typename OpT, typename T, typename IdxT>
 struct mat_vec_op : public fixture {
-  mat_vec_op(const mat_vec_op_inputs& p)
+  mat_vec_op(const mat_vec_op_inputs<IdxT>& p)
     : params(p),
-      out(p.rows * p.cols, stream),
-      in(p.rows * p.cols, stream),
-      vec(p.bcastAlongRows ? p.cols : p.rows, stream)
+      out(p.rows * p.cols + params.outAlignOffset, stream),
+      in(p.rows * p.cols + params.inAlignOffset, stream),
+      vec1(p.bcastAlongRows ? p.cols : p.rows, stream),
+      vec2(p.bcastAlongRows ? p.cols : p.rows, stream)
   {
   }
 
   void run_benchmark(::benchmark::State& state) override
   {
+    std::ostringstream label_stream;
+    label_stream << params;
+    state.SetLabel(label_stream.str());
+
     loop_on_state(state, [this]() {
-      raft::linalg::matrixVectorOp(out.data(),
-                                   in.data(),
-                                   vec.data(),
-                                   params.cols,
-                                   params.rows,
-                                   params.rowMajor,
-                                   params.bcastAlongRows,
-                                   raft::Sum<T>(),
-                                   stream);
+      if constexpr (OpT::useTwoVectors) {
+        raft::linalg::matrixVectorOp(out.data() + params.outAlignOffset,
+                                     in.data() + params.inAlignOffset,
+                                     vec1.data(),
+                                     vec2.data(),
+                                     params.cols,
+                                     params.rows,
+                                     params.rowMajor,
+                                     params.bcastAlongRows,
+                                     OpT{},
+                                     stream);
+      } else {
+        raft::linalg::matrixVectorOp(out.data() + params.outAlignOffset,
+                                     in.data() + params.inAlignOffset,
+                                     vec1.data(),
+                                     params.cols,
+                                     params.rows,
+                                     params.rowMajor,
+                                     params.bcastAlongRows,
+                                     OpT{},
+                                     stream);
+      }
     });
   }
 
  private:
-  mat_vec_op_inputs params;
-  rmm::device_uvector<T> out, in, vec;
+  mat_vec_op_inputs<IdxT> params;
+  rmm::device_uvector<T> out, in, vec1, vec2;
 };  // struct MatVecOp
 
-const std::vector<mat_vec_op_inputs> mat_vec_op_input_vecs{
-  {1024, 128, true, true},       {1024 * 1024, 128, true, true},
-  {1024, 128 + 2, true, true},   {1024 * 1024, 128 + 2, true, true},
-  {1024, 128 + 1, true, true},   {1024 * 1024, 128 + 1, true, true},
+template <typename IdxT>
+std::vector<mat_vec_op_inputs<IdxT>> get_mv_inputs()
+{
+  std::vector<mat_vec_op_inputs<IdxT>> out;
 
-  {1024, 128, true, false},      {1024 * 1024, 128, true, false},
-  {1024, 128 + 2, true, false},  {1024 * 1024, 128 + 2, true, false},
-  {1024, 128 + 1, true, false},  {1024 * 1024, 128 + 1, true, false},
+  // Scalability benchmark with round dimensions
+  std::vector<IdxT> rows = {1000, 100000, 1000000};
+  std::vector<IdxT> cols = {8, 64, 256, 1024};
+  for (bool rowMajor : {true, false}) {
+    for (bool alongRows : {true, false}) {
+      for (IdxT rows_ : rows) {
+        for (IdxT cols_ : cols) {
+          out.push_back({rows_, cols_, rowMajor, alongRows, 0, 0});
+        }
+      }
+    }
+  }
 
-  {1024, 128, false, false},     {1024 * 1024, 128, false, false},
-  {1024, 128 + 2, false, false}, {1024 * 1024, 128 + 2, false, false},
-  {1024, 128 + 1, false, false}, {1024 * 1024, 128 + 1, false, false},
+  // Odd dimensions, misalignment
+  std::vector<std::tuple<IdxT, IdxT>> rowcols = {
+    {44739207, 7},
+    {44739207, 15},
+    {44739207, 16},
+    {44739207, 17},
+    {2611236, 256},
+    {2611236, 257},
+    {2611236, 263},
+  };
+  for (bool rowMajor : {true, false}) {
+    for (bool alongRows : {true, false}) {
+      for (auto rc : rowcols) {
+        for (IdxT inAlignOffset : {0, 1}) {
+          for (IdxT outAlignOffset : {0, 1}) {
+            out.push_back({std::get<0>(rc),
+                           std::get<1>(rc),
+                           rowMajor,
+                           alongRows,
+                           inAlignOffset,
+                           outAlignOffset});
+          }
+        }
+      }
+    }
+  }
  return out;
+}
 
-  {1024, 128, false, true},      {1024 * 1024, 128, false, true},
-  {1024, 128 + 2, false, true},  {1024 * 1024, 128 + 2, false, true},
-  {1024, 128 + 1, false, true},  {1024 * 1024, 128 + 1, false, true},
+const std::vector<mat_vec_op_inputs<int>> mv_input_i32 = get_mv_inputs<int>();
+const std::vector<mat_vec_op_inputs<int64_t>> mv_input_i64 = get_mv_inputs<int64_t>();
 
+template <typename T>
+struct Add1Vec {
+  static constexpr bool useTwoVectors = false;
+  HDI T operator()(T a, T b) const { return a + b; };
+};
+template <typename T>
+struct Add2Vec {
+  static constexpr bool useTwoVectors = true;
+  HDI T operator()(T a, T b, T c) const { return a + b + c; };
 };
 
-RAFT_BENCH_REGISTER(mat_vec_op<float>, "", mat_vec_op_input_vecs);
-RAFT_BENCH_REGISTER(mat_vec_op<double>, "", mat_vec_op_input_vecs);
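+// Register the one- and two-vector add ops for float/double, with both 32- and
+// 64-bit indexing, against the corresponding input grids.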
+RAFT_BENCH_REGISTER((mat_vec_op<Add1Vec<float>, float, int>), "", mv_input_i32);
+RAFT_BENCH_REGISTER((mat_vec_op<Add1Vec<double>, double, int>), "", mv_input_i32);
+RAFT_BENCH_REGISTER((mat_vec_op<Add2Vec<float>, float, int>), "", mv_input_i32);
+RAFT_BENCH_REGISTER((mat_vec_op<Add2Vec<double>, double, int>), "", mv_input_i32);
+RAFT_BENCH_REGISTER((mat_vec_op<Add1Vec<float>, float, int64_t>), "", mv_input_i64);
+RAFT_BENCH_REGISTER((mat_vec_op<Add1Vec<double>, double, int64_t>), "", mv_input_i64);
+RAFT_BENCH_REGISTER((mat_vec_op<Add2Vec<float>, float, int64_t>), "", mv_input_i64);
+RAFT_BENCH_REGISTER((mat_vec_op<Add2Vec<double>, double, int64_t>), "", mv_input_i64);
 
 }  // namespace raft::bench::linalg

From f2f67dbece5f2d797d84b5a9449293c3e9dbc4cf Mon Sep 17 00:00:00 2001
From: Louis Sugy
Date: Fri, 28 Oct 2022 19:09:08 +0200
Subject: [PATCH 20/25] Apply changes to new padded kernel (note: the test is
 still failing, but it also fails upstream)

---
 .../raft/matrix/detail/linewise_op.cuh | 44 +++++++++++--------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/cpp/include/raft/matrix/detail/linewise_op.cuh b/cpp/include/raft/matrix/detail/linewise_op.cuh
index 482571e2ea..37198684ee 100644
--- a/cpp/include/raft/matrix/detail/linewise_op.cuh
+++ b/cpp/include/raft/matrix/detail/linewise_op.cuh
@@ -170,6 +170,7 @@ struct Linewise {
    * of a vector. Most of the time this is not aligned, so we load it thread-striped
    * within a block and then use the shared memory to get a contiguous chunk.
    *
+   * @tparam VecT Type of the vector to load
    * @param [in] shm a shared memory region for rearranging the data among threads
    * @param [in] p pointer to a vector
    * @param [in] blockOffset the offset of the current block into a vector.
@@ -202,6 +203,7 @@ struct Linewise {
   /**
    * @brief Same as loadVec, but pads the data with ones
    *
+   * @tparam VecT Type of the vector to load
    * @param shm a shared memory region for rearranging the data among threads
    * @param p pointer to a vector
    * @param blockOffset the offset of the current block into a vector
   * @param rowLen the length of the vector (without padding)
   * @param rowLenPadded the length of the vector including padding
    * @return a contiguous chunk of a vector, suitable for `vectorRows`.
    */
-  static __device__ __forceinline__ Vec loadVecPadded(Type* shm,
-                                                      const Type* p,
-                                                      const IdxType blockOffset,
-                                                      const IdxType rowLen,
-                                                      const IdxType rowLenPadded) noexcept
+  template <typename VecT>
+  static __device__ __forceinline__ VecArg<VecT> loadVecPadded(
+    VecT* shm,
+    const VecT* p,
+    const IdxType blockOffset,
+    const IdxType rowLen,
+    const IdxType rowLenPadded) noexcept
   {
     IdxType j = blockOffset + threadIdx.x;
 #pragma unroll VecElems
     for (int k = threadIdx.x; k < VecElems * BlockSize; k += BlockSize, j += BlockSize) {
       while (j >= rowLenPadded) j -= rowLenPadded;
-      shm[k] = j < rowLen ? p[j] : Type(1);
+      shm[k] = j < rowLen ? p[j] : VecT(1);
     }
     __syncthreads();
     {
-      Vec out;
-      *out.vectorized_data() = reinterpret_cast<typename Vec::io_t*>(shm)[threadIdx.x];
+      VecArg<VecT> out;
+#pragma unroll VecElems
+      for (int i = 0; i < VecElems; i++)
+        out.val[i] = shm[threadIdx.x * VecElems + i];
       return out;
     }
   }
@@ -414,21 +420,23 @@ __global__ void __launch_bounds__(BlockSize)
                        const IdxType rowLenPadded,
                        const IdxType lenPadded,
                        Lambda op,
-                       Vecs... vecs)
+                       const Vecs*... vecs)
 {
   typedef Linewise<Type, IdxType, VecBytes, BlockSize> L;
-  constexpr uint workSize = L::VecElems * BlockSize;
-  uint workOffset         = workSize;
-  __shared__ __align__(sizeof(Type) * L::VecElems)
-    Type shm[workSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)];
+  constexpr uint workSize         = L::VecElems * BlockSize;
+  constexpr size_t maxVecItemSize = maxSizeOf<Vecs...>();
+  uint workOffset                 = workSize * maxVecItemSize;
+  __shared__ __align__(
+    maxVecItemSize *
+    L::VecElems) char shm[workSize * maxVecItemSize * ((sizeof...(Vecs)) > 1 ? 2 : 1)];
   const IdxType blockOffset = (BlockSize * L::VecElems * blockIdx.x) % rowLenPadded;
   return L::vectorRows(
     reinterpret_cast<typename L::Vec::io_t*>(out),
     reinterpret_cast<typename L::Vec::io_t*>(in),
     L::AlignElems::div(lenPadded),
     op,
-    (workOffset ^= workSize,
-     L::loadVecPadded(shm + workOffset, vecs, blockOffset, rowLen, rowLenPadded))...);
+    (workOffset ^= workSize * maxVecItemSize,
+     L::loadVecPadded((Vecs*)(shm + workOffset), vecs, blockOffset, rowLen, rowLenPadded))...);
 }
 
 /**
@@ -570,7 +578,7 @@ void matrixLinewiseVecColsSpan(
   const IdxType nRows,
   Lambda op,
   cudaStream_t stream,
-  Vecs... vecs)
+  const Vecs*... vecs)
 {
   typedef raft::Pow2<VecBytes> AlignBytes;
   constexpr std::size_t VecElems = VecBytes / sizeof(Type);
@@ -688,7 +696,7 @@ void matrixLinewiseVecRowsSpan(
   const IdxType nRows,
   Lambda op,
   cudaStream_t stream,
-  Vecs... vecs)
+  const Vecs*... vecs)
 {
   constexpr std::size_t VecElems = VecBytes / sizeof(Type);
   typedef raft::Pow2<VecBytes> AlignBytes;
@@ -779,7 +787,7 @@ struct MatrixLinewiseOp {
     const bool alongLines,
     Lambda op,
     cudaStream_t stream,
-    Vecs... vecs)
+    const Vecs*... vecs)
   {
     constexpr auto is_rowmajor = std::is_same_v<LayoutPolicy, raft::row_major>;
     constexpr auto is_colmajor = std::is_same_v<LayoutPolicy, raft::col_major>;

From 1450baedc5ace3df4a749bb2f1a8a0e28b26557d Mon Sep 17 00:00:00 2001
From: Louis Sugy
Date: Fri, 28 Oct 2022 19:14:51 +0200
Subject: [PATCH 21/25] Put itertools in util namespace

---
 cpp/include/raft/util/itertools.hpp | 6 ++----
 cpp/test/linalg/matrix_vector_op.cu | 7 ++++---
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/cpp/include/raft/util/itertools.hpp b/cpp/include/raft/util/itertools.hpp
index 025778659b..e6bb66e7ef 100644
--- a/cpp/include/raft/util/itertools.hpp
+++ b/cpp/include/raft/util/itertools.hpp
@@ -24,8 +24,7 @@
  *
  */
 
-namespace raft {
-namespace itertools {
+namespace raft::util::itertools {
 
 template <typename S, size_t... Is, typename... Args>
 inline std::vector<S> product(std::index_sequence<Is...> index, const std::vector<Args>&... vecs)
@@ -50,5 +49,4 @@ std::vector<S> product(std::initializer_list<Args>... lists)
   return product<S>(std::index_sequence_for<Args...>(), (std::vector<Args>(lists))...);
 }
 
-};  // namespace itertools
-};  // namespace raft
+}  // namespace raft::util::itertools

diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu
index 3757e19274..c0355b0385 100644
--- a/cpp/test/linalg/matrix_vector_op.cu
+++ b/cpp/test/linalg/matrix_vector_op.cu
@@ -184,10 +184,11 @@ class MatVecOpTest : public ::testing::TestWithParam<MatVecOpInputs<IdxType>> {
  * This set of tests covers cases where all the types are the same.
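 * The input grids are built with the itertools product helper; e.g. the i32 grid
 * expands to 2 * 2 * 2 * 3 * 3 = 72 cases, where the two {0, 1, 2} lists are
 * input/output pointer offsets used to also cover misaligned accesses.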
 */
 
-const std::vector<MatVecOpInputs<int>> inputs_i32 = raft::itertools::product<MatVecOpInputs<int>>(
-  {1024}, {32, 64}, {true, false}, {true, false}, {0, 1, 2}, {0, 1, 2}, {1234ULL});
+const std::vector<MatVecOpInputs<int>> inputs_i32 =
+  raft::utils::itertools::product<MatVecOpInputs<int>>(
+    {1024}, {32, 64}, {true, false}, {true, false}, {0, 1, 2}, {0, 1, 2}, {1234ULL});
 const std::vector<MatVecOpInputs<int64_t>> inputs_i64 =
-  raft::itertools::product<MatVecOpInputs<int64_t>>(
+  raft::utils::itertools::product<MatVecOpInputs<int64_t>>(
     {2500}, {250}, {false}, {false}, {0, 1}, {0, 1}, {1234ULL});
 
 template <typename T>

From 7bcbbe2ebc7507b222825f1e9243a4903b36efee Mon Sep 17 00:00:00 2001
From: Louis Sugy
Date: Fri, 28 Oct 2022 19:16:32 +0200
Subject: [PATCH 22/25] Remove TPB from public API (it wasn't even forwarded
 to the actual implementation)

---
 cpp/include/raft/linalg/matrix_vector_op.cuh | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/cpp/include/raft/linalg/matrix_vector_op.cuh b/cpp/include/raft/linalg/matrix_vector_op.cuh
index d673769dc3..d68be838b0 100644
--- a/cpp/include/raft/linalg/matrix_vector_op.cuh
+++ b/cpp/include/raft/linalg/matrix_vector_op.cuh
@@ -39,7 +39,6 @@ namespace linalg {
  * @tparam Lambda a device function which represents a binary operator
  * @tparam VecT the input vector type
  * @tparam IdxType Integer type used for addressing
- * @tparam TPB threads per block of the cuda kernel launched
  * @param out the output matrix (passing out = matrix makes it in-place)
  * @param matrix the input matrix
  * @param vec the vector
@@ -51,7 +50,7 @@ namespace linalg {
  * @param op the mathematical operation
 * @param stream cuda stream where to launch work
  */
-template <typename MatT, typename Lambda, typename VecT, typename IdxType = int, int TPB = 256>
+template <typename MatT, typename Lambda, typename VecT, typename IdxType = int>
 void matrixVectorOp(MatT* out,
                     const MatT* matrix,
                     const VecT* vec,
@@ -78,7 +77,6 @@ void matrixVectorOp(MatT* out,
  * @tparam Vec1T the first input vector type
  * @tparam Vec2T the second input vector type
  * @tparam IdxType Integer type used for addressing
- * @tparam TPB threads per block of the cuda kernel launched
  * @param out the output matrix (passing out = matrix makes it in-place)
  * @param matrix the input matrix
  * @param vec1 the first vector
@@ -91,12 +89,7 @@ void matrixVectorOp(MatT* out,
  * @param op the mathematical operation
  * @param stream cuda stream where to launch work
  */
-template <typename MatT,
-          typename Lambda,
-          typename Vec1T,
-          typename Vec2T,
-          typename IdxType = int,
-          int TPB = 256>
+template <typename MatT, typename Lambda, typename Vec1T, typename Vec2T, typename IdxType = int>
 void matrixVectorOp(MatT* out,
                     const MatT* matrix,
                     const Vec1T* vec1,
@@ -129,7 +122,6 @@ void matrixVectorOp(MatT* out,
  * @tparam LayoutPolicy the layout of input and output (raft::row_major or raft::col_major)
  * @tparam Lambda a device function which represents a binary operator
  * @tparam IndexType Integer used for addressing
- * @tparam TPB threads per block of the cuda kernel launched
 * @param[in] handle raft::handle_t
 * @param[in] matrix input raft::matrix_view
 * @param[in] vec vector raft::vector_view
@@ -142,8 +134,7 @@
 template <typename InputValueType,
           typename OutputValueType,
           typename LayoutPolicy,
           typename Lambda,
-          typename IndexType,
-          int TPB>
+          typename IndexType>
 void matrix_vector_op(const raft::handle_t& handle,
                       raft::device_matrix_view<const InputValueType, IndexType, LayoutPolicy> matrix,
                       raft::device_vector_view<const InputValueType, IndexType> vec,
@@ -191,7 +182,6 @@ void matrix_vector_op(const raft::handle_t& handle,
  * @tparam LayoutPolicy the layout of input and output (raft::row_major or raft::col_major)
  * @tparam Lambda a device function which represents a binary operator
  * @tparam IndexType Integer used for addressing
- * @tparam TPB threads per block of the cuda kernel launched
 * @param handle raft::handle_t
 * @param matrix input raft::matrix_view
 * @param vec1 the first vector raft::vector_view
@@ -206,8 +196,7 @@
 template <typename InputValueType,
           typename OutputValueType,
           typename LayoutPolicy,
           typename Lambda,
-          typename IndexType,
-          int TPB>
+          typename IndexType>
 void matrix_vector_op(const raft::handle_t& handle,
                       raft::device_matrix_view<const InputValueType, IndexType, LayoutPolicy> matrix,
                       raft::device_vector_view<const InputValueType, IndexType> vec1,
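
For illustration, a call to the simplified raw-pointer API might look like this (a
sketch under assumed setup: `handle` is a raft::handle_t and `out`, `matrix`, `vec` are
pre-allocated device buffers; the names and dimensions here are illustrative):

  // Broadcast-add vec (length D) along the rows of the N x D row-major matrix.
  raft::linalg::matrixVectorOp(
    out, matrix, vec, D, N, true, true, raft::Sum<float>(), handle.get_stream());
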
From c7942b00f38bb60b523fa10e47201853583de80a Mon Sep 17 00:00:00 2001
From: Louis Sugy
Date: Fri, 28 Oct 2022 19:58:49 +0200
Subject: [PATCH 23/25] Fix utils -> util

---
 cpp/test/linalg/matrix_vector_op.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu
index c0355b0385..67bdfb74fc 100644
--- a/cpp/test/linalg/matrix_vector_op.cu
+++ b/cpp/test/linalg/matrix_vector_op.cu
@@ -185,10 +185,10 @@ class MatVecOpTest : public ::testing::TestWithParam<MatVecOpInputs<IdxType>> {
  */
 
 const std::vector<MatVecOpInputs<int>> inputs_i32 =
-  raft::utils::itertools::product<MatVecOpInputs<int>>(
+  raft::util::itertools::product<MatVecOpInputs<int>>(
     {1024}, {32, 64}, {true, false}, {true, false}, {0, 1, 2}, {0, 1, 2}, {1234ULL});
 const std::vector<MatVecOpInputs<int64_t>> inputs_i64 =
-  raft::utils::itertools::product<MatVecOpInputs<int64_t>>(
+  raft::util::itertools::product<MatVecOpInputs<int64_t>>(
     {2500}, {250}, {false}, {false}, {0, 1}, {0, 1}, {1234ULL});
 
 template <typename T>

From 68fd977d70b5762da065cfe61925ecfd7f99f3a1 Mon Sep 17 00:00:00 2001
From: Louis Sugy
Date: Tue, 8 Nov 2022 18:43:13 +0100
Subject: [PATCH 24/25] Move product auxiliary function to itertools::detail

---
 cpp/include/raft/util/detail/itertools.hpp | 41 ++++++++++++++++++++++
 cpp/include/raft/util/itertools.hpp        | 35 ++++++++----------
 2 files changed, 56 insertions(+), 20 deletions(-)
 create mode 100644 cpp/include/raft/util/detail/itertools.hpp

diff --git a/cpp/include/raft/util/detail/itertools.hpp b/cpp/include/raft/util/detail/itertools.hpp
new file mode 100644
index 0000000000..1908d90b95
--- /dev/null
+++ b/cpp/include/raft/util/detail/itertools.hpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <tuple>
+#include <vector>
+
+namespace raft::util::itertools::detail {
+
+// Element i of the result is built by decoding i as a mixed-radix number whose digits
+// index the input vectors, walking from the most significant digit (first vector) to
+// the least significant one (last vector) in a fold expression.
+template <typename S, size_t... Is, typename... Args>
+inline std::vector<S> product(std::index_sequence<Is...> index, const std::vector<Args>&... vecs)
+{
+  size_t len = 1;
+  ((len *= vecs.size()), ...);
+  std::vector<S> out;
+  out.reserve(len);
+  for (size_t i = 0; i < len; i++) {
+    std::tuple<Args...> tup;
+    size_t mod = len, new_mod;
+    ((new_mod = mod / vecs.size(), std::get<Is>(tup) = vecs[(i % mod) / new_mod], mod = new_mod),
+     ...);
+    out.push_back({std::get<Is>(tup)...});
+  }
+  return out;
+}
+
+}  // namespace raft::util::itertools::detail

diff --git a/cpp/include/raft/util/itertools.hpp b/cpp/include/raft/util/itertools.hpp
index e6bb66e7ef..493ac9befe 100644
--- a/cpp/include/raft/util/itertools.hpp
+++ b/cpp/include/raft/util/itertools.hpp
@@ -16,8 +16,7 @@
 
 #pragma once
 
-#include <tuple>
-#include <vector>
+#include <raft/util/detail/itertools.hpp>
 
 /**
  * Helpers inspired by the Python itertools library
  *
  */
 
 namespace raft::util::itertools {
 
-template <typename S, size_t... Is, typename... Args>
-inline std::vector<S> product(std::index_sequence<Is...> index, const std::vector<Args>&... vecs)
-{
-  size_t len = 1;
-  ((len *= vecs.size()), ...);
-  std::vector<S> out;
-  out.reserve(len);
-  for (size_t i = 0; i < len; i++) {
-    std::tuple<Args...> tup;
-    size_t mod = len, new_mod;
-    ((new_mod = mod / vecs.size(), std::get<Is>(tup) = vecs[(i % mod) / new_mod], mod = new_mod),
-     ...);
-    out.push_back({std::get<Is>(tup)...});
-  }
-  return out;
-}
-
+/**
+ * @brief Cartesian product of the given initializer lists.
+ *
+ * This helper can be used to easily define input parameters in tests/benchmarks.
+ * Note that it's not optimized for use with large lists / many lists in
+ * performance-critical code!
+ *
+ * @tparam S Type of the output structures.
+ * @tparam Args Types of the elements of the initializer lists, matching the types of
+ *              the first fields of the structure (if the structure has more fields,
+ *              some might be initialized with their default value).
+ * @param lists One or more initializer lists.
+ * @return std::vector<S> A vector of structures containing the cartesian product.
+ */
 template <typename S, typename... Args>
 std::vector<S> product(std::initializer_list<Args>... lists)
 {
-  return product<S>(std::index_sequence_for<Args...>(), (std::vector<Args>(lists))...);
+  return detail::product<S>(std::index_sequence_for<Args...>(), (std::vector<Args>(lists))...);
 }
 
 }  // namespace raft::util::itertools

From b2be0c1195ca749f353a30c860e03f34edf0a2db Mon Sep 17 00:00:00 2001
From: Louis Sugy
Date: Tue, 8 Nov 2022 19:16:14 +0100
Subject: [PATCH 25/25] Add test case for int8_t matrix with float vectors

---
 cpp/test/linalg/matrix_vector_op.cu | 60 +++++++++++++++++++----------
 1 file changed, 40 insertions(+), 20 deletions(-)

diff --git a/cpp/test/linalg/matrix_vector_op.cu b/cpp/test/linalg/matrix_vector_op.cu
index 67bdfb74fc..1c96c3fc74 100644
--- a/cpp/test/linalg/matrix_vector_op.cu
+++ b/cpp/test/linalg/matrix_vector_op.cu
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include <type_traits>
 
 namespace raft {
 namespace linalg {
@@ -167,14 +168,21 @@ class MatVecOpTest : public ::testing::TestWithParam<MatVecOpInputs<IdxType>> {
   rmm::device_uvector<Vec2T> vec2;
 };
 
-#define MVTEST(TestClass, inputs, tolerance)                           \
-  TEST_P(TestClass, Result)                                            \
-  {                                                                    \
-    ASSERT_TRUE(devArrMatch(out_ref.data() + params.outAlignOffset,    \
-                            out.data() + params.outAlignOffset,        \
-                            params.rows * params.cols,                 \
-                            CompareApprox<MatT>(tolerance)));          \
-  }                                                                    \
+#define MVTEST(TestClass, OutType, inputs, tolerance)                   \
+  TEST_P(TestClass, Result)                                             \
+  {                                                                     \
+    if constexpr (std::is_floating_point_v<OutType>) {                  \
+      ASSERT_TRUE(devArrMatch(out_ref.data() + params.outAlignOffset,   \
+                              out.data() + params.outAlignOffset,       \
+                              params.rows * params.cols,                \
+                              CompareApprox<OutType>(tolerance)));      \
+    } else {                                                            \
+      ASSERT_TRUE(devArrMatch(out_ref.data() + params.outAlignOffset,   \
+                              out.data() + params.outAlignOffset,       \
+                              params.rows * params.cols,                \
+                              Compare<OutType>()));                     \
+    }                                                                   \
+  }                                                                     \
   INSTANTIATE_TEST_SUITE_P(MatVecOpTests, TestClass, ::testing::ValuesIn(inputs))
 
 #define MV_EPS_F 0.00001f
@@ -211,14 +219,14 @@
 typedef MatVecOpTest<Add2Vec<double>, double, int> MatVecOpTestD_i32_add2vec;
 typedef MatVecOpTest<Add1Vec<double>, double, int64_t> MatVecOpTestD_i64_add1vec;
 typedef MatVecOpTest<Add2Vec<double>, double, int64_t> MatVecOpTestD_i64_add2vec;
 
-MVTEST(MatVecOpTestF_i32_add1vec, inputs_i32, MV_EPS_F);
-MVTEST(MatVecOpTestF_i32_add2vec, inputs_i32, MV_EPS_F);
-MVTEST(MatVecOpTestF_i64_add1vec, inputs_i64, MV_EPS_F);
-MVTEST(MatVecOpTestF_i64_add2vec, inputs_i64, MV_EPS_F);
-MVTEST(MatVecOpTestD_i32_add1vec, inputs_i32, MV_EPS_D);
-MVTEST(MatVecOpTestD_i32_add2vec, inputs_i32, MV_EPS_D);
-MVTEST(MatVecOpTestD_i64_add1vec, inputs_i64, MV_EPS_D);
-MVTEST(MatVecOpTestD_i64_add2vec, inputs_i64, MV_EPS_D);
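+// The new OutType argument selects exact comparison for integer outputs and
+// approximate comparison (within `tolerance`) for floating-point outputs.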
+MVTEST(MatVecOpTestF_i32_add1vec, float, inputs_i32, MV_EPS_F);
+MVTEST(MatVecOpTestF_i32_add2vec, float, inputs_i32, MV_EPS_F);
+MVTEST(MatVecOpTestF_i64_add1vec, float, inputs_i64, MV_EPS_F);
+MVTEST(MatVecOpTestF_i64_add2vec, float, inputs_i64, MV_EPS_F);
+MVTEST(MatVecOpTestD_i32_add1vec, double, inputs_i32, MV_EPS_D);
+MVTEST(MatVecOpTestD_i32_add2vec, double, inputs_i32, MV_EPS_D);
+MVTEST(MatVecOpTestD_i64_add1vec, double, inputs_i64, MV_EPS_D);
+MVTEST(MatVecOpTestD_i64_add2vec, double, inputs_i64, MV_EPS_D);
 
 /*
  * This set of tests covers cases with different types.
@@ -239,10 +247,22 @@ typedef MatVecOpTest<MulAndAdd<float, int64_t, float>, float, int, int64_t, floa
 typedef MatVecOpTest<MulAndAdd<double, int32_t, float>, double, int, int32_t, float>
   MatVecOpTestD_i32_MulAndAdd_i32_f;
 
-MVTEST(MatVecOpTestF_i32_MulAndAdd_i32_f, inputs_i32, MV_EPS_F);
-MVTEST(MatVecOpTestF_i32_MulAndAdd_i32_d, inputs_i32, MV_EPS_F);
-MVTEST(MatVecOpTestF_i32_MulAndAdd_i64_f, inputs_i32, MV_EPS_F);
-MVTEST(MatVecOpTestD_i32_MulAndAdd_i32_f, inputs_i32, (double)MV_EPS_F);
+MVTEST(MatVecOpTestF_i32_MulAndAdd_i32_f, float, inputs_i32, MV_EPS_F);
+MVTEST(MatVecOpTestF_i32_MulAndAdd_i32_d, float, inputs_i32, MV_EPS_F);
+MVTEST(MatVecOpTestF_i32_MulAndAdd_i64_f, float, inputs_i32, MV_EPS_F);
+MVTEST(MatVecOpTestD_i32_MulAndAdd_i32_f, double, inputs_i32, (double)MV_EPS_F);
+
+// Simulates dequantization, computation in float, and requantization back to int8_t:
+// e.g. a = 50 (i.e. 0.5) and b + c = 10.0 give 0.5 * 10.0 / 20.0 = 0.25, stored as 25.
+struct DQMultiply {
+  static constexpr bool useTwoVectors = true;
+  HDI int8_t operator()(int8_t a, float b, float c) const
+  {
+    return static_cast<int8_t>((static_cast<float>(a) / 100.0f * (b + c) / 20.0f) * 100.0f);
+  };
+};
+
+typedef MatVecOpTest<DQMultiply, int8_t, int, float, float> MatVecOpTestI8_i32_DQMultiply_f_f;
+
+MVTEST(MatVecOpTestI8_i32_DQMultiply_f_f, int8_t, inputs_i32, 0);
 }  // end namespace linalg
 }  // end namespace raft
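
---

Usage illustration for the itertools helper above (a standalone sketch; the
`BenchParams` struct and the values are hypothetical, only the product() call reflects
the header introduced in this series):

#include <cstdio>
#include <vector>

#include <raft/util/itertools.hpp>

struct BenchParams {
  int rows, cols;
  bool rowMajor;
};

int main()
{
  // Cartesian product of three lists: 2 * 2 * 2 = 8 structs, aggregate-initialized
  // in field order, with the first list varying slowest.
  std::vector<BenchParams> grid =
    raft::util::itertools::product<BenchParams>({100, 1000}, {8, 16}, {true, false});
  for (const BenchParams& p : grid) {
    std::printf("%d x %d, rowMajor=%d\n", p.rows, p.cols, static_cast<int>(p.rowMajor));
  }
  return 0;
}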