diff --git a/cpp/include/raft/stats/detail/meanvar.cuh b/cpp/include/raft/stats/detail/meanvar.cuh index ed411ef74d..1c97326d3b 100644 --- a/cpp/include/raft/stats/detail/meanvar.cuh +++ b/cpp/include/raft/stats/detail/meanvar.cuh @@ -93,46 +93,79 @@ class mean_var { /* NB: current implementation here is not optimal, especially the rowmajor version; leaving this for further work (perhaps, as a more generic "linewiseReduce"). - -Possible improvements: - - 1. (romajor) Process input by the warps, not by blocks (thus reduce the iteration workset), - then aggregate output partially within blocks. - 2. (both) Use vectorized loads to utilize memory better - 3. (rowmajor) Scale the grid size better to utilize more the GPU (like in linewise_op). - + Vectorized loads/stores could speed things up a lot. + */ +/** + * meanvar kernel - row-major version + * + * Assumptions: + * + * 1. blockDim.x == WarpSize + * 2. Dimension X goes along columns (D) + * 3. Dimension Y goes along rows (N) + * + * + * @tparam T element type + * @tparam I indexing type + * @tparam BlockSize must be equal to blockDim.x * blockDim.y * blockDim.z + * @param data input data + * @param mvs meanvars -- output + * @param locks guards for updating meanvars + * @param len total length of input data (N * D) + * @param D number of columns in the input data. */ template -__global__ void meanvar_kernel_rowmajor( - const T* data, volatile mean_var* mvs, int* locks, I len, I D) +__global__ void __launch_bounds__(BlockSize) + meanvar_kernel_rowmajor(const T* data, volatile mean_var* mvs, int* locks, I len, I D) { - const I thread_idx = threadIdx.x + BlockSize * blockIdx.x; + // read the data + const I col = threadIdx.x + blockDim.x * blockIdx.x; mean_var thread_data; - { - const I grid_size = BlockSize * gridDim.x; - for (I i = thread_idx; i < len; i += grid_size) { + if (col < D) { + const I step = D * blockDim.y * gridDim.y; + for (I i = col + D * (threadIdx.y + blockDim.y * blockIdx.y); i < len; i += step) { thread_data += mean_var(data[i]); } } - { - const I col = thread_idx % D; - int* lock = locks + col; - while (atomicCAS(lock, 0, 1) == 1) { + // aggregate within block + if (blockDim.y > 1) { + __shared__ uint8_t shm_bytes[BlockSize * sizeof(mean_var)]; + auto shm = (mean_var*)shm_bytes; + int tid = threadIdx.x + threadIdx.y * blockDim.x; + shm[tid] = thread_data; + for (int bs = BlockSize >> 1; bs >= blockDim.x; bs = bs >> 1) { + __syncthreads(); + if (tid < bs) { shm[tid] += shm[tid + bs]; } + } + thread_data = shm[tid]; + } + + // aggregate across blocks + if (threadIdx.y == 0) { + int* lock = locks + blockIdx.x; + if (threadIdx.x == 0 && col < D) { + while (atomicCAS(lock, 0, 1) == 1) { + __threadfence(); + } + } + __syncthreads(); + if (col < D) { + __threadfence(); + mean_var global_data; + global_data.load(mvs + col); + global_data += thread_data; + global_data.store(mvs + col); __threadfence(); } - __threadfence(); - mean_var global_data; - global_data.load(mvs + col); - global_data += thread_data; - global_data.store(mvs + col); - __threadfence(); - __stwt(lock, 0); + __syncthreads(); + if (threadIdx.x == 0 && col < D) { __stwt(lock, 0); } } } template -__global__ void meanvar_kernel_colmajor(T* mean, T* var, const T* data, I D, I N, bool sample) +__global__ void __launch_bounds__(BlockSize) + meanvar_kernel_colmajor(T* mean, T* var, const T* data, I D, I N, bool sample) { using BlockReduce = cub::BlockReduce, BlockSize>; __shared__ typename BlockReduce::TempStorage shm; @@ -164,21 +197,26 @@ void meanvar( T* mean, T* var, const T* data, I D, I N, bool sample, bool rowMajor, cudaStream_t stream) { if (rowMajor) { - const uint64_t len = uint64_t(D) * uint64_t(N); - ASSERT(len <= uint64_t(std::numeric_limits::max()), "N * D does not fit the indexing type"); - // lcm(row width, block size): - // this way, each thread processes the same column on each iteration. - const uint64_t expected_grid_size = - (uint64_t(N) / raft::gcd(uint64_t(N), uint64_t(BlockSize))) * uint64_t(BlockSize); - const uint gs = - uint(min(expected_grid_size, raft::ceildiv(len, uint64_t(BlockSize)))); - - rmm::device_buffer buf((sizeof(mean_var) + sizeof(int)) * D, stream); + static_assert(BlockSize >= WarpSize, "Block size must be not smaller than the warp size."); + const dim3 bs(WarpSize, BlockSize / WarpSize, 1); + dim3 gs(raft::ceildiv(D, bs.x), raft::ceildiv(N, bs.y), 1); + + // Don't create more blocks than necessary to occupy the GPU + int occupancy; + RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &occupancy, meanvar_kernel_rowmajor, BlockSize, 0)); + gs.y = min(gs.y, raft::ceildiv(occupancy * getMultiProcessorCount(), gs.x)); + + // Global memory: one mean_var for each column + // one lock per all blocks working on the same set of columns + rmm::device_buffer buf(sizeof(mean_var) * D + sizeof(int) * gs.x, stream); RAFT_CUDA_TRY(cudaMemsetAsync(buf.data(), 0, buf.size(), stream)); mean_var* mvs = static_cast*>(buf.data()); int* locks = static_cast(static_cast(mvs + D)); - meanvar_kernel_rowmajor - <<>>(data, mvs, locks, len, D); + + const uint64_t len = uint64_t(D) * uint64_t(N); + ASSERT(len <= uint64_t(std::numeric_limits::max()), "N * D does not fit the indexing type"); + meanvar_kernel_rowmajor<<>>(data, mvs, locks, len, D); meanvar_kernel_fill <<(D, BlockSize), BlockSize, 0, stream>>>(mean, var, mvs, D, sample); } else { diff --git a/cpp/test/stats/meanvar.cu b/cpp/test/stats/meanvar.cu index bebdd7a19d..a5bb5b0b0d 100644 --- a/cpp/test/stats/meanvar.cu +++ b/cpp/test/stats/meanvar.cu @@ -81,25 +81,18 @@ class MeanVarTest : public ::testing::TestWithParam> { rmm::device_uvector data, mean_act, vars_act; }; -const std::vector> inputsf = {{1.f, 2.f, 1024, 32, true, false, 1234ULL}, - {1.f, 2.f, 1024, 64, true, false, 1234ULL}, - {1.f, 2.f, 1024, 128, true, false, 1234ULL}, - {1.f, 2.f, 1024, 256, true, false, 1234ULL}, - {-1.f, 2.f, 1024, 32, false, false, 1234ULL}, - {-1.f, 2.f, 1024, 64, false, false, 1234ULL}, - {-1.f, 2.f, 1024, 128, false, false, 1234ULL}, - {-1.f, 2.f, 1024, 256, false, false, 1234ULL}, - {-1.f, 2.f, 1024, 256, false, false, 1234ULL}, - {-1.f, 2.f, 1024, 257, false, false, 1234ULL}, - {1.f, 2.f, 1024, 32, true, true, 1234ULL}, - {1.f, 2.f, 1024, 64, true, true, 1234ULL}, - {1.f, 2.f, 1024, 128, true, true, 1234ULL}, - {1.f, 2.f, 1024, 256, true, true, 1234ULL}, - {-1.f, 2.f, 1024, 32, false, true, 1234ULL}, - {-1.f, 2.f, 1024, 64, false, true, 1234ULL}, - {-1.f, 2.f, 1024, 128, false, true, 1234ULL}, - {-1.f, 2.f, 1024, 256, false, true, 1234ULL}, - {-1.f, 2.f, 1024, 257, false, true, 1234ULL}}; +const std::vector> inputsf = { + {1.f, 2.f, 1024, 32, true, false, 1234ULL}, {1.f, 2.f, 1024, 64, true, false, 1234ULL}, + {1.f, 2.f, 1024, 128, true, false, 1234ULL}, {1.f, 2.f, 1024, 256, true, false, 1234ULL}, + {-1.f, 2.f, 1024, 32, false, false, 1234ULL}, {-1.f, 2.f, 1024, 64, false, false, 1234ULL}, + {-1.f, 2.f, 1024, 128, false, false, 1234ULL}, {-1.f, 2.f, 1024, 256, false, false, 1234ULL}, + {-1.f, 2.f, 1024, 256, false, false, 1234ULL}, {-1.f, 2.f, 1024, 257, false, false, 1234ULL}, + {1.f, 2.f, 1024, 32, true, true, 1234ULL}, {1.f, 2.f, 1024, 64, true, true, 1234ULL}, + {1.f, 2.f, 1024, 128, true, true, 1234ULL}, {1.f, 2.f, 1024, 256, true, true, 1234ULL}, + {-1.f, 2.f, 1024, 32, false, true, 1234ULL}, {-1.f, 2.f, 1024, 64, false, true, 1234ULL}, + {-1.f, 2.f, 1024, 128, false, true, 1234ULL}, {-1.f, 2.f, 1024, 256, false, true, 1234ULL}, + {-1.f, 2.f, 1024, 257, false, true, 1234ULL}, {-1.f, 2.f, 700, 13, false, true, 1234ULL}, + {10.f, 2.f, 500000, 811, false, true, 1234ULL}}; const std::vector> inputsd = {{1.0, 2.0, 1024, 32, true, false, 1234ULL}, {1.0, 2.0, 1024, 64, true, false, 1234ULL},