rapidsai · rapids-bot · May 2, 2023 · Apr 22, 2023 · Apr 24, 2023 · Apr 24, 2023
@@ -1315,56 +1315,56 @@ __global__ void __launch_bounds__(128, 8)
   }
 }
 
-// blockDim(128, 1, 1)
-__global__ void __launch_bounds__(128) gpuDecideCompression(device_span<EncColumnChunk> chunks)
+constexpr int decide_compression_warps_in_block = 4;
+constexpr int decide_compression_block_size =
+  decide_compression_warps_in_block * cudf::detail::warp_size;
+
+// blockDim(decide_compression_block_size, 1, 1)
+__global__ void __launch_bounds__(decide_compression_block_size)
+  gpuDecideCompression(device_span<EncColumnChunk> chunks)
 {
-  // After changing the way structs are loaded from coop to normal, this kernel has no business
-  // being launched with 128 thread block. It can easily be a single warp.
-  __shared__ __align__(8) EncColumnChunk ck_g;
-  __shared__ __align__(4) unsigned int error_count;
+  __shared__ __align__(8) EncColumnChunk ck_g[decide_compression_warps_in_block];
+  __shared__ __align__(4) unsigned int compression_error[decide_compression_warps_in_block];
   using warp_reduce = cub::WarpReduce<uint32_t>;
-  __shared__ typename warp_reduce::TempStorage temp_storage[2];
-  __shared__ volatile bool has_compression;
+  __shared__ typename warp_reduce::TempStorage temp_storage[decide_compression_warps_in_block][2];
 
-  uint32_t t                      = threadIdx.x;
-  uint32_t uncompressed_data_size = 0;
-  uint32_t compressed_data_size   = 0;
-  uint32_t num_pages;
+  auto const lane_id  = threadIdx.x % cudf::detail::warp_size;
+  auto const warp_id  = threadIdx.x / cudf::detail::warp_size;
+  auto const chunk_id = blockIdx.x * decide_compression_warps_in_block + warp_id;
 
-  if (t == 0) {
-    ck_g = chunks[blockIdx.x];
-    atomicAnd(&error_count, 0);
-    has_compression = false;
+  if (chunk_id >= chunks.size()) { return; }
+
+  if (lane_id == 0) {
+    ck_g[warp_id]              = chunks[chunk_id];
+    compression_error[warp_id] = 0;
   }
-  __syncthreads();
-  if (t < 32) {
-    num_pages = ck_g.num_pages;
-    for (uint32_t page = t; page < num_pages; page += 32) {
-      auto& curr_page         = ck_g.pages[page];
-      uint32_t page_data_size = curr_page.max_data_size;
-      uncompressed_data_size += page_data_size;
-      if (auto comp_res = curr_page.comp_res; comp_res != nullptr) {
-        has_compression = true;
-        compressed_data_size += comp_res->bytes_written;
-        if (comp_res->status != compression_status::SUCCESS) { atomicAdd(&error_count, 1); }
+  __syncwarp();
+
+  uint32_t uncompressed_data_size = 0;
+  uint32_t compressed_data_size   = 0;
+  auto const num_pages            = ck_g[warp_id].num_pages;
+  for (auto page_id = lane_id; page_id < num_pages; page_id += cudf::detail::warp_size) {
+    auto const& curr_page     = ck_g[warp_id].pages[page_id];
+    auto const page_data_size = curr_page.max_data_size;
+    uncompressed_data_size += page_data_size;
+    if (auto comp_res = curr_page.comp_res; comp_res != nullptr) {
+      compressed_data_size += comp_res->bytes_written;
+      if (comp_res->status != compression_status::SUCCESS) {
+        atomicOr(&compression_error[warp_id], 1);
       }
     }
-    uncompressed_data_size = warp_reduce(temp_storage[0]).Sum(uncompressed_data_size);
-    compressed_data_size   = warp_reduce(temp_storage[1]).Sum(compressed_data_size);
   }
-  __syncthreads();
-  if (t == 0) {
-    bool is_compressed;
-    if (has_compression) {
-      uint32_t compression_error = atomicAdd(&error_count, 0);
-      is_compressed = (!compression_error && compressed_data_size < uncompressed_data_size);
-    } else {
-      is_compressed = false;
-    }
-    chunks[blockIdx.x].is_compressed = is_compressed;
-    chunks[blockIdx.x].bfr_size      = uncompressed_data_size;
-    chunks[blockIdx.x].compressed_size =
-      (is_compressed) ? compressed_data_size : uncompressed_data_size;
+  uncompressed_data_size = warp_reduce(temp_storage[warp_id][0]).Sum(uncompressed_data_size);
+  compressed_data_size   = warp_reduce(temp_storage[warp_id][1]).Sum(compressed_data_size);
+  __syncwarp();
+
+  if (lane_id == 0) {
+    auto const write_compressed = compressed_data_size != 0 and compression_error[warp_id] == 0 and
+                                  compressed_data_size < uncompressed_data_size;
+    chunks[chunk_id].is_compressed = write_compressed;
+    chunks[chunk_id].bfr_size      = uncompressed_data_size;
+    chunks[chunk_id].compressed_size =
+      write_compressed ? compressed_data_size : uncompressed_data_size;
   }
 }
 
@@ -2167,7 +2167,9 @@ void EncodePages(device_span<gpu::EncPage> pages,
 
 void DecideCompression(device_span<EncColumnChunk> chunks, rmm::cuda_stream_view stream)
 {
-  gpuDecideCompression<<<chunks.size(), 128, 0, stream.value()>>>(chunks);
+  auto const num_blocks =
+    util::div_rounding_up_safe<int>(chunks.size(), decide_compression_warps_in_block);
+  gpuDecideCompression<<<num_blocks, decide_compression_block_size, 0, stream.value()>>>(chunks);
 }
 
 void EncodePageHeaders(device_span<EncPage> pages,