Use a smaller grid size for TBE bwd cta_per_row (#1353)
Summary:
Pull Request resolved: #1353

Same idea as D39720886 (e2bfc2e)

Reviewed By: jspark1105

Differential Revision: D39760002

fbshipit-source-id: 2b858b25bcd103ac48c2963f7383f759d4198ac7
sryap committed Sep 26, 2022
1 parent 2f1c351 commit 105df12
Showing 1 changed file with 19 additions and 3 deletions.
fbgemm_gpu/codegen/embedding_backward_split_template.cu
@@ -20,6 +20,18 @@ constexpr size_t kBackwardMaxThreads = 512;
 using Tensor = at::Tensor;
 using namespace fbgemm_gpu;
 
+namespace {
+
+// Based on the empirical study, max grid size that is 64x larger than the
+// number of SMs gives good performance across the board
+constexpr int MAX_THREAD_BLOCKS_FACTOR = 64;
+
+int get_max_thread_blocks_() {
+  return MAX_THREAD_BLOCKS_FACTOR * at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
+}
+
+} // namespace
+
 __global__ __launch_bounds__(kMaxThreads) void
 split_embedding_backward_codegen_{{ optimizer }}_{{ wdesc }}_find_long_segments(
     const at::PackedTensorAccessor32<int32_t, 1, at::RestrictPtrTraits>
@@ -1088,6 +1100,10 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_
         {use_deterministic_algorithms ? 0 : grad_accum_counter.numel(), max_D},
         grad_output.options().dtype(std::is_same<cache_t, double>::value ? at::kDouble : at::kFloat));
+    int32_t grid_size = std::min(
+        div_round_up(long_run_ids.numel(), kMaxThreads),
+        get_max_thread_blocks_());
     // Check https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#shared-memory-7-x
     // "Compute capability 7.x devices allow a single thread block to
     // address the full capacity of shared memory: 96 KB on Volta,
@@ -1127,7 +1143,7 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_
     {% endif %}
         kMaxVecsPerThread,
         kThreadGroupSize>
-        <<<div_round_up(long_run_ids.numel(), kMaxThreads),
+        <<<grid_size,
            dim3(kThreadGroupSize, BT_block_size),
            BT_block_size * sizeof(at::acc_type<{{ "scalar_t" if dense else "cache_t" }}, true>) * 4 * kWarpSize *
                kMaxVecsPerThread,
@@ -1182,9 +1198,9 @@ split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_
         use_deterministic_algorithms,
         {{ args.split_kernel_arg_constructors | join(", ") }});
     C10_CUDA_KERNEL_LAUNCH_CHECK();
-    int32_t grid_size = std::min(
+    grid_size = std::min(
         div_round_up(sorted_linear_indices_run.numel(), kBackwardMaxThreads / kThreadGroupSize),
-        64 * at::cuda::getCurrentDeviceProperties()->multiProcessorCount);
+        get_max_thread_blocks_());
 #ifndef __HIP_PLATFORM_HCC__
     cudaFuncSetAttribute(
         split_embedding{{ "_nobag" if nobag else "" }}_backward_codegen_{{ optimizer }}_{{ wdesc }}_kernel_warp_per_row_1<
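The change boils down to a standard launch-size clamp: instead of requesting one thread block per kMaxThreads-sized chunk of work, the grid is capped at MAX_THREAD_BLOCKS_FACTOR (64) times the number of SMs, and each block then covers more than one unit of work. Below is a minimal, self-contained sketch of that pattern, assuming a toy scale_kernel with a grid-stride loop and a made-up kMaxBlocksPerSM constant; it only illustrates the idea and is not the FBGEMM kernel or its launch code.

// Illustrative sketch only (hypothetical kernel and constants), showing a
// grid clamped to a multiple of the SM count plus a grid-stride loop.
#include <algorithm>
#include <cstdio>
#include <cuda_runtime.h>

constexpr int kBlockSize = 256;
constexpr int kMaxBlocksPerSM = 64;  // assumption for illustration, mirrors the 64x factor above

__global__ void scale_kernel(float* data, int n, float alpha) {
  // Grid-stride loop: correct for any grid size, including a clamped one.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    data[i] *= alpha;
  }
}

int main() {
  const int n = 1 << 24;
  float* d_data = nullptr;
  cudaMalloc(&d_data, n * sizeof(float));
  cudaMemset(d_data, 0, n * sizeof(float));

  int num_sms = 0;
  cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, /*device=*/0);

  // Same shape as the commit: min(ceil(work / block), factor * num_SMs).
  const int blocks_needed = (n + kBlockSize - 1) / kBlockSize;
  const int grid_size = std::min(blocks_needed, kMaxBlocksPerSM * num_sms);

  scale_kernel<<<grid_size, kBlockSize>>>(d_data, n, 2.0f);
  cudaDeviceSynchronize();
  printf("launched %d blocks for %d elements on %d SMs\n", grid_size, n, num_sms);

  cudaFree(d_data);
  return 0;
}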
