Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add memchecks to sparse ops #2594

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion fbgemm_gpu/src/sparse_ops/common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

#include "fbgemm_gpu/embedding_backward_template_helpers.cuh"
#include "fbgemm_gpu/fbgemm_cuda_utils.cuh"
#include "fbgemm_gpu/fbgemm_tensor_accessor.h"
#include "fbgemm_gpu/split_embeddings_utils.cuh"

#ifdef USE_ROCM
Expand Down Expand Up @@ -61,7 +62,7 @@ template <
typename scalar_t,
int ndim,
template <typename U> class PtrTraits = at::DefaultPtrTraits>
at::PackedTensorAccessor64<scalar_t, ndim, PtrTraits>
pta::PackedTensorAccessor64<scalar_t, ndim, PtrTraits>
dummy_packed_accessor64() {
std::array<int64_t, ndim> zeros{};
return {nullptr, zeros.data(), zeros.data()};
Expand Down
11 changes: 6 additions & 5 deletions fbgemm_gpu/src/sparse_ops/sparse_batched_unary_embeddings.cu
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ __launch_bounds__(kMaxThreads) void batched_unary_embeddings_backward_kernel(
const index_t* __restrict__ table_offsets,
scalar_t* __restrict__ grad_weight, // [N * sum_E * 1] (embedding
// dimension is 1)
const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
const pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
sorted_linear_indices_run,
const int32_t* __restrict__ sorted_linear_indices_cumulative_run_lengths,
const int32_t* __restrict__ sorted_infos,
Expand Down Expand Up @@ -225,6 +225,9 @@ DLL_PUBLIC Tensor batched_unary_embeddings_backward_cuda(
grad_output.scalar_type(),
"batched_unary_embeddings_backward_kernel",
[&] {
#ifdef FBGEMM_GPU_MEMCHECK
const auto func_name = "batched_unary_embeddings_backward_kernel";
#endif
batched_unary_embeddings_backward_kernel<scalar_t>
<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
N,
Expand All @@ -233,10 +236,8 @@ DLL_PUBLIC Tensor batched_unary_embeddings_backward_cuda(
grad_output.data_ptr<scalar_t>(),
table_offsets.data_ptr<index_t>(),
grad_weight.data_ptr<scalar_t>(),
sorted_linear_indices_run.packed_accessor32<
index_t,
1,
at::RestrictPtrTraits>(),
MAKE_PTA_WITH_NAME(
func_name, sorted_linear_indices_run, index_t, 1, 32),
sorted_linear_indices_cumulative_run_lengths
.data_ptr<int32_t>(),
infos_sorted.data_ptr<int32_t>(),
Expand Down
65 changes: 34 additions & 31 deletions fbgemm_gpu/src/sparse_ops/sparse_index_add.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,16 @@ namespace fbgemm_gpu {
template <typename index_t, typename scalar_t, int UNROLL_FACTOR>
__global__
__launch_bounds__(kMaxThreads) void index_add_2d_with_unique_indices_kernel(
const at::PackedTensorAccessor32<scalar_t, 2, at::RestrictPtrTraits>
const pta::PackedTensorAccessor32<scalar_t, 2, at::RestrictPtrTraits>
out_grad,
const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
const pta::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
unique_indices,
const at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits>
const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits>
orig_indices,
const at::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits> offsets,
at::PackedTensorAccessor64<scalar_t, 2> in_deduped_grad,
const pta::PackedTensorAccessor32<int64_t, 1, at::RestrictPtrTraits>
offsets,
pta::PackedTensorAccessor64<scalar_t, 2, at::RestrictPtrTraits>
in_deduped_grad,
const int stride_D,
const int rounded_D,
const int remaining_D,
Expand Down Expand Up @@ -148,35 +150,36 @@ DLL_PUBLIC Tensor index_add_with_unique_indices_cuda(
cuda_calc_xblock_count(num_unique_indices, 1),
(D + stride_D - 1) / stride_D,
1);

#ifdef FBGEMM_GPU_MEMCHECK
const auto func_name = "index_add_2d_with_unique_indices_kernel";
#endif
index_add_2d_with_unique_indices_kernel<
index_t,
scalar_t,
UNROLL_FACTOR><<<
grid_size,
block_size,
0,
at::cuda::getCurrentCUDAStream()>>>(
grad_output_reshaped
.packed_accessor32<scalar_t, 2, at::RestrictPtrTraits>(),
consecutive_indices ? dummy_packed_accessor32<
index_t,
1,
at::RestrictPtrTraits>()
: unique_indices.packed_accessor32<
index_t,
1,
at::RestrictPtrTraits>(),
orig_indices
.packed_accessor32<int64_t, 1, at::RestrictPtrTraits>(),
offsets
.packed_accessor32<int64_t, 1, at::RestrictPtrTraits>(),
input_grad.packed_accessor64<scalar_t, 2>(),
stride_D, // Pass constants as kernel args
rounded_D,
remaining_D,
consecutive_indices,
consecutive_range_start);
UNROLL_FACTOR>
<<<grid_size,
block_size,
0,
at::cuda::getCurrentCUDAStream()>>>(
MAKE_PTA_WITH_NAME(
func_name, grad_output_reshaped, scalar_t, 2, 32),
consecutive_indices
? dummy_packed_accessor32<
index_t,
1,
at::RestrictPtrTraits>()
: MAKE_PTA_WITH_NAME(
func_name, unique_indices, index_t, 1, 32),
MAKE_PTA_WITH_NAME(
func_name, orig_indices, int64_t, 1, 32),
MAKE_PTA_WITH_NAME(func_name, offsets, int64_t, 1, 32),
MAKE_PTA_WITH_NAME(
func_name, input_grad, scalar_t, 2, 64),
stride_D, // Pass constants as kernel args
rounded_D,
remaining_D,
consecutive_indices,
consecutive_range_start);
C10_CUDA_KERNEL_LAUNCH_CHECK();
});
});
Expand Down
47 changes: 25 additions & 22 deletions fbgemm_gpu/src/sparse_ops/sparse_index_select.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,12 @@ template <
int UNROLL_FACTOR,
bool indices_sorted>
__global__ __launch_bounds__(kMaxThreads) void index_select_2d_kernel(
const at::PackedTensorAccessor64<scalar_t, 2, at::RestrictPtrTraits> input,
const at::PackedTensorAccessor64<index_t, 1, at::RestrictPtrTraits> indices,
const at::PackedTensorAccessor64<int64_t, 1, at::RestrictPtrTraits>
const pta::PackedTensorAccessor64<scalar_t, 2, at::RestrictPtrTraits> input,
const pta::PackedTensorAccessor64<index_t, 1, at::RestrictPtrTraits>
indices,
const pta::PackedTensorAccessor64<int64_t, 1, at::RestrictPtrTraits>
orig_indices,
at::PackedTensorAccessor64<scalar_t, 2> output,
pta::PackedTensorAccessor64<scalar_t, 2, at::RestrictPtrTraits> output,
TORCH_DSA_KERNEL_ARGS) {
const int N = indices.size(0);
const int input_size = input.size(0);
Expand Down Expand Up @@ -70,24 +71,26 @@ DLL_PUBLIC Tensor index_select_cuda(

const int UNROLL_FACTOR = 2;

#define LAUNCH_INDEX_SELECT(INDICES_SORTED) \
TORCH_DSA_KERNEL_LAUNCH( \
(index_select_2d_kernel< \
index_t, \
scalar_t, \
UNROLL_FACTOR, \
INDICES_SORTED>), \
cuda_calc_xblock_count(N, 1), \
std::min(div_round_up(D, UNROLL_FACTOR), kMaxThreads), \
0, \
at::cuda::getCurrentCUDAStream(), \
input_reshaped.packed_accessor64<scalar_t, 2, at::RestrictPtrTraits>(), \
indices.packed_accessor64<index_t, 1, at::RestrictPtrTraits>(), \
INDICES_SORTED \
? orig_indices \
.packed_accessor64<int64_t, 1, at::RestrictPtrTraits>() \
: dummy_packed_accessor64<int64_t, 1, at::RestrictPtrTraits>(), \
output.packed_accessor64<scalar_t, 2>());
#define LAUNCH_INDEX_SELECT(INDICES_SORTED) \
{ \
[[maybe_unused]] const auto func_name = "index_select_2d_kernel"; \
TORCH_DSA_KERNEL_LAUNCH( \
(index_select_2d_kernel< \
index_t, \
scalar_t, \
UNROLL_FACTOR, \
INDICES_SORTED>), \
cuda_calc_xblock_count(N, 1), \
std::min(div_round_up(D, UNROLL_FACTOR), kMaxThreads), \
0, \
at::cuda::getCurrentCUDAStream(), \
MAKE_PTA_WITH_NAME(func_name, input_reshaped, scalar_t, 2, 64), \
MAKE_PTA_WITH_NAME(func_name, indices, index_t, 1, 64), \
INDICES_SORTED \
? MAKE_PTA_WITH_NAME(func_name, orig_indices, int64_t, 1, 64) \
: dummy_packed_accessor64<int64_t, 1, at::RestrictPtrTraits>(), \
MAKE_PTA_WITH_NAME(func_name, output, scalar_t, 2, 64)); \
}

AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "index_add_2d_kernel_1", [&] {
FBGEMM_DISPATCH_FLOAT_AND_HALF(
Expand Down
7 changes: 5 additions & 2 deletions fbgemm_gpu/src/sparse_ops/sparse_zipf.cu
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ __device__ long rk_zipf(rk_state* state, double a) {
__global__ void zipf_kernel(
const double a,
const int64_t seed,
at::PackedTensorAccessor64<long, 1, at::RestrictPtrTraits> y) {
pta::PackedTensorAccessor64<long, 1, at::RestrictPtrTraits> y) {
rk_state internal_state;
auto N = y.size(0);
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
Expand All @@ -99,12 +99,15 @@ zipf_cuda(const double a, const int64_t n, const int64_t seed) {
{n},
at::TensorOptions().dtype(at::kLong).device(
at::kCUDA, at::cuda::current_device()));
#ifdef FBGEMM_GPU_MEMCHECK
const auto func_name = "zipf_kernel";
#endif
zipf_kernel<<<
cuda_calc_xblock_count(n, kMaxThreads),
kMaxThreads,
0,
at::cuda::getCurrentCUDAStream()>>>(
a, seed, y.packed_accessor64<long, 1, at::RestrictPtrTraits>());
a, seed, MAKE_PTA_WITH_NAME(func_name, y, long, 1, 64));
C10_CUDA_KERNEL_LAUNCH_CHECK();

return y;
Expand Down
7 changes: 6 additions & 1 deletion fbgemm_gpu/test/sparse/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,12 @@ def extend_test_class(
"", os.path.dirname(__file__), "failures_dict.json"
)

additional_decorators = additional_decorators or {}
additional_decorators = (additional_decorators or {}) | {
"test_pt2_compliant_tag_fbgemm_permute_2D_sparse_data": [
# This operator has been grandfathered in. We need to fix this test failure.
unittest.expectedFailure,
],
}

# Only generate tests for PyTorch 2.2+
if (
Expand Down
11 changes: 10 additions & 1 deletion fbgemm_gpu/test/sparse/failures_dict.json
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,16 @@
}
},
"fbgemm::permute_1D_sparse_data": {},
"fbgemm::permute_2D_sparse_data": {},
"fbgemm::permute_2D_sparse_data": {
"PermuteEmbeddingsTest.test_aot_dispatch_dynamic__test_permute_embeddings": {
"comment": "",
"status": "xfail"
},
"PermuteIndicesTest.test_aot_dispatch_dynamic__test_permute_indices": {
"comment": "",
"status": "xfail"
}
},
"fbgemm::permute_sequence_embeddings": {
"PermuteEmbeddingsTest.test_aot_dispatch_dynamic__test_permute_embeddings": {
"comment": "",
Expand Down
Loading