Skip to content

Commit

Permalink
clean up unused permute_duplicate_pooled_embs (#2811)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #2811

# context
* the `permute_duplicate_pooled_embs` op is never used in prod
* it doesn't support the backward pass
* there is another op that can provide better features
* it was introduced by D48090591
* maintaining this op adds a burden to the codebase.

Reviewed By: AGZain

Differential Revision: D54917144

fbshipit-source-id: 3c77ad732560b3820dc50bfa45ad9aa015b3f83d
  • Loading branch information
TroyGarden authored and facebook-github-bot committed Jul 10, 2024
1 parent 9f786a1 commit 1b63049
Show file tree
Hide file tree
Showing 14 changed files with 14 additions and 492 deletions.
15 changes: 0 additions & 15 deletions fbgemm_gpu/fbgemm_gpu/sparse_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -951,16 +951,6 @@ def permute_pooled_embs_split_abstract(
return torch.empty_like(pooled_embs)


def permute_duplicate_pooled_embs_split_abstract(
pooled_embs: Tensor,
offset_dim_list: Tensor,
permute_list: Tensor,
inv_offset_dim_list: Tensor,
inv_permute_list: Tensor,
) -> Tensor:
return torch.empty_like(pooled_embs)


def _setup() -> None:
# pyre-ignore[16]
_setup.done = getattr(_setup, "done", False)
Expand Down Expand Up @@ -1083,11 +1073,6 @@ def impl_autograd(op_name, fn, setup_context: Optional[Callable] = None) -> None
impl_abstract(
"fbgemm::permute_pooled_embs_split", permute_pooled_embs_split_abstract
)
impl_abstract(
"fbgemm::permute_duplicate_pooled_embs_split",
permute_duplicate_pooled_embs_split_abstract,
)

_setup.done = True


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@ class PermutePooledEmbsFunction
const at::Tensor& offset_dim_list,
const at::Tensor& permute_list,
const at::Tensor& inv_offset_dim_list,
const at::Tensor& inv_permute_list,
const bool& allow_duplicates = false);
const at::Tensor& inv_permute_list);

static variable_list backward(
AutogradContext* ctx,
Expand Down
18 changes: 1 addition & 17 deletions fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,6 @@ namespace fbgemm_gpu {

///@ingroup permute-pooled-embs-cpu
at::Tensor permute_pooled_embs_cpu_impl(
const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)]
const at::Tensor& offset_dim_list,
const at::Tensor& permute_list,
const at::Tensor& inv_offset_dim_list,
const at::Tensor& inv_permute_list,
const bool& allow_duplicates);

at::Tensor permute_duplicate_pooled_embs_cpu(
const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)]
const at::Tensor& offset_dim_list,
const at::Tensor& permute_list,
Expand All @@ -43,20 +35,12 @@ at::Tensor permute_pooled_embs_cpu(
const at::Tensor& inv_offset_dim_list,
const at::Tensor& inv_permute_list);

at::Tensor permute_duplicate_pooled_embs_gpu(
const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)]
const at::Tensor& offset_dim_list,
const at::Tensor& permute_list,
const at::Tensor& inv_offset_dim_list,
const at::Tensor& inv_permute_list);

at::Tensor permute_pooled_embs_gpu_impl(
const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)]
const at::Tensor& offset_dim_list,
const at::Tensor& permute_list,
const at::Tensor& inv_offset_dim_list,
const at::Tensor& inv_permute_list,
const bool& allow_duplicates);
const at::Tensor& inv_permute_list);

at::Tensor permute_pooled_embs_gpu(
const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)]
Expand Down
21 changes: 0 additions & 21 deletions fbgemm_gpu/include/fbgemm_gpu/permute_pooled_embedding_ops_split.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,6 @@ at::Tensor permute_pooled_embs_split_gpu_impl(
const at::Tensor& inv_permute_list,
const bool& allow_duplicates);

// Implementation of permute_pooled_embs_split for GPU for the duplicate
// permutations use case. This calls the permute_pooled_embs_split_gpu_impl
// function.
///@ingroup permute-duplicate-pooled-embs-gpu
at::Tensor permute_duplicate_pooled_embs_split_gpu(
const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)]
const at::Tensor& offset_dim_list,
const at::Tensor& permute_list,
const at::Tensor& inv_offset_dim_list,
const at::Tensor& inv_permute_list);

///@ingroup permute-pooled-embs-gpu
at::Tensor permute_pooled_embs_split_gpu(
const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)]
Expand All @@ -60,16 +49,6 @@ at::Tensor permute_pooled_embs_auto_grad_split_cpu(
const at::Tensor& inv_offset_dim_list,
const at::Tensor& inv_permute_list);

// Implementation of permute_pooled_embs_auto_grad_split for GPU for the
// duplicate permutations use case.
///@ingroup permute-duplicate-pooled-embs-gpu
at::Tensor permute_duplicate_pooled_embs_auto_grad_split_gpu(
const at::Tensor& pooled_embs,
const at::Tensor& offset_dim_list,
const at::Tensor& permute_list,
const at::Tensor& inv_offset_dim_list,
const at::Tensor& inv_permute_list);

///@ingroup permute-pooled-embs-gpu
at::Tensor permute_pooled_embs_auto_grad_split_gpu(
const at::Tensor& pooled_embs,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,25 +31,21 @@ Variable PermutePooledEmbsFunction::forward(
const Tensor& offset_dim_list,
const Tensor& permute_list,
const Tensor& inv_offset_dim_list,
const Tensor& inv_permute_list,
const bool& allow_duplicates) {
const Tensor& inv_permute_list) {
ctx->saved_data["offset_dim_list"] = offset_dim_list;
ctx->saved_data["permute_list"] = permute_list;
ctx->saved_data["inv_offset_dim_list"] = inv_offset_dim_list;
ctx->saved_data["inv_permute_list"] = inv_permute_list;
ctx->saved_data["allow_duplicates"] = allow_duplicates;
TORCH_CHECK(
offset_dim_list.scalar_type() == at::ScalarType::Long,
"offset_dim_list needs to have long/int64 type");
TORCH_CHECK(
permute_list.scalar_type() == at::ScalarType::Long,
"permute_list needs to have long/int64 type");

const auto schema = allow_duplicates ? "fbgemm::permute_duplicate_pooled_embs"
: "fbgemm::permute_pooled_embs";
const auto permute_pooled_embs_op =
torch::Dispatcher::singleton()
.findSchemaOrThrow(schema, "")
.findSchemaOrThrow("fbgemm::permute_pooled_embs", "")
.typed<decltype(permute_pooled_embs_cpu)>();
return permute_pooled_embs_op.call(
pooled_embs,
Expand All @@ -67,10 +63,6 @@ variable_list PermutePooledEmbsFunction::backward(
const auto& inv_offset_dim_list =
ctx->saved_data["inv_offset_dim_list"].toTensor();
const auto& inv_permute_list = ctx->saved_data["inv_permute_list"].toTensor();
const auto& allow_duplicates = ctx->saved_data["allow_duplicates"].toBool();
TORCH_CHECK(
allow_duplicates == false,
"permute_pooled_embs does not support allow_duplicates in backward!");
variable_list grad_inputs(6);
static auto permute_pooled_embs_op =
torch::Dispatcher::singleton()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,25 +22,6 @@
using Tensor = at::Tensor;

namespace fbgemm_gpu {

Tensor permute_duplicate_pooled_embs_gpu(
const Tensor& pooled_embs, // [B_local][Sum_T_global(D)]
const Tensor& offset_dim_list,
const Tensor& permute_list,
const Tensor& inv_offset_dim_list,
const Tensor& inv_permute_list) {
TORCH_CHECK(offset_dim_list.numel() > 0);
TORCH_CHECK(inv_offset_dim_list.numel() > 0);

return permute_pooled_embs_gpu_impl(
pooled_embs,
offset_dim_list,
permute_list,
inv_offset_dim_list,
inv_permute_list,
true);
}

Tensor permute_pooled_embs_gpu(
const Tensor& pooled_embs, // [B_local][Sum_T_global(D)]
const Tensor& offset_dim_list,
Expand All @@ -49,23 +30,6 @@ Tensor permute_pooled_embs_gpu(
const Tensor& inv_permute_list) {
TORCH_CHECK(offset_dim_list.numel() == permute_list.numel() + 1);
TORCH_CHECK(offset_dim_list.numel() == inv_offset_dim_list.numel());

return permute_pooled_embs_gpu_impl(
pooled_embs,
offset_dim_list,
permute_list,
inv_offset_dim_list,
inv_permute_list,
false);
}

Tensor permute_pooled_embs_gpu_impl(
const Tensor& pooled_embs, // [B_local][Sum_T_global(D)]
const Tensor& offset_dim_list,
const Tensor& permute_list,
const Tensor& inv_offset_dim_list,
const Tensor& inv_permute_list,
const bool& allow_duplicates = false) {
// inv_permute_list is not being used so it's not checked here.
TENSORS_ON_SAME_CUDA_GPU_IF_NOT_OPTIONAL(
pooled_embs, offset_dim_list, permute_list, inv_offset_dim_list);
Expand All @@ -86,10 +50,8 @@ Tensor permute_pooled_embs_gpu_impl(

// Last index in inv_offset_dim_list contains the size of output.
// This will result in a D -> H sync.
const int64_t permuted_embs_dim_sum =
allow_duplicates ? inv_offset_dim_list[-1].item<int64_t>() : dim_sum;
Tensor permuted_pooled_embs = at::empty(
{pooled_embs_contiguous.size(0), permuted_embs_dim_sum},
{pooled_embs_contiguous.size(0), dim_sum},
pooled_embs_contiguous.options());

// This kernel is moving D elements per warp.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,12 @@ using Tensor = at::Tensor;

namespace fbgemm_gpu {

Tensor permute_pooled_embs_cpu_impl(
Tensor permute_pooled_embs_cpu(
const Tensor& pooled_embs, // [B_local][Sum_T_global(D)]
const Tensor& offset_dim_list,
const Tensor& permute_list,
const Tensor& inv_offset_dim_list,
const Tensor& inv_permute_list,
const bool& allow_duplicates) {
const Tensor& inv_permute_list) {
TORCH_CHECK(
offset_dim_list.scalar_type() == at::ScalarType::Long,
"offset_dim_list needs to have long/int64 type")
Expand All @@ -30,7 +29,7 @@ Tensor permute_pooled_embs_cpu_impl(
"permute_list needs to have long/int64 type")
auto permute = permute_list.data_ptr<int64_t>();
const auto n = permute_list.numel();
const auto dims_size = allow_duplicates ? offset_dim_list.numel() : n;
const auto dims_size = n;
std::vector<int64_t> dims;
dims.reserve(dims_size - 1);
for (const auto i : c10::irange(1, dims_size)) {
Expand All @@ -45,37 +44,6 @@ Tensor permute_pooled_embs_cpu_impl(
return at::cat(permuted_ts, 1);
}

at::Tensor permute_pooled_embs_cpu(
const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)]
const at::Tensor& offset_dim_list,
const at::Tensor& permute_list,
const at::Tensor& inv_offset_dim_list,
const at::Tensor& inv_permute_list) {
return permute_pooled_embs_cpu_impl(
pooled_embs,
offset_dim_list,
permute_list,
inv_offset_dim_list,
inv_permute_list,
false);
}

///@ingroup permute-duplicate-pooled-embs-cpu
at::Tensor permute_duplicate_pooled_embs_cpu(
const at::Tensor& pooled_embs, // [B_local][Sum_T_global(D)]
const at::Tensor& offset_dim_list,
const at::Tensor& permute_list,
const at::Tensor& inv_offset_dim_list,
const at::Tensor& inv_permute_list) {
return permute_pooled_embs_cpu_impl(
pooled_embs,
offset_dim_list,
permute_list,
inv_offset_dim_list,
inv_permute_list,
true);
}

///@ingroup permute-pooled-embs-cpu
at::Tensor permute_pooled_embs_auto_grad(
const Tensor& pooled_embs,
Expand All @@ -88,8 +56,7 @@ at::Tensor permute_pooled_embs_auto_grad(
offset_dim_list,
permute_list,
inv_offset_dim_list,
inv_permute_list,
false);
inv_permute_list);
}

///@ingroup permute-pooled-embs-cpu
Expand All @@ -104,24 +71,7 @@ at::Tensor permute_pooled_embs_auto_grad_cpu(
offset_dim_list,
permute_list,
inv_offset_dim_list,
inv_permute_list,
false);
}

///@ingroup permute-duplicate-pooled-embs-cpu
at::Tensor permute_duplicate_pooled_embs_auto_grad_cpu(
const Tensor& pooled_embs,
const Tensor& offset_dim_list,
const Tensor& permute_list,
const Tensor& inv_offset_dim_list,
const Tensor& inv_permute_list) {
return PermutePooledEmbsFunction::apply(
pooled_embs,
offset_dim_list,
permute_list,
inv_offset_dim_list,
inv_permute_list,
true);
inv_permute_list);
}

at::Tensor permute_pooled_embs_meta(
Expand Down Expand Up @@ -151,10 +101,6 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
m.def(
"permute_pooled_embs_auto_grad(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor",
{PT2_COMPLIANT_TAG});
m.def(
"permute_duplicate_pooled_embs(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor");
m.def(
"permute_duplicate_pooled_embs_auto_grad(Tensor pooled_embs, Tensor offset_dim_list, Tensor permute_list, Tensor inv_offset_dim_list, Tensor inv_permute_list) -> Tensor");
}

FBGEMM_OP_DISPATCH(
Expand All @@ -165,15 +111,6 @@ FBGEMM_OP_DISPATCH(
CPU,
"permute_pooled_embs_auto_grad",
fbgemm_gpu::permute_pooled_embs_auto_grad_cpu);
FBGEMM_OP_DISPATCH(
CPU,
"permute_duplicate_pooled_embs",
fbgemm_gpu::permute_duplicate_pooled_embs_cpu);
FBGEMM_OP_DISPATCH(
CPU,
"permute_duplicate_pooled_embs_auto_grad",
fbgemm_gpu::permute_duplicate_pooled_embs_auto_grad_cpu);

FBGEMM_OP_DISPATCH(
Meta,
"permute_pooled_embs",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,37 +30,13 @@ Tensor permute_pooled_embs_auto_grad_gpu(
offset_dim_list,
permute_list,
inv_offset_dim_list,
inv_permute_list,
false);
inv_permute_list);
}

///@ingroup permute-duplicate-pooled-embs-gpu
Tensor permute_duplicate_pooled_embs_auto_grad_gpu(
const Tensor& pooled_embs,
const Tensor& offset_dim_list,
const Tensor& permute_list,
const Tensor& inv_offset_dim_list,
const Tensor& inv_permute_list) {
return PermutePooledEmbsFunction::apply(
pooled_embs,
offset_dim_list,
permute_list,
inv_offset_dim_list,
inv_permute_list,
true);
}

} // namespace fbgemm_gpu

TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
DISPATCH_TO_CUDA("permute_pooled_embs", fbgemm_gpu::permute_pooled_embs_gpu);
DISPATCH_TO_CUDA(
"permute_pooled_embs_auto_grad",
fbgemm_gpu::permute_pooled_embs_auto_grad_gpu);
DISPATCH_TO_CUDA(
"permute_duplicate_pooled_embs",
fbgemm_gpu::permute_duplicate_pooled_embs_gpu);
DISPATCH_TO_CUDA(
"permute_duplicate_pooled_embs_auto_grad",
fbgemm_gpu::permute_duplicate_pooled_embs_auto_grad_gpu);
}
Loading

0 comments on commit 1b63049

Please sign in to comment.