Skip to content

Commit

Permalink
Back out "refactor grad output non-contiguous handler" (#1225)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #1225

Original commit changeset: ce8b7f9658c9

Original Phabricator Diff: D37988742 (308dd51)

As commented in D37988742 (308dd51), for the alignment=16 + stride=16 case, it might still be more efficient to call contiguous() instead of making a copy.

In the long term, we still need to add alignment-aware access support to the Vec4T accessor.

Reviewed By: xing-liu, sryap

Differential Revision: D38218502

fbshipit-source-id: 703804846beb04eadbd1af528e990fbce89e30e4
  • Loading branch information
jianyuh authored and facebook-github-bot committed Jul 30, 2022
1 parent 2e14df6 commit de788f2
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 7 deletions.
13 changes: 8 additions & 5 deletions fbgemm_gpu/codegen/embedding_backward_dense_host.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,13 +174,14 @@ class SplitLookupFunction_Dense_Op
using torch::autograd::Variable;

auto grad_output = grad_outputs[0];

// FIXME: to support aligned memory access in Vec4T load/store function
// 16 for FP32 and 8 for FP16
if (reinterpret_cast<uint64_t>(grad_output.data_ptr()) % 16 != 0 ||
grad_output.stride(1) != 1 || grad_output.stride(0) % 4 != 0) {
grad_output = grad_output.contiguous();
}
if (reinterpret_cast<uint64_t>(grad_output.data_ptr()) % 16 != 0) {
grad_output = at::empty_like(grad_output).copy_(grad_output);
} else if (!grad_output.is_contiguous()) {
grad_output = grad_output.contiguous();
}

if (!indice_weights.defined()) {
Expand Down Expand Up @@ -330,10 +331,12 @@ class SplitNoBagLookupFunction_Dense_Op
auto grad_output = grad_outputs[0];
// FIXME: to support aligned memory access in Vec4T load/store function
// 16 for FP32 and 8 for FP16
if (reinterpret_cast<uint64_t>(grad_output.data_ptr()) % 16 != 0 ||
grad_output.stride(1) != 1 || grad_output.stride(0) % 4 != 0) {
grad_output = grad_output.contiguous();
}
if (reinterpret_cast<uint64_t>(grad_output.data_ptr()) % 16 != 0) {
grad_output = at::empty_like(grad_output).copy_(grad_output);
} else if (!grad_output.is_contiguous()) {
grad_output = grad_output.contiguous();
}

auto grad_dev_weights =
Expand Down
6 changes: 4 additions & 2 deletions fbgemm_gpu/codegen/embedding_backward_split_host_template.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -276,10 +276,12 @@ class Split{{ "NoBag" if nobag else "" }}LookupFunction_{{ optimizer }}_Op :
auto grad_output = gradient_clipping ? clamp(grad_outputs[0], -max_gradient, max_gradient) : grad_outputs[0];
// FIXME: to support aligned memory access in Vec4T load/store function
// 16 for FP32 and 8 for FP16
if (reinterpret_cast<uint64_t>(grad_output.data_ptr()) % 16 != 0 ||
grad_output.stride(1) != 1 || grad_output.stride(0) % 4 != 0) {
grad_output = grad_output.contiguous();
}
if (reinterpret_cast<uint64_t>(grad_output.data_ptr()) % 16 != 0) {
grad_output = at::empty_like(grad_output).copy_(grad_output);
} else if (!grad_output.is_contiguous()) {
grad_output = grad_output.contiguous();
}

{% if not nobag %}
Expand Down

0 comments on commit de788f2

Please sign in to comment.