gpu: nvidia: convolution: bugfix sum post op with int8
t4c1 authored and dzarukin committed Nov 1, 2024
1 parent e9d0fdb commit 7486ed8
Showing 3 changed files with 20 additions and 21 deletions.
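
In short: with an int8 destination, the sum post-op previously reordered y into a second scratch buffer (temp_reorder, backed by scratch_storage_2) and then summed; this commit drops the extra buffer and instead accumulates the int8 dst directly into the f32 post-op scratch with a single cudnnAddTensor call (the new execute_f32_src_sum helper below). A minimal standalone sketch of that pattern follows; the wrapper name sum_s8_into_f32 and the descriptor/buffer names are illustrative, not taken from the oneDNN sources. Per the cuDNN docs, cudnnAddTensor computes C = alpha * A + beta * C.

    #include <cudnn.h>

    // Fused convert-and-accumulate: C = alpha * A + beta * C.
    // A is the int8 dst tensor, C the f32 post-op scratch, so
    // sum_scale * dst is added onto the convolution result in one
    // call, replacing the old reorder-then-sum pair.
    cudnnStatus_t sum_s8_into_f32(cudnnHandle_t handle,
            cudnnTensorDescriptor_t y_s8_desc, const void *y_s8,
            cudnnTensorDescriptor_t y_f32_desc, void *y_f32,
            float sum_scale) {
        const float alpha = sum_scale; // scale applied to the int8 source
        const float beta = 1.0f;       // keep the existing f32 accumulator
        return cudnnAddTensor(
                handle, &alpha, y_s8_desc, y_s8, &beta, y_f32_desc, y_f32);
    }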
4 changes: 0 additions & 4 deletions src/gpu/nvidia/cudnn_convolution.cpp

@@ -54,11 +54,8 @@ status_t cudnn_convolution_fwd_t::execute_convolution(
 
         if (pd()->use_temp_dst()) {
             memory_storage_t *temp_dst_mem = scratch_storage.get();
-            memory_storage_t *temp_reorder_mem = scratch_storage_2.get();
             temp_dst = xpu::sycl::interop_memory_arg_t<
                     ::sycl::access::mode::read_write>(temp_dst_mem, cgh);
-            temp_reorder = xpu::sycl::interop_memory_arg_t<
-                    ::sycl::access::mode::read_write>(temp_reorder_mem, cgh);
         }
 
         xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read_write>
@@ -85,7 +82,6 @@ status_t cudnn_convolution_fwd_t::execute_convolution(
             args.push_back(arg_scratch.get_native_pointer(ih));
             args.push_back(arg_filter_scratch.get_native_pointer(ih));
             args.push_back(temp_dst.get_native_pointer(ih));
-            args.push_back(temp_reorder.get_native_pointer(ih));
             args.push_back(arg_src_scale.get_native_pointer(ih));
             args.push_back(arg_wei_scale.get_native_pointer(ih));
             args.push_back(arg_dst_scale.get_native_pointer(ih));
4 changes: 0 additions & 4 deletions src/gpu/nvidia/cudnn_convolution.hpp

@@ -176,10 +176,6 @@ struct cudnn_convolution_fwd_t : public gpu::primitive_t {
             CHECK(sycl_engine->create_memory_storage(
                     &scratch_ptr, memory_flags_t::alloc, wrap.size(), nullptr));
             scratch_storage.reset(scratch_ptr);
-
-            CHECK(sycl_engine->create_memory_storage(
-                    &scratch_ptr, memory_flags_t::alloc, wrap.size(), nullptr));
-            scratch_storage_2.reset(scratch_ptr);
         }
         if (impl && impl->use_scales_dst()) {
             CHECK(sycl_engine->create_memory_storage(
33 changes: 20 additions & 13 deletions src/gpu/nvidia/cudnn_convolution_impl.hpp

@@ -511,14 +511,14 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
         const float beta = 0.0f;
         if (flip_formats) {
             CUDNN_EXECUTE_FUNC_V(cudnnTransformTensor, handle, &alpha,
-                    reorder_dst_desc, src, &beta, descs[y], dst);
+                    reorder_dst_desc, src, &beta, y_fp32_desc, dst);
         } else {
-            CUDNN_EXECUTE_FUNC_V(cudnnTransformTensor, handle, &alpha, descs[y],
-                    src, &beta, reorder_dst_desc, dst);
+            CUDNN_EXECUTE_FUNC_V(cudnnTransformTensor, handle, &alpha,
+                    y_fp32_desc, src, &beta, reorder_dst_desc, dst);
         }
     }
 
-    void execute_f32_sum(cudnnHandle_t handle, void *y, void *y_fp32_data,
+    void execute_f32_dst_sum(cudnnHandle_t handle, void *y, void *y_fp32_data,
             float alpha_, float beta_) const {
         float alpha1 = 0.0f;
         float alpha2 = alpha_;
@@ -528,6 +528,14 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
                 y_fp32_data);
     }
 
+    void execute_f32_src_sum(cudnnHandle_t handle, void *x, void *y,
+            float alpha_, float beta_) const {
+        float alpha = alpha_;
+        float beta = beta_;
+        CUDNN_EXECUTE_FUNC_V(cudnnAddTensor, handle, &alpha, descs[io::y], x,
+                &beta, y_fp32_desc, y);
+    }
+
     void execute_eltwise(cudnnHandle_t handle, void *src, void *dst) const {
         float alpha = 1.0f;
         float beta = 0.0f;
@@ -551,8 +559,7 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
             const std::vector<void *> &args) const override {
         auto x = args[0], weights = args[1], y = args[2], bias = args[3],
              scratchpad = args[4], post_op_scratch = args[6],
-             post_op_reorder = args[7], src_scale = args[8],
-             wei_scale = args[9], dst_scale = args[10];
+             src_scale = args[7], wei_scale = args[8], dst_scale = args[9];
         void *output = use_temp_dst_ ? post_op_scratch : y;
         if (using_transformed_filter()) {
             auto w_scratch = args[5];
@@ -561,7 +568,7 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
         }
 
         float *y_fp32_data = nullptr;
-        if (y_f32_is_required()) { y_fp32_data = (float *)args[11]; }
+        if (y_f32_is_required()) { y_fp32_data = (float *)args[10]; }
 
         bool fused = conv_bias || conv_bias_eltwise;
 
@@ -581,7 +588,8 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
             }
         }
 
-        auto &y_desc = y_f32_is_required() ? y_fp32_desc : descs[io::y];
+        auto &y_desc = (y_f32_is_required() || use_temp_dst_) ? y_fp32_desc
+                                                              : descs[io::y];
         void *y_data = y_f32_is_required() ? y_fp32_data : output;
 
         if (fused) {
@@ -619,12 +627,11 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
                 switch (post_ops[i]) {
                     case dnnl_sum:
                         if (need_reorder) {
-                            execute_reorder(handle, y, post_op_reorder, true);
-                            execute_sum(handle, post_op_reorder,
-                                    post_op_scratch, sum_scale, 1.0f);
+                            execute_f32_src_sum(
+                                    handle, y, post_op_scratch, sum_scale, 1.0f);
                         } else if (last_op) {
                             if (y_f32_is_required()) {
-                                execute_f32_sum(
+                                execute_f32_dst_sum(
                                         handle, y, y_fp32_data, 1.0f, sum_scale);
                             } else {
                                 execute_sum(handle, post_op_scratch, y, 1.0f,
@@ -687,7 +694,7 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
             // The scratchpad size will need to be modified in
            // cases where the dst_scaling is used and the output
             // uses s8 values.
-            if (use_scales_dst_) {
+            if (use_scales_dst_ || use_temp_dst_) {
                 CHECK(create_and_set_tensor_descriptor(&y_fp32_desc,
                         CUDNN_DATA_FLOAT, ndims[y], dims[y], strides[y]));
                 CHECK(CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionForwardWorkspaceSize,
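
Note on the last hunk: y_fp32_desc is now also created when use_temp_dst_ is set, so the post-op scratch always has an f32 descriptor to accumulate into. A sketch of what that descriptor setup amounts to, assuming create_and_set_tensor_descriptor() wraps the standard cuDNN descriptor calls below (the function name make_f32_desc is illustrative):

    #include <cudnn.h>

    // Create an f32 shadow descriptor with the same shape/strides as dst.
    cudnnStatus_t make_f32_desc(cudnnTensorDescriptor_t *desc, int ndims,
            const int *dims, const int *strides) {
        cudnnStatus_t st = cudnnCreateTensorDescriptor(desc);
        if (st != CUDNN_STATUS_SUCCESS) return st;
        // Same dims/strides as the int8 dst, but CUDNN_DATA_FLOAT.
        return cudnnSetTensorNdDescriptor(
                *desc, CUDNN_DATA_FLOAT, ndims, dims, strides);
    }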
