Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

unify fluid::CUDADeviceContext and phi::GpuContext #44723

Merged
merged 3 commits into from
Jul 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions paddle/fluid/framework/details/eager_deletion_op_handle.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,6 @@
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h"

namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle

namespace paddle {
namespace framework {
class GarbageCollector;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,6 @@
#include "paddle/fluid/platform/place.h"

namespace paddle {

namespace platform {
class CUDADeviceContext;
} // namespace platform

namespace memory {
namespace allocation {

Expand Down
7 changes: 0 additions & 7 deletions paddle/fluid/operators/cudnn_lstm_op.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,6 @@ limitations under the License. */
#include "paddle/fluid/operators/miopen_lstm_cache.h"
#endif

namespace paddle {
namespace platform {
class CUDADeviceContext;

} // namespace platform
} // namespace paddle

namespace paddle {
namespace operators {

Expand Down
7 changes: 4 additions & 3 deletions paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
#endif

size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
// first sum pool
FusedSeqpoolKernelNormal<<<config.block_per_grid.x,
config.thread_per_block.x,
Expand All @@ -209,7 +209,8 @@ void FusedSeqpoolCVM(const framework::ExecutionContext
// not need show click input
N = static_cast<size_t>(batch_size * slot_num *
(embedding_size - cvm_offset));
platform::GpuLaunchConfig config = GetGpuLaunchConfig1D(dev_ctx, N);
platform::GpuLaunchConfig config =
platform::GetGpuLaunchConfig1D(dev_ctx, N);
FusedCVMKernelNoCVM<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
Expand Down Expand Up @@ -391,7 +392,7 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx,
#endif

size_t N = static_cast<size_t>(batch_size * slot_num * embedding_size);
auto config = GetGpuLaunchConfig1D(dev_ctx, N);
auto config = platform::GetGpuLaunchConfig1D(dev_ctx, N);
if (use_cvm) {
// join grad
FusedSeqpoolCVMGradKernelWithCVM<<<config.block_per_grid.x,
Expand Down
7 changes: 0 additions & 7 deletions paddle/fluid/operators/gru_op.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,6 @@ limitations under the License. */

#include "paddle/fluid/operators/gru_op.h"

namespace paddle {
namespace platform {
class CUDADeviceContext;

} // namespace platform
} // namespace paddle

namespace paddle {
namespace operators {

Expand Down
5 changes: 0 additions & 5 deletions paddle/fluid/operators/math/cross_entropy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -150,11 +150,6 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
}
}

template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
template class CrossEntropyFunctor<platform::CUDADeviceContext,
platform::float16>;

template class CrossEntropyFunctor<phi::GPUContext, float>;
template class CrossEntropyFunctor<phi::GPUContext, double>;
template class CrossEntropyFunctor<phi::GPUContext, platform::float16>;
Expand Down
24 changes: 0 additions & 24 deletions paddle/fluid/operators/math/im2col.cu
Original file line number Diff line number Diff line change
Expand Up @@ -308,24 +308,12 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
}
};

template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CUDADeviceContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CUDADeviceContext,
double>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
phi::GPUContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
phi::GPUContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CUDADeviceContext,
float>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CUDADeviceContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
phi::GPUContext,
float>;
Expand Down Expand Up @@ -576,25 +564,13 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
}
};

template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CUDADeviceContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CUDADeviceContext,
double>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
phi::GPUContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
phi::GPUContext,
double>;

template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CUDADeviceContext,
float>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CUDADeviceContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
phi::GPUContext,
float>;
Expand Down
6 changes: 0 additions & 6 deletions paddle/fluid/operators/math/maxouting.cu
Original file line number Diff line number Diff line change
Expand Up @@ -173,12 +173,6 @@ void MaxOutGradFunctor<DeviceContext, T>::operator()(
axis);
}

template class MaxOutGradFunctor<platform::CUDADeviceContext, float>;
template class MaxOutGradFunctor<platform::CUDADeviceContext, double>;

template class MaxOutFunctor<platform::CUDADeviceContext, float>;
template class MaxOutFunctor<platform::CUDADeviceContext, double>;

template class MaxOutGradFunctor<phi::GPUContext, float>;
template class MaxOutGradFunctor<phi::GPUContext, double>;

Expand Down
6 changes: 0 additions & 6 deletions paddle/fluid/operators/math/sample_prob.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/sampler.h"
#include "paddle/phi/core/ddim.h"

namespace paddle {
namespace platform {
class CUDADeviceContext;
} // namespace platform
} // namespace paddle

namespace paddle {
namespace operators {
namespace math {
Expand Down
161 changes: 2 additions & 159 deletions paddle/fluid/operators/math/selected_rows_functor.cu
Original file line number Diff line number Diff line change
Expand Up @@ -133,77 +133,6 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
}
} // namespace

template <typename T>
struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input1,
const framework::Tensor& input2,
framework::Tensor* output) {
auto in1_height = input1.height();
auto in2_dims = input2.dims();
auto out_dims = output->dims();
PADDLE_ENFORCE_EQ(
in1_height,
in2_dims[0],
platform::errors::InvalidArgument(
"The two inputs height must be equal."
"But received first input height = [%d], first input height = [%d]",
in1_height,
in2_dims[0]));
PADDLE_ENFORCE_EQ(
in1_height,
out_dims[0],
platform::errors::InvalidArgument(
"The input and output height must be equal."
"But received input height = [%d], output height = [%d]",
in1_height,
out_dims[0]));

auto& in1_value = input1.value();
auto& in1_rows = input1.rows();

int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(
in1_row_numel,
input2.numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But received first input width = [%d], second input width = [%d]",
in1_row_numel,
input2.numel() / in1_height));
PADDLE_ENFORCE_EQ(
in1_row_numel,
output->numel() / in1_height,
platform::errors::InvalidArgument(
"The input and output width must be equal."
"But received input width = [%d], output width = [%d]",
in1_row_numel,
output->numel() / in1_height));

auto* in1_data = in1_value.data<T>();
auto* in2_data = input2.data<T>();
auto* out_data = output->data<T>();

phi::funcs::SetConstant<platform::CUDADeviceContext, T> functor;
functor(context, output, static_cast<T>(0));

const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid(in1_rows.size(), 1);
paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
SelectedRowsAddTensorKernel<T, block_size>
<<<grid, threads, 0, context.stream()>>>(
in1_data,
mixv_in1_rows.CUDAData(context.GetPlace()),
out_data,
in1_row_numel);

auto out_eigen = framework::EigenVector<T>::Flatten(*output);
auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen;
}
};

template <typename T>
struct SelectedRowsAddTensor<phi::GPUContext, T> {
void operator()(const phi::GPUContext& context,
Expand Down Expand Up @@ -275,12 +204,6 @@ struct SelectedRowsAddTensor<phi::GPUContext, T> {
}
};

template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAdd<platform::CUDADeviceContext, platform::float16>;
template struct SelectedRowsAddTensor<platform::CUDADeviceContext,
platform::float16>;

template struct SelectedRowsAddTensor<phi::GPUContext, float>;
template struct SelectedRowsAddTensor<phi::GPUContext, double>;
template struct SelectedRowsAdd<phi::GPUContext, platform::float16>;
Expand Down Expand Up @@ -363,50 +286,6 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
}
} // namespace

template <typename T>
struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input1,
framework::Tensor* input2) {
auto in1_height = input1.height();
auto in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(
in1_height,
in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But received first input height = "
"[%d], second input height = [%d]",
in1_height,
in2_dims[0]));

auto& in1_value = input1.value();
auto& in1_rows = input1.rows();

int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(
in1_row_numel,
input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But received first input width = [%d], second input width = [%d]",
in1_row_numel,
input2->numel() / in1_height));

auto* in1_data = in1_value.data<T>();
auto* in2_data = input2->data<T>();
const int block_size = 256;
dim3 threads(block_size, 1);
dim3 grid(in1_rows.size(), 1);
paddle::framework::MixVector<int64_t> mixv_in1_rows(&in1_rows);
SelectedRowsAddToTensorKernel<T, block_size>
<<<grid, threads, 0, context.stream()>>>(
in1_data,
mixv_in1_rows.CUDAData(context.GetPlace()),
in2_data,
in1_row_numel);
}
};

template <typename T>
struct SelectedRowsAddToTensor<phi::GPUContext, T> {
void operator()(const phi::GPUContext& context,
Expand Down Expand Up @@ -451,12 +330,6 @@ struct SelectedRowsAddToTensor<phi::GPUContext, T> {
}
};

template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext,
platform::float16>;
template struct SelectedRowsAddToTensor<phi::GPUContext, float>;
template struct SelectedRowsAddToTensor<phi::GPUContext, double>;
template struct SelectedRowsAddToTensor<phi::GPUContext, int>;
Expand Down Expand Up @@ -625,34 +498,6 @@ struct MergeAddImpl {
}
};

template <typename T>
struct MergeAdd<platform::CUDADeviceContext, T> {
// unary functor, merge by adding duplicated rows in
// the input SelectedRows object.
phi::SelectedRows operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input,
const bool sorted_result) {
return MergeAddImpl<platform::CUDADeviceContext, T>()(
context, input, sorted_result);
}

void operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input,
phi::SelectedRows* output,
const bool sorted_result) {
MergeAddImpl<platform::CUDADeviceContext, T>()(
context, input, output, sorted_result);
}

void operator()(const platform::CUDADeviceContext& context,
const std::vector<const phi::SelectedRows*>& inputs,
phi::SelectedRows* output,
const bool sorted_result) {
MergeAddImpl<platform::CUDADeviceContext, T>()(
context, inputs, output, sorted_result);
}
};

template <typename T>
struct MergeAdd<phi::GPUContext, T> {
// unary functor, merge by adding duplicated rows in
Expand All @@ -678,10 +523,8 @@ struct MergeAdd<phi::GPUContext, T> {
}
};

#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype) \
template struct MergeAddImpl<platform::CUDADeviceContext, dtype>; \
template struct MergeAddImpl<phi::GPUContext, dtype>; \
template struct MergeAdd<platform::CUDADeviceContext, dtype>; \
#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype) \
template struct MergeAddImpl<phi::GPUContext, dtype>; \
template struct MergeAdd<phi::GPUContext, dtype>;

TEMPLATE_SPECIALIZED_FOR_MERGEADD(float)
Expand Down
Loading