Replacing CudaAsyncBuffer with TArray to improve perf (#3303)
* Remove usage of CudaAsyncBuffer

* Keep CudaAsyncBuffer for these ops: non_max_suppression, cudnn_rnn_base, concat, split

* Fix Windows build error

* Fix Windows build error

* Fix build error

* Fix Windows build error

Co-authored-by: Weixing Zhang <[email protected]>
weixingzhang authored Mar 24, 2020
1 parent ef7b98f commit fef7989
Showing 32 changed files with 424 additions and 358 deletions.
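A note on the recurring pattern in this diff: CudaAsyncBuffer allocates device memory and issues an asynchronous host-to-device copy before each kernel launch, and the kernel then reads the metadata through a device pointer obtained from GpuPtr(). TArray is instead a small, fixed-capacity, trivially copyable array that is passed by value as a kernel argument, so shape/stride metadata rides along in the kernel's parameter buffer and the per-launch allocation and copy disappear. The sketch below is illustrative only: SmallDeviceArray, SumStrides, and LaunchSumStrides are invented names, and onnxruntime's actual TArray may differ in layout and API.

#include <cstdint>
#include <vector>
#include <cuda_runtime.h>

// Invented stand-in for TArray: fixed capacity, trivially copyable, usable on
// both host and device. Because the object is copied into the kernel-argument
// buffer at launch, no cudaMalloc/cudaMemcpyAsync is needed for the metadata.
template <typename T, int32_t Capacity = 8>
struct SmallDeviceArray {
  T data[Capacity];
  int32_t size_ = 0;
  SmallDeviceArray() = default;
  explicit SmallDeviceArray(const std::vector<T>& v)  // assumes v.size() <= Capacity
      : size_(static_cast<int32_t>(v.size())) {
    for (int32_t i = 0; i < size_; ++i) data[i] = v[i];
  }
  __host__ __device__ const T& operator[](int32_t i) const { return data[i]; }
  __host__ __device__ T& operator[](int32_t i) { return data[i]; }
};

// The kernel receives the array by value; reads come from parameter memory.
__global__ void SumStrides(SmallDeviceArray<int64_t> strides, int64_t* out) {
  int64_t acc = 0;
  for (int32_t i = 0; i < strides.size_; ++i) acc += strides[i];
  *out = acc;
}

// New-style launch: build the array on the host stack and pass it directly.
// Old style, for contrast (inside an ORT CudaKernel method):
//   CudaAsyncBuffer<int64_t> buf(this, strides);   // device allocation
//   ORT_RETURN_IF_ERROR(buf.CopyToGpu());          // extra H2D copy per launch
//   ...<<<grid, block>>>(buf.GpuPtr(), ...);       // kernel reads global memory
void LaunchSumStrides(const std::vector<int64_t>& strides, int64_t* device_out) {
  SmallDeviceArray<int64_t> arr(strides);
  SumStrides<<<1, 1>>>(arr, device_out);
}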
5 changes: 2 additions & 3 deletions onnxruntime/contrib_ops/cuda/activation/activations.cc
@@ -27,12 +27,11 @@ namespace cuda {
Status x<T>::ComputeInternal(OpKernelContext* context) const { \
UnaryElementwisePreparation p; \
UnaryElementwise::Prepare(context, &p); \
CudaAsyncBuffer<Ctx##x> func_ctx(this, MakeFuncCtx(), 1); \
if (!std::is_same<CtxNull, Ctx##x>::value) ORT_RETURN_IF_ERROR(func_ctx.CopyToGpu()); \
Ctx##x func_ctx = MakeFuncCtx(); \
Impl_##x<typename ToCudaType<T>::MappedType>( \
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(p.input_tensor->template Data<T>()), \
reinterpret_cast<typename ToCudaType<T>::MappedType*>(p.output_tensor->template MutableData<T>()), \
func_ctx.GpuPtr(), p.output_tensor->Shape().Size()); \
&func_ctx, p.output_tensor->Shape().Size()); \
\
return Status::OK(); \
}
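In the hunk above, the per-launch CudaAsyncBuffer holding the activation's functor context (plus the conditional CopyToGpu) is replaced by a stack-allocated context whose address is handed to Impl_##x; presumably the implementation then forwards the small struct by value into the kernel. A minimal sketch of that call shape, assuming a LeakyRelu-style activation — CtxAlphaSketch, LeakyReluKernelSketch, and Impl_LeakyReluSketch are invented names standing in for the generated Ctx##x / Impl_##x symbols:

#include <cuda_runtime.h>

struct CtxAlphaSketch { float alpha; };   // stand-in for a small Ctx##x POD

template <typename T>
__global__ void LeakyReluKernelSketch(const T* x, T* y, CtxAlphaSketch ctx, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) y[i] = x[i] > T(0) ? x[i] : static_cast<T>(ctx.alpha) * x[i];
}

// Mirrors the new call site: the context lives on the host stack, the Impl
// receives its host address, and *ctx is copied into the launch by value.
template <typename T>
void Impl_LeakyReluSketch(const T* x, T* y, const CtxAlphaSketch* ctx, int n) {
  const int threads = 256;
  const int blocks = (n + threads - 1) / threads;
  LeakyReluKernelSketch<T><<<blocks, threads>>>(x, y, *ctx, n);
}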
5 changes: 2 additions & 3 deletions onnxruntime/core/providers/cuda/activation/activations.cc
@@ -23,12 +23,11 @@ namespace cuda {
Status x<T>::ComputeInternal(OpKernelContext* context) const { \
UnaryElementwisePreparation p; \
UnaryElementwise::Prepare(context, &p); \
CudaAsyncBuffer<Ctx##x> func_ctx(this, MakeFuncCtx(), 1); \
if (!std::is_same<CtxNull, Ctx##x>::value) ORT_RETURN_IF_ERROR(func_ctx.CopyToGpu()); \
Ctx##x func_ctx = MakeFuncCtx(); \
Impl_##x<typename ToCudaType<T>::MappedType>( \
reinterpret_cast<const typename ToCudaType<T>::MappedType*>(p.input_tensor->template Data<T>()), \
reinterpret_cast<typename ToCudaType<T>::MappedType*>(p.output_tensor->template MutableData<T>()), \
func_ctx.GpuPtr(), p.output_tensor->Shape().Size()); \
&func_ctx, p.output_tensor->Shape().Size()); \
\
return Status::OK(); \
}
@@ -13,7 +13,7 @@ template <typename InT, typename OutT, typename FuncT, int NumThreadsPerBlock, i
__global__ void _UnaryElementWise(
const InT* input_data,
OutT* output_data,
const FuncT& functor,
const FuncT functor,
CUDA_LONG N) {
CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x;
InT value[NumElementsPerThread];
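The signature change just above, const FuncT& functor becoming const FuncT functor, ties into the same idea: kernel arguments are marshalled by value into the launch's parameter buffer, which device code can read directly, while a reference parameter is effectively an address. Passing the functor by value guarantees the object itself travels with the launch, which matters now that its context lives only in host memory. A small self-contained illustration (MulBy, ApplyByValue, and LaunchApply are invented names):

#include <cuda_runtime.h>

struct MulBy {                              // small, trivially copyable functor
  float factor;
  __device__ float operator()(float v) const { return v * factor; }
};

// The functor is copied into the kernel's argument buffer at launch, so an
// object built on the host stack is safe to call from device code.
__global__ void ApplyByValue(const float* in, float* out, const MulBy op, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = op(in[i]);
}

void LaunchApply(const float* d_in, float* d_out, int n) {
  MulBy op{2.0f};                           // never explicitly copied to device memory
  ApplyByValue<<<(n + 255) / 256, 256>>>(d_in, d_out, op, n);
}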
9 changes: 4 additions & 5 deletions onnxruntime/core/providers/cuda/math/topk.cc
@@ -45,16 +45,16 @@ TopK<inputk>::TopK(const OpKernelInfo& info) : CudaKernel(info) {
#define TOPKIMPL(T) TopKImpl<T>(this, tensor_X->Data<T>(), \
static_cast<T*>(tensor_V->MutableDataRaw()), \
static_cast<int64_t*>(tensor_I->MutableDataRaw()), \
elem_nums_cuda.GpuPtr(), \
elem_nums_cuda, \
elem_nums.size(), \
axis, K_, largest_, sorted_, N, dimension)

template <bool inputk>
Status TopK<inputk>::ComputeInternal(OpKernelContext* ctx) const {
auto tensor_X = ctx->Input<Tensor>(0);
ORT_ENFORCE(nullptr != tensor_X);
auto rank = static_cast<int64_t>(tensor_X->Shape().NumDimensions());
auto axis = axis_ < 0 ? rank + axis_ : axis_;
int32_t rank = static_cast<int32_t>(tensor_X->Shape().NumDimensions());
int32_t axis = static_cast<int32_t>(axis_ < 0 ? rank + axis_ : axis_);
ORT_ENFORCE(axis > -1 && axis < rank);

if (inputk) {
@@ -80,8 +80,7 @@ Status TopK<inputk>::ComputeInternal(OpKernelContext* ctx) const {
}

auto N = elem_nums[0] / dimension;
CudaAsyncBuffer<int64_t> elem_nums_cuda(this, elem_nums);
ORT_RETURN_IF_ERROR(elem_nums_cuda.CopyToGpu());
TArray<int64_t> elem_nums_cuda(elem_nums);

auto prim_type = tensor_X->DataType()->AsPrimitiveDataType();
if (prim_type == nullptr) {
14 changes: 7 additions & 7 deletions onnxruntime/core/providers/cuda/math/topk_impl.cu
@@ -32,7 +32,7 @@ struct KV {
#define LESS(n, m) ((n) <= (m) ? (n) : (m))

template <typename T>
__global__ void BitonicTopK(const T* X, T* V, int64_t* I, const int64_t* elem_nums, size_t size, int64_t axis, int64_t K, int64_t aligned_K, int64_t largest, int64_t sorted, int64_t dimension, int64_t aligned_dimension, T type_min, T type_max) {
__global__ void BitonicTopK(const T* X, T* V, int64_t* I, const TArray<int64_t> elem_nums, size_t size, int32_t axis, int64_t K, int64_t aligned_K, int64_t largest, int64_t sorted, int64_t dimension, int64_t aligned_dimension, T type_min, T type_max) {
auto tid = threadIdx.x;
auto bid = blockIdx.x;
extern __shared__ char shared_mem[];
@@ -192,7 +192,7 @@ __device__ void SetByte(double* d, int64_t byte) {
}

template<typename T, int64_t THREADS, int64_t KPT>
__global__ void RadixTopK(const T* X, T* V, int64_t* I, const int64_t* elem_nums, size_t size, int64_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t dimension, int64_t XPT, T type_min, T type_max) {
__global__ void RadixTopK(const T* X, T* V, int64_t* I, const TArray<int64_t> elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t dimension, int64_t XPT, T type_min, T type_max) {
auto tid = threadIdx.x;
auto bid = blockIdx.x;
extern __shared__ char shared_mem[];
@@ -342,7 +342,7 @@ __global__ void RadixTopK(const T* X, T* V, int64_t* I, const int64_t* elem_nums
}

template <typename T>
__global__ void FillInput(const T* input_x, T* output_v, int64_t* output_i, const int64_t* elem_nums, size_t size, int64_t axis, int64_t K, int64_t offset, int64_t dimension) {
__global__ void FillInput(const T* input_x, T* output_v, int64_t* output_i, const TArray<int64_t> elem_nums, size_t size, int32_t axis, int64_t K, int64_t offset, int64_t dimension) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, dimension);
auto left = offset / (axis == size - 1 ? 1 : elem_nums[axis + 1]) * elem_nums[axis];
auto right = axis == size - 1 ? 0 : offset % elem_nums[axis + 1];
@@ -352,7 +352,7 @@ __global__ void FillInput(const T* input_x, T* output_v, int64_t* output_i, cons
}

template <typename T>
__global__ void FillOutput(const T* input_v, const int64_t* input_i, T* output_v, int64_t* output_i, const int64_t* elem_nums, size_t size, int64_t axis, int64_t K, int64_t offset, int64_t dimension) {
__global__ void FillOutput(const T* input_v, const int64_t* input_i, T* output_v, int64_t* output_i, const TArray<int64_t> elem_nums, size_t size, int32_t axis, int64_t K, int64_t offset, int64_t dimension) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, K);
auto left = offset / (axis == size - 1 ? 1 : elem_nums[axis + 1]) * elem_nums[axis] * K / dimension;
auto right = axis == size - 1 ? 0 : offset % elem_nums[axis + 1];
@@ -369,7 +369,7 @@ __global__ void ExcludeOutput(int64_t* output_i, int64_t K, int64_t dimension) {
}

template <typename T>
Status TopKImpl(const CudaKernel* kernel, const T* input_x, T* output_v, int64_t* output_i, const int64_t* elem_nums, size_t size, int64_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension) {
Status TopKImpl(const CudaKernel* kernel, const T* input_x, T* output_v, int64_t* output_i, const TArray<int64_t>& elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension) {
auto aligned_K = ALIGN(K);
auto aligned_dimension = ALIGN(dimension);
if (aligned_dimension <= GridDim::maxThreadsPerBlock) {
@@ -419,9 +419,9 @@ Status TopKImpl(const CudaKernel* kernel, const T* input_x, T* output_v, int64_t
const T* input_x, \
T* output_v, \
int64_t* output_i, \
const int64_t* elem_nums, \
const TArray<int64_t>& elem_nums, \
size_t size, \
int64_t axis, \
int32_t axis, \
int64_t K, \
int64_t largest, \
int64_t sorted, \
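One constraint worth keeping in mind, and a plausible reason the commit message keeps CudaAsyncBuffer for non_max_suppression, cudnn_rnn_base, concat, and split: everything passed this way must fit in the kernel's parameter buffer, which has traditionally been limited to 4 KB, so the technique suits small fixed-capacity metadata such as shapes and strides rather than large or variable-sized arrays. A sketch of the kind of compile-time check one might add, reusing the invented SmallDeviceArray stand-in from the earlier sketch (the 4096-byte figure is the classic documented limit, stated here as an assumption about the target toolchain):

#include <cstddef>
#include <cstdint>

constexpr std::size_t kKernelParamBudget = 4096;  // classic CUDA kernel-parameter limit, in bytes

template <typename T, int32_t Capacity = 8>
struct SmallDeviceArray {            // same invented stand-in as above
  T data[Capacity];
  int32_t size_ = 0;
};

// An 8-slot int64_t array plus a handful of pointer/scalar arguments fits easily.
static_assert(sizeof(SmallDeviceArray<int64_t>) + 8 * sizeof(void*) <= kKernelParamBudget,
              "kernel argument list must fit in the parameter buffer");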
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/cuda/math/topk_impl.h
@@ -11,7 +11,7 @@ namespace onnxruntime {
namespace cuda {

template <typename T>
Status TopKImpl(const CudaKernel* kernel, const T* input_x, T* output_v, int64_t* output_i, const int64_t* elem_nums, size_t size, int64_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension);
Status TopKImpl(const CudaKernel* kernel, const T* input_x, T* output_v, int64_t* output_i, const TArray<int64_t>& elem_nums, size_t size, int32_t axis, int64_t K, int64_t largest, int64_t sorted, int64_t N, int64_t dimension);

} // namespace cuda
} // namespace onnxruntime
20 changes: 12 additions & 8 deletions onnxruntime/core/providers/cuda/tensor/expand.cc
@@ -84,13 +84,17 @@ Status Expand::ComputeInternal(OpKernelContext* ctx) const {
CalcEffectiveDims(input_dims, output_dims);
int rank = gsl::narrow_cast<int>(output_dims.size());

CudaAsyncBuffer<fast_divmod> fdm_output_strides(this, rank);
ORT_ENFORCE(CalculateFdmStrides(fdm_output_strides.CpuSpan(), output_dims));
TensorPitches original_input_strides(input_dims);
TensorPitches original_output_strides(output_dims);

CudaAsyncBuffer<int64_t> input_view_strides(this, rank);
TensorPitches::Calculate(input_view_strides.CpuSpan(), input_dims);
for (int i = 0; i < rank; ++i) {
if (input_dims[i] == 1) input_view_strides.CpuSpan()[i] = 0;
TArray<int64_t> input_strides(rank);
for (auto i = 0; i < rank; i++) {
input_strides[i] = input_dims[i] == 1 ? 0 : original_input_strides[i];
}

TArray<fast_divmod> output_strides(rank);
for (auto i = 0; i < rank; i++) {
output_strides[i] = fast_divmod(static_cast<int>(original_output_strides[i]));
}

return ExpandImpl(
@@ -99,8 +103,8 @@ Status Expand::ComputeInternal(OpKernelContext* ctx) const {
gsl::narrow_cast<int>(input_data_tensor.Shape().Size()),
input_data_tensor.DataRaw(),
output_tensor.MutableDataRaw(),
fdm_output_strides,
input_view_strides);
output_strides,
input_strides);
}


24 changes: 11 additions & 13 deletions onnxruntime/core/providers/cuda/tensor/expand_impl.cu
@@ -50,14 +50,14 @@ __global__ void ExpandKernel(
const int N,
const T* input_data,
T* output_data,
const fast_divmod* fdm_output_strides,
const int64_t* input_view_strides) {
const TArray<fast_divmod> output_strides,
const TArray<int64_t> input_strides) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);

int dim, r = id, input_index = 0;
for (int i = 0; i < rank; ++i) {
fdm_output_strides[i].divmod(r, dim, r);
input_index += dim * input_view_strides[i];
output_strides[i].divmod(r, dim, r);
input_index += dim * input_strides[i];
}
output_data[id] = input_data[input_index];
}
@@ -114,9 +114,9 @@ Status ExpandImpl(
const int N_input,
const void* input_data,
void* output_data,
CudaKernel::CudaAsyncBuffer<fast_divmod>& fdm_output_strides,
CudaKernel::CudaAsyncBuffer<int64_t>& input_view_strides) {
const int rank = static_cast<int>(fdm_output_strides.count());
const TArray<fast_divmod>& output_strides,
const TArray<int64_t>& input_strides) {
const int rank = static_cast<int>(output_strides.size_);
if (rank == 1) {
if (N_input == N_output) {
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(output_data, input_data, N_output * element_size, cudaMemcpyDeviceToDevice));
@@ -125,20 +125,18 @@ Status ExpandImpl(
}
} else if (rank == 2) {
return Expand2D(element_size, N_output, input_data, output_data,
fdm_output_strides.CpuSpan()[0],
static_cast<int>(input_view_strides.CpuSpan()[0]),
static_cast<int>(input_view_strides.CpuSpan()[1]));
output_strides[0],
static_cast<int>(input_strides[0]),
static_cast<int>(input_strides[1]));
}

int blocksPerGrid = gsl::narrow_cast<int>(CeilDiv(N_output, GridDim::maxThreadsPerBlock));
fdm_output_strides.CopyToGpu();
input_view_strides.CopyToGpu();

#define EXPAND_ON(TYPE) \
case sizeof(TYPE): \
ExpandKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>( \
rank, N_output, reinterpret_cast<const TYPE*>(input_data), reinterpret_cast<TYPE*>(output_data), \
fdm_output_strides.GpuPtr(), input_view_strides.GpuPtr()); \
output_strides, input_strides); \
break

switch (element_size) {
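For reference, ExpandKernel above maps each flat output index to an input index by peeling off one coordinate per dimension with the precomputed output strides (fast_divmod performs the divide/modulo with a precomputed multiply-and-shift so the kernel avoids hardware integer division) and accumulating input strides, where dimensions being broadcast carry a stride of 0 as set up in expand.cc. A plain host-side sketch of the same mapping, written with ordinary division for clarity — ExpandIndexReference is an invented name, not part of the codebase:

#include <cstddef>
#include <cstdint>
#include <vector>

// Maps a flat output element id to the flat input element it is expanded from.
// output_strides: pitches of the output shape; input_strides: pitches of the
// input shape, with 0 in every dimension that is being broadcast.
int64_t ExpandIndexReference(int64_t id,
                             const std::vector<int64_t>& output_strides,
                             const std::vector<int64_t>& input_strides) {
  int64_t remainder = id;
  int64_t input_index = 0;
  for (std::size_t i = 0; i < output_strides.size(); ++i) {
    const int64_t dim = remainder / output_strides[i];   // coordinate in dimension i
    remainder = remainder % output_strides[i];
    input_index += dim * input_strides[i];               // stride 0 => broadcast dim
  }
  return input_index;
}

// Example: input shape {1, 3} expanded to {2, 3} gives output_strides = {3, 1}
// and input_strides = {0, 1}; output id 4 decodes to (1, 1) and reads input 1.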
4 changes: 2 additions & 2 deletions onnxruntime/core/providers/cuda/tensor/expand_impl.h
@@ -17,8 +17,8 @@ Status ExpandImpl(
const int N_input,
const void* input_data,
void* output_data,
CudaKernel::CudaAsyncBuffer<fast_divmod>& fdm_output_strides,
CudaKernel::CudaAsyncBuffer<int64_t>& input_view_strides);
const TArray<fast_divmod>& output_strides,
const TArray<int64_t>& input_strides);


} // namespace cuda
22 changes: 11 additions & 11 deletions onnxruntime/core/providers/cuda/tensor/gather_elements.cc
@@ -32,7 +32,7 @@ Status GatherElements::ComputeInternal(OpKernelContext* context) const {
const auto* indices_tensor = context->Input<Tensor>(1);
const auto& indices_shape = indices_tensor->Shape();
const auto& indices_dims = indices_shape.GetDims();
const int64_t indices_rank = static_cast<int64_t>(indices_dims.size());
const int32_t indices_rank = static_cast<int32_t>(indices_dims.size());
const int64_t indices_size = indices_shape.Size();

// Handle negative axis if any
@@ -51,13 +51,13 @@ Status GatherElements::ComputeInternal(OpKernelContext* context) const {
return Status::OK();

TensorPitches input_strides(input_dims);
CudaAsyncBuffer<int64_t> gpu_input_strides(this, input_strides);
TArray<int64_t> gpu_input_strides(input_strides);

CudaAsyncBuffer<fast_divmod> fdm_indices_strides(this, indices_rank);
ORT_ENFORCE(CalculateFdmStrides(fdm_indices_strides.CpuSpan(), indices_dims));

ORT_RETURN_IF_ERROR(gpu_input_strides.CopyToGpu());
ORT_RETURN_IF_ERROR(fdm_indices_strides.CopyToGpu());
TArray<fast_divmod> fdm_indices_strides(indices_rank);
TensorPitches indices_strides(indices_dims);
for (auto i = 0; i < indices_rank; i++) {
fdm_indices_strides[i] = fast_divmod(static_cast<int>(indices_strides[i]));
}

size_t element_size = input_tensor->DataType()->Size();

@@ -67,10 +67,10 @@ Status GatherElements::ComputeInternal(OpKernelContext* context) const {
input_rank,
input_tensor->DataRaw(),
input_dims[axis],
gpu_input_strides.GpuPtr(),
gpu_input_strides,
indices_data,
indices_size,
fdm_indices_strides.GpuPtr(),
fdm_indices_strides,
axis,
output_tensor->MutableDataRaw(),
element_size);
@@ -81,10 +81,10 @@ Status GatherElements::ComputeInternal(OpKernelContext* context) const {
input_rank,
input_tensor->DataRaw(),
input_dims[axis],
gpu_input_strides.GpuPtr(),
gpu_input_strides,
indices_data,
indices_size,
fdm_indices_strides.GpuPtr(),
fdm_indices_strides,
axis,
output_tensor->MutableDataRaw(),
element_size);
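For context on what these strides feed: GatherElements produces an output shaped like indices, where each element copies the input element whose coordinates match its own except along axis, where the coordinate comes from indices (negative values wrap around the axis length). The kernel, collapsed in this view, decodes each flat indices offset with fdm_indices_strides and re-encodes the resulting coordinates against gpu_input_strides. A host-side reference of those semantics, for illustration only — GatherOneElement is an invented name and is not the CUDA kernel's actual code:

#include <cstddef>
#include <cstdint>
#include <vector>

// Reference semantics for a single element: decode the flat `indices` offset
// into coordinates, substitute the gathered index along `axis`, then re-encode
// the coordinates against the input strides.
template <typename T, typename TIndex>
T GatherOneElement(const std::vector<T>& input,
                   const std::vector<int64_t>& input_strides,
                   int64_t input_dim_along_axis,
                   const std::vector<TIndex>& indices,
                   const std::vector<int64_t>& indices_strides,
                   int64_t axis,
                   int64_t flat_indices_offset) {
  int64_t remainder = flat_indices_offset;
  int64_t input_offset = 0;
  for (std::size_t dim = 0; dim < indices_strides.size(); ++dim) {
    int64_t coord = remainder / indices_strides[dim];
    remainder = remainder % indices_strides[dim];
    if (static_cast<int64_t>(dim) == axis) {
      int64_t idx = static_cast<int64_t>(indices[flat_indices_offset]);
      if (idx < 0) idx += input_dim_along_axis;          // negative indices wrap
      coord = idx;
    }
    input_offset += coord * input_strides[dim];
  }
  return input[input_offset];
}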
16 changes: 8 additions & 8 deletions onnxruntime/core/providers/cuda/tensor/gather_elements_impl.cu
@@ -12,10 +12,10 @@ __global__ void _GatherElementsKernel(
const int64_t rank,
const T* input_data,
const int64_t input_dim_along_axis,
const int64_t* input_strides,
const TArray<int64_t> input_strides,
const Tin* indices_data,
const int64_t indices_size,
const fast_divmod* indices_strides,
const TArray<fast_divmod> indices_strides,
const int64_t axis,
T* output_data) {
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(indices_index, indices_size);
@@ -43,10 +43,10 @@ void GatherElementsImpl(
const int64_t rank,
const void* input_data,
const int64_t input_dim_along_axis,
const int64_t* input_strides,
const TArray<int64_t>& input_strides,
const Tin* indices_data,
const int64_t indices_size,
const fast_divmod* indices_strides,
const TArray<fast_divmod>& indices_strides,
const int64_t axis,
void* output_data,
size_t element_size) {
@@ -95,10 +95,10 @@ template void GatherElementsImpl<int32_t>(
const int64_t rank,
const void* input_data,
const int64_t input_dim_along_axis,
const int64_t* input_strides,
const TArray<int64_t>& input_strides,
const int32_t* indices_data,
const int64_t indices_size,
const fast_divmod* indices_strides,
const TArray<fast_divmod>& indices_strides,
const int64_t axis,
void* output_data,
size_t element_size);
@@ -107,10 +107,10 @@ template void GatherElementsImpl<int64_t>(
const int64_t rank,
const void* input_data,
const int64_t input_dim_along_axis,
const int64_t* input_strides,
const TArray<int64_t>& input_strides,
const int64_t* indices_data,
const int64_t indices_size,
const fast_divmod* indices_strides,
const TArray<fast_divmod>& indices_strides,
const int64_t axis,
void* output_data,
size_t element_size);
4 changes: 2 additions & 2 deletions onnxruntime/core/providers/cuda/tensor/gather_elements_impl.h
@@ -14,10 +14,10 @@ void GatherElementsImpl(
const int64_t rank, // both inputs have same rank and this is validated in the main Compute
const void* input_data,
const int64_t input_dim_along_axis,
const int64_t* input_strides,
const TArray<int64_t>& input_strides,
const Tin* indices_data,
const int64_t indices_size,
const fast_divmod* indices_strides,
const TArray<fast_divmod>& indices_strides,
const int64_t axis,
void* output_data,
size_t element_size);