Skip to content

Commit

Permalink
Merge branch 'feat/replace_opplugin_by_aclnn' of https://github.com/D…
Browse files Browse the repository at this point in the history
…eepLink-org/DIOPI into feat/replace_opplugin_by_aclnn
  • Loading branch information
yangbofun committed Jun 12, 2024
2 parents d7be303 + 532bb11 commit 20decf8
Show file tree
Hide file tree
Showing 10 changed files with 115 additions and 191 deletions.
1 change: 1 addition & 0 deletions impl/ascend/aclnn/adaptor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ inline aclTensor* createAclTensorFromDiopiTensor(diopiConstTensorHandle_t tensor
if (tensor == nullptr) {
return nullptr;
}

diopiSize_t shape{};
diopiGetTensorShape(tensor, &shape);
diopiSize_t stride{};
Expand Down
2 changes: 2 additions & 0 deletions impl/ascend/ascend_tensor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,8 @@ aclFormat inferAclDataFormat(int64_t dim, const int64_t* shape, const int64_t* s
return ACL_FORMAT_NHWC;
}
std::call_once(warningFlag, warnOnUnsupportedFormat, __FILE__, __LINE__, __FUNCTION__);
} else if (dim == 3) {
return ACL_FORMAT_NCL;
}
return ACL_FORMAT_ND;
}
Expand Down
5 changes: 5 additions & 0 deletions impl/ascend/device_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -808,6 +808,11 @@
rtol=5e-2,
atol_half=5e-2,
rtol_half=5e-2,
para=dict(
# for aclnnGroupNorm, eps must be larger than 0.
# aclnnGroupNorm does not support float16 input
eps=[Skip(-1), Skip(0)],
),
tensor_para=dict(
args=[
{
Expand Down
154 changes: 28 additions & 126 deletions impl/ascend/functions/batch_norm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,146 +4,48 @@
* @copyright (c) 2023, DeepLink.
*/

#include "../common/acloprunner.hpp"
#include "../aclnn/acl_scalar.hpp"
#include "../aclnn/adaptor.hpp"

namespace impl {
namespace ascend {

/**
 * @brief Adjusts the dimensionality of a batch-norm input tensor in place.
 *
 * - 2-D input: unsqueeze is applied at index 2 and then at index 3.
 * - 3-D input: unsqueeze is applied at index 3.
 * - 5-D input in inference mode (training == false): the tensor is viewed with
 *   a 4-entry shape whose last entry is shape(3) * shape(4).
 * - Any other case: the tensor is left untouched.
 *
 * @param inputAt   Tensor wrapper to modify.
 * @param training  Whether batch norm runs in training mode; only consulted for 5-D input.
 */
void updateInputAscendTensorDim(AscendTensor& inputAt, bool training) {
    switch (inputAt.dim()) {
        case 2:
            inputAt.unsqueeze(2);
            inputAt.unsqueeze(3);
            break;
        case 3:
            inputAt.unsqueeze(3);
            break;
        case 5:
            if (!training) {
                // Merge the two trailing axes into one.
                std::vector<int64_t> mergedShape{inputAt.shape(0), inputAt.shape(1), inputAt.shape(2), inputAt.shape(3) * inputAt.shape(4)};
                inputAt.view(mergedShape);
            }
            break;
        default:
            break;
    }
}

/**
 * @brief Runs the "update grad" batch-norm kernel that writes gradWeight and gradBias.
 *
 * The operator is "BN3DTrainingUpdateGrad" when inputAt is 5-D and
 * "BNTrainingUpdateGrad" otherwise. Its inputs are, in order: gradOutputAt,
 * inputAt, saveMean, saveInvstd. Its outputs are, in order: gradWeight, gradBias.
 *
 * @param ctx           DIOPI context handle.
 * @param gradWeight    First output of the operator.
 * @param gradBias      Second output of the operator.
 * @param gradOutputAt  First input of the operator.
 * @param inputAt       Second input; its rank selects the operator name.
 * @param saveMean      Third input.
 * @param saveInvstd    Fourth input.
 * @param eps           Passed to the operator as the float attribute "epsilon".
 */
void batchNormBackwardTrainingUpdate(diopiContextHandle_t ctx, diopiTensorHandle_t gradWeight, diopiTensorHandle_t gradBias, AscendTensor gradOutputAt,
                                     AscendTensor inputAt, diopiConstTensorHandle_t saveMean, diopiConstTensorHandle_t saveInvstd, double eps) {
    std::string opName;
    if (5 == inputAt.dim()) {
        opName = "BN3DTrainingUpdateGrad";
    } else {
        opName = "BNTrainingUpdateGrad";
    }

    AclOpRunner<4, 2>(opName, ctx)
        .addInput(gradOutputAt)
        .addInput(inputAt)
        .addInput(saveMean)
        .addInput(saveInvstd)
        .addOutput(gradWeight)
        .addOutput(gradBias)
        .setAttr<float>("epsilon", static_cast<float>(eps))
        .run();
}

/**
 * @brief Runs the "reduce grad" batch-norm kernel that writes gradInputAt.
 *
 * The operator is "BN3DTrainingReduceGrad" when inputAt is 5-D and
 * "BNTrainingReduceGrad" otherwise. Its inputs are, in order: gradOutputAt,
 * inputAt, gradWeight, gradBias, weight, saveMean, saveInvstd. Its single
 * output is gradInputAt.
 *
 * @param ctx           DIOPI context handle.
 * @param gradInputAt   Output of the operator.
 * @param gradWeight    Third input.
 * @param gradBias      Fourth input.
 * @param gradOutputAt  First input.
 * @param inputAt       Second input; its rank selects the operator name.
 * @param weight        Fifth input.
 * @param saveMean      Sixth input.
 * @param saveInvstd    Seventh input.
 * @param eps           Passed to the operator as the float attribute "epsilon".
 */
void batchNormBackwardTrainingReduceNocheck(diopiContextHandle_t ctx, AscendTensor gradInputAt, diopiConstTensorHandle_t gradWeight,
                                            diopiConstTensorHandle_t gradBias, AscendTensor gradOutputAt, AscendTensor inputAt,
                                            diopiConstTensorHandle_t weight, diopiConstTensorHandle_t saveMean, diopiConstTensorHandle_t saveInvstd,
                                            double eps) {
    std::string opName;
    if (5 == inputAt.dim()) {
        opName = "BN3DTrainingReduceGrad";
    } else {
        opName = "BNTrainingReduceGrad";
    }

    AclOpRunner<7, 1>(opName, ctx)
        .addInput(gradOutputAt)
        .addInput(inputAt)
        .addInput(gradWeight)
        .addInput(gradBias)
        .addInput(weight)
        .addInput(saveMean)
        .addInput(saveInvstd)
        .addOutput(gradInputAt)
        .setAttr<float>("epsilon", static_cast<float>(eps))
        .run();
}

/**
 * @brief Batch normalization forward pass, delegated to the aclnnBatchNorm kernel.
 *
 * All arguments are forwarded unchanged to aclnnBatchNorm through
 * DIOPI_ASCEND_CALL_ACLNN. No reshaping, temporary tensors or additional
 * operators are involved, so each output is written by exactly one kernel.
 *
 * @param ctx          DIOPI context handle.
 * @param out          Output tensor.
 * @param saveMean     Output tensor passed to the kernel as its saved-mean result.
 * @param saveInvstd   Output tensor passed to the kernel as its saved-invstd result.
 * @param input        Input tensor.
 * @param weight       Scale tensor, forwarded as-is.
 * @param bias         Shift tensor, forwarded as-is.
 * @param runningMean  Running-mean tensor, forwarded as-is.
 * @param runningVar   Running-variance tensor, forwarded as-is.
 * @param training     Training-mode flag forwarded to the kernel.
 * @param momentum     Momentum value forwarded to the kernel.
 * @param eps          Epsilon value forwarded to the kernel.
 * @return diopiSuccess once the kernel call has been issued.
 */
diopiError_t diopiBatchNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t saveMean, diopiTensorHandle_t saveInvstd,
                            diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, diopiTensorHandle_t runningMean,
                            diopiTensorHandle_t runningVar, bool training, double momentum, double eps) {
    // NOTE(review): a single aclnn call replaces the former AclOpRunner sequence
    // (BNInfer / BNTrainingReduce / BNTrainingUpdate). Running both would compute
    // the outputs twice and presumably update the running statistics twice per step.
    DIOPI_ASCEND_CALL_ACLNN(aclnnBatchNorm, ctx, input, weight, bias, runningMean, runningVar, training, momentum, eps, out, saveMean, saveInvstd);
    return diopiSuccess;
}

/**
 * @brief Batch normalization backward pass, delegated to the aclnnBatchNormBackward kernel.
 *
 * Builds a three-entry mask saying which gradients were requested (an entry is
 * true when the corresponding output handle is non-null) and forwards
 * everything to aclnnBatchNormBackward through DIOPI_ASCEND_CALL_ACLNN.
 *
 * @param ctx          DIOPI context handle.
 * @param gradInput    Output gradient w.r.t. input; may be nullptr (mask entry 0 becomes false).
 * @param gradWeight   Output gradient w.r.t. weight; may be nullptr (mask entry 1 becomes false).
 * @param gradBias     Output gradient w.r.t. bias; may be nullptr (mask entry 2 becomes false).
 * @param gradOutput   Incoming gradient.
 * @param input        Forward input tensor.
 * @param weight       Scale tensor, forwarded as-is.
 * @param runningMean  Running-mean tensor, forwarded as-is.
 * @param runningVar   Running-variance tensor, forwarded as-is.
 * @param saveMean     Saved mean from the forward pass, forwarded as-is.
 * @param saveInvstd   Saved inverse std from the forward pass, forwarded as-is.
 * @param training     Training-mode flag forwarded to the kernel.
 * @param eps          Epsilon value forwarded to the kernel.
 * @return diopiSuccess once the kernel call has been issued.
 */
diopiError_t diopiBatchNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_t gradInput, diopiTensorHandle_t gradWeight, diopiTensorHandle_t gradBias,
                                    diopiConstTensorHandle_t gradOutput, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight,
                                    diopiConstTensorHandle_t runningMean, diopiConstTensorHandle_t runningVar, diopiConstTensorHandle_t saveMean,
                                    diopiConstTensorHandle_t saveInvstd, bool training, double eps) {
    // Order of the mask entries: {gradInput, gradWeight, gradBias}.
    std::array<bool, 3> gradMask = {gradInput != nullptr, gradWeight != nullptr, gradBias != nullptr};
    DIOPI_ASCEND_CALL_ACLNN(aclnnBatchNormBackward,
                            ctx,
                            gradOutput,
                            input,
                            weight,
                            runningMean,
                            runningVar,
                            saveMean,
                            saveInvstd,
                            training,
                            eps,
                            gradMask,
                            gradInput,
                            gradWeight,
                            gradBias);
    return diopiSuccess;
}

Expand Down
10 changes: 6 additions & 4 deletions impl/ascend/functions/dropout.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ diopiError_t npuDropoutOut(diopiContextHandle_t ctx, diopiTensorHandle_t out, di
diopiError_t ret = diopiRequireTensor(ctx, &maskNpu, &maskSize, nullptr, diopi_dtype_uint8, diopi_device);
ASCEND_CHECK_ABORT(ret == diopiSuccess, "[npuDropoutOut] require tensor for mask failed.");

uint64_t seed, offset;
DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
const uint64_t seed = gen.first;
const uint64_t offset = gen.second;

DIOPI_ASCEND_CALL_ACLNN(aclnnDropoutGenMask, ctx, inAt.shape(), p, seed, offset, maskNpu);
DIOPI_ASCEND_CALL_ACLNN(aclnnDropoutDoMask, ctx, input, maskNpu, p, out);
Expand Down Expand Up @@ -57,8 +58,9 @@ diopiError_t npuDropout2dOut(diopiContextHandle_t ctx, diopiTensorHandle_t out,
diopiError_t ret = diopiRequireTensor(ctx, &maskNpu, &maskNpuSize, nullptr, diopi_dtype_uint8, diopi_device);
ASCEND_CHECK_ABORT(ret == diopiSuccess, "[npuDropout2dOut] require tensor for mask failed.");

uint64_t seed, offset;
DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
const uint64_t seed = gen.first;
const uint64_t offset = gen.second;

DIOPI_ASCEND_CALL_ACLNN(aclnnDropoutGenMask, ctx, inAt.shape(), p, seed, offset, maskNpu);
DIOPI_ASCEND_CALL_ACLNN(aclnnDropoutDoMask, ctx, input2d, maskNpu, p, out2d);
Expand Down
57 changes: 42 additions & 15 deletions impl/ascend/functions/group_norm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,31 +4,58 @@
* @copyright (c) 2023, DeepLink.
*/

#include "../common/acloprunner.hpp"
#include <cmath>

#include "../aclnn/acl_scalar.hpp"
#include "../aclnn/adaptor.hpp"
#include "../common/utils.hpp"

namespace impl {
namespace ascend {

/**
 * @brief Group normalization forward pass, delegated to the aclnnGroupNorm kernel.
 *
 * The input is described to the kernel as (n, c, hw): n = shape(0),
 * c = shape(1) and hw = numel / (n * c), i.e. every axis after the first two
 * folded into one extent.
 *
 * If the input is undefined or has zero elements the function returns
 * immediately without launching a kernel and without writing to any output.
 *
 * @param ctx         DIOPI context handle.
 * @param out         Output tensor.
 * @param saveMean    Output tensor passed to the kernel as its saved-mean result.
 * @param saveInvstd  Output tensor passed to the kernel as its saved-invstd result.
 * @param input       Input tensor; shape(0) and shape(1) are read as n and c.
 * @param weight      Scale tensor, forwarded as-is.
 * @param bias        Shift tensor, forwarded as-is.
 * @param numGroups   Number of groups forwarded to the kernel.
 * @param eps         Epsilon value forwarded to the kernel.
 * @return diopiSuccess in all cases handled here.
 */
DIOPI_API diopiError_t diopiGroupNorm(diopiContextHandle_t ctx, diopiTensorHandle_t out, diopiTensorHandle_t saveMean, diopiTensorHandle_t saveInvstd,
                                      diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight, diopiConstTensorHandle_t bias, int64_t numGroups,
                                      double eps) {
    AscendTensor inputAt(input);
    if (!inputAt.defined() || inputAt.numel() == 0) {
        return diopiSuccess;
    }

    // numel != 0 here, so n and c are both non-zero and the division is safe.
    int64_t n = inputAt.shape(0);
    int64_t c = inputAt.shape(1);
    int64_t hw = inputAt.numel() / (n * c);

    DIOPI_ASCEND_CALL_ACLNN(aclnnGroupNorm, ctx, input, weight, bias, n, c, hw, numGroups, eps, out, saveMean, saveInvstd);
    return diopiSuccess;
}

/**
 * @brief Group normalization backward pass built on aclnn kernels.
 *
 * Behaviour by input state:
 *  - undefined input: returns immediately.
 *  - non-empty input: calls aclnnGroupNormBackward with the input described as
 *    (n, c, hw) = (shape(0), shape(1), numel / (n * c)) and a three-entry mask
 *    {gradInput, gradWeight, gradBias} that is true for each non-null handle.
 *  - empty input (numel == 0): gradBias is zeroed in place; gradWeight is zeroed
 *    in place when shape(0) or shape(1) is 0, and filled with NaN otherwise.
 *
 * @param ctx         DIOPI context handle.
 * @param gradInput   Output gradient w.r.t. input.
 * @param gradWeight  Output gradient w.r.t. weight.
 * @param gradBias    Output gradient w.r.t. bias.
 * @param gradOutput  Incoming gradient.
 * @param input       Forward input tensor.
 * @param weight      Scale tensor, forwarded as-is.
 * @param mean        Saved mean, forwarded as-is.
 * @param rstd        Saved reciprocal std, forwarded as-is.
 * @param numGroups   Number of groups forwarded to the kernel.
 * @return diopiSuccess in all cases handled here.
 */
diopiError_t diopiGroupNormBackward(diopiContextHandle_t ctx, diopiTensorHandle_t gradInput, diopiTensorHandle_t gradWeight, diopiTensorHandle_t gradBias,
                                    diopiConstTensorHandle_t gradOutput, diopiConstTensorHandle_t input, diopiConstTensorHandle_t weight,
                                    diopiConstTensorHandle_t mean, diopiConstTensorHandle_t rstd, int64_t numGroups) {
    AscendTensor inputAt(input);
    AscendTensor gradWeightAt(gradWeight);

    if (!inputAt.defined()) {
        return diopiSuccess;
    }

    // Regular path: at least one element to normalize.
    if (inputAt.numel() != 0) {
        int64_t n = inputAt.shape(0);
        int64_t c = inputAt.shape(1);
        int64_t hw = inputAt.numel() / (n * c);

        std::array<bool, 3> gradMask = {gradInput != nullptr, gradWeight != nullptr, gradBias != nullptr};
        DIOPI_ASCEND_CALL_ACLNN(
            aclnnGroupNormBackward, ctx, gradOutput, inputAt, mean, rstd, weight, n, c, hw, numGroups, gradMask, gradInput, gradWeightAt, gradBias);
        return diopiSuccess;
    }

    // Empty input: no backward kernel is launched, the parameter gradients are filled directly.
    DIOPI_ASCEND_CALL_ACLNN(aclnnInplaceZero, ctx, gradBias);
    if (inputAt.shape(0) != 0 && inputAt.shape(1) != 0) {
        // Batch and channel extents are non-zero, so the emptiness comes from a later axis.
        diopiScalar_t nanValue = constructDiopiScalarT(gradWeightAt.dtype(), std::nanf(""));
        DIOPI_ASCEND_CALL_ACLNN(aclnnInplaceFillScalar, ctx, gradWeightAt, &nanValue);
    } else {
        DIOPI_ASCEND_CALL_ACLNN(aclnnInplaceZero, ctx, gradWeight);
    }
    return diopiSuccess;
}

Expand Down
25 changes: 15 additions & 10 deletions impl/ascend/functions/normal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ diopiError_t diopiNormal(diopiContextHandle_t ctx, diopiTensorHandle_t out, doub
return diopiSuccess;
}

uint64_t seed, offset;
DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
const uint64_t seed = gen.first;
const uint64_t offset = gen.second;

float meanCast = static_cast<float>(mean);
float rstdCast = static_cast<float>(std);
Expand All @@ -26,8 +27,9 @@ diopiError_t diopiNormal(diopiContextHandle_t ctx, diopiTensorHandle_t out, doub
}

diopiError_t diopiNormalInp(diopiContextHandle_t ctx, diopiTensorHandle_t inout, double mean, double std, diopiGeneratorHandle_t generator) {
uint64_t seed, offset;
DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
const uint64_t seed = gen.first;
const uint64_t offset = gen.second;

float meanCast = static_cast<float>(mean);
float rstdCast = static_cast<float>(std);
Expand All @@ -42,8 +44,9 @@ diopiError_t diopiNormalTensor(diopiContextHandle_t ctx, diopiTensorHandle_t out
return diopiSuccess;
}

uint64_t seed, offset;
DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
const uint64_t seed = gen.first;
const uint64_t offset = gen.second;

DIOPI_ASCEND_CALL_ACLNN(aclnnNormalTensorTensor, ctx, mean, std, seed, offset, out);
return diopiSuccess;
Expand All @@ -56,8 +59,9 @@ diopiError_t diopiNormalScalarTensor(diopiContextHandle_t ctx, diopiTensorHandle
return diopiSuccess;
}

uint64_t seed, offset;
DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
const uint64_t seed = gen.first;
const uint64_t offset = gen.second;

float meanCast = static_cast<float>(mean);
DIOPI_ASCEND_CALL_ACLNN(aclnnNormalFloatTensor, ctx, meanCast, std, seed, offset, out);
Expand All @@ -71,8 +75,9 @@ diopiError_t diopiNormalTensorScalar(diopiContextHandle_t ctx, diopiTensorHandle
return diopiSuccess;
}

uint64_t seed, offset;
DIOPI_CALL(diopiGeneratorGetSeedAndOffset(generator, &seed, &offset));
const std::pair<uint64_t, uint64_t> gen = getSeedAndOffset(ctx, generator, 10);
const uint64_t seed = gen.first;
const uint64_t offset = gen.second;

float rstdCast = static_cast<float>(std);
DIOPI_ASCEND_CALL_ACLNN(aclnnNormalTensorFloat, ctx, mean, rstdCast, seed, offset, out);
Expand Down
Loading

0 comments on commit 20decf8

Please sign in to comment.