From 22affbcb28bf078b0b4f3f2e4cd1ed08f0932a51 Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Wed, 4 Sep 2024 21:07:04 +0800 Subject: [PATCH 1/7] [webgpu-native] Add transpose op --- .../core/providers/webgpu/tensor/transpose.cc | 113 ++++++++++++++++++ .../core/providers/webgpu/tensor/transpose.h | 37 ++++++ .../webgpu/webgpu_execution_provider.cc | 6 +- 3 files changed, 154 insertions(+), 2 deletions(-) create mode 100644 onnxruntime/core/providers/webgpu/tensor/transpose.cc create mode 100644 onnxruntime/core/providers/webgpu/tensor/transpose.h diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.cc b/onnxruntime/core/providers/webgpu/tensor/transpose.cc new file mode 100644 index 0000000000000..67f1d164a73f1 --- /dev/null +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.cc @@ -0,0 +1,113 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/common/inlined_containers.h" +#include "core/providers/webgpu/tensor/transpose.h" +#include "core/providers/cpu/tensor/utils.h" +#include "core/providers/webgpu/shader_variable.h" +#include "core/providers/webgpu/shader_helper.h" + +namespace onnxruntime { +namespace webgpu { + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Transpose, + kOnnxDomain, + 1, 12, + kCudaExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), + Transpose); + +ONNX_OPERATOR_KERNEL_EX( + Transpose, + kOnnxDomain, + 13, + kCudaExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), + Transpose); + +ONNX_OPERATOR_KERNEL_EX( + Transpose, + kOnnxDomain, + 17, + kCudaExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), + Transpose); + +const std::string permFunctionBody(const std::string& input_name, const std::string& output_name, const gsl::span& perm) { + std::ostringstream ss; + ss.imbue(std::locale::classic()); + + ss << "fn perm(i: " << output_name << "_indices_t" + << ")->" << input_name << "_indices_t " + << "{\n var a: " << input_name << "_indices_t;\n"; + for (auto i = 0; i < perm.size(); ++i) { + ss << " a[" << perm[i] << "] = i[" << i << "];\n"; + } + ss << " return a;\n}\n"; + return ss.str(); +} + +Status TransposeProgram::GenerateShaderCode(ShaderHelper& shader) const { + const auto input_name{"x"}; + const auto output_name{"y"}; + const auto& input = shader.AddInput(input_name, + ToProgramVariableDataType(Inputs()[0].tensor->GetElementType()), + ShaderVariable::UseUniform | ShaderVariable::UseIndicesTypeAlias); + const auto& output = shader.AddOutput(output_name, + ToProgramVariableDataType(Outputs()[0].tensor->GetElementType()), + ShaderVariable::UseUniform | ShaderVariable::UseIndicesTypeAlias); + shader.AppendImplementation(permFunctionBody(input_name, output_name, this->perm_)); + shader.MainFunctionBody(shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size"), + " let indices = ", output.OffsetToIndices("global_idx"),";\n", " let x_indices = perm(indices); \n", + output.SetByOffset("global_idx", input.GetByIndices("x_indices"))); + return Status::OK(); +} + +Status Transpose::ComputeInternal(ComputeContext& context) const { + const auto* input_tensor = context.Input(0); + const TensorShape& input_shape = input_tensor->Shape(); + int32_t rank = gsl::narrow_cast(input_shape.NumDimensions()); + + TensorShapeVector output_dims(rank); + InlinedVector default_perm(rank); + const InlinedVector* p_perm = nullptr; + const auto& status = ComputeOutputShape(*input_tensor, output_dims, default_perm, p_perm); + if (!status.IsOK()) + return status; + TensorShape output_shape(output_dims); + auto* output_tensor = context.Output(0, output_shape); + + SafeInt vec_size = input_tensor->Shape().Size(); + TransposeProgram program{"Transpose", *p_perm}; + program + .Inputs({{input_tensor, ProgramTensorMetadataDependency::Rank}}) + .Outputs({output_tensor}) + .DispatchGroupSize((vec_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) + .UniformVariables({ + {static_cast(vec_size)}, + }); + return context.RunProgram(program); +} + +#define WEBGPU_TRANSPOSE_KERNEL(OP_TYPE, VERSION, KERNEL_CLASS, TYPE) \ + ONNX_OPERATOR_KERNEL_EX( \ + OP_TYPE, kOnnxDomain, VERSION, kWebGpuExecutionProvider, \ + KernelDefBuilder().TypeConstraint("T", TYPE), \ + KERNEL_CLASS); + +#define WEBGPU_TRANSPOSE_VERSIONED_KERNEL(OP_TYPE, VERSION_FROM, VERSION_TO, KERNEL_CLASS, TYPE) \ + ONNX_OPERATOR_VERSIONED_KERNEL_EX( \ + OP_TYPE, kOnnxDomain, VERSION_FROM, VERSION_TO, kWebGpuExecutionProvider, \ + KernelDefBuilder().TypeConstraint("T", TYPE), \ + KERNEL_CLASS); + +WEBGPU_TRANSPOSE_VERSIONED_KERNEL(Transpose, 1, 12, Transpose, WebGpuSupportedFloatTypes()) +WEBGPU_TRANSPOSE_KERNEL(Transpose, 13, Transpose, WebGpuSupportedFloatTypes()) +WEBGPU_TRANSPOSE_KERNEL(Transpose, 17, Transpose, WebGpuSupportedFloatTypes()) + +} // namespace webgpu +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.h b/onnxruntime/core/providers/webgpu/tensor/transpose.h new file mode 100644 index 0000000000000..45f828807a8be --- /dev/null +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.h @@ -0,0 +1,37 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/webgpu/webgpu_supported_types.h" +#include "core/providers/cpu/tensor/transpose.h" +#include "core/providers/webgpu/webgpu_kernel.h" +#include "core/providers/webgpu/program.h" + +namespace onnxruntime { +namespace webgpu { + +class TransposeProgram final : public Program { + public: + TransposeProgram(const std::string& kernel_name, const gsl::span& permutations) + : Program{kernel_name}, perm_(permutations.begin(), permutations.end()) { + } + + Status GenerateShaderCode(ShaderHelper& sh) const override; + + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32}); + + private: + InlinedVector perm_; +}; + +class Transpose final : public WebGpuKernel, public TransposeBase { + public: + Transpose(const OpKernelInfo& info) : WebGpuKernel{info}, TransposeBase{info} { + } + + Status ComputeInternal(ComputeContext& context) const override; +}; + +} // namespace webgpu +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index decc74b59cae6..91642ac2b2308 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -240,6 +240,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 16, class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 12, Transpose); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, Transpose); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 17, Transpose); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, DepthToSpace); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, DepthToSpace); @@ -552,8 +553,9 @@ std::unique_ptr RegisterKernels() { // KERNEL_CREATE_INFO_VERSIONED(9, 15, Where), // KERNEL_CREATE_INFO(16, Where), - // BuildKernelCreateInfo, - // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, From 97451481bf2a73f9149e6fa1070fbdeb916380ca Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Thu, 5 Sep 2024 10:23:16 +0800 Subject: [PATCH 2/7] Nit --- onnxruntime/core/providers/webgpu/tensor/transpose.cc | 4 ++-- onnxruntime/core/providers/webgpu/tensor/transpose.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.cc b/onnxruntime/core/providers/webgpu/tensor/transpose.cc index 67f1d164a73f1..64edfd25f2e88 100644 --- a/onnxruntime/core/providers/webgpu/tensor/transpose.cc +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.cc @@ -61,8 +61,8 @@ Status TransposeProgram::GenerateShaderCode(ShaderHelper& shader) const { ToProgramVariableDataType(Outputs()[0].tensor->GetElementType()), ShaderVariable::UseUniform | ShaderVariable::UseIndicesTypeAlias); shader.AppendImplementation(permFunctionBody(input_name, output_name, this->perm_)); - shader.MainFunctionBody(shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size"), - " let indices = ", output.OffsetToIndices("global_idx"),";\n", " let x_indices = perm(indices); \n", + shader.MainFunctionBody(shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.vec_size"), + " let indices = ", output.OffsetToIndices("global_idx"), ";\n", " let x_indices = perm(indices); \n", output.SetByOffset("global_idx", input.GetByIndices("x_indices"))); return Status::OK(); } diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.h b/onnxruntime/core/providers/webgpu/tensor/transpose.h index 45f828807a8be..3deea90a6212f 100644 --- a/onnxruntime/core/providers/webgpu/tensor/transpose.h +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.h @@ -19,7 +19,7 @@ class TransposeProgram final : public Program { Status GenerateShaderCode(ShaderHelper& sh) const override; - WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32}); + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"vec_size", ProgramUniformVariableDataType::Uint32}); private: InlinedVector perm_; From a9c375957c1af373a24312740e59864fa93d944a Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Fri, 6 Sep 2024 11:24:56 +0800 Subject: [PATCH 3/7] Fix comments --- .../core/providers/webgpu/tensor/transpose.cc | 24 +++++++++---------- .../core/providers/webgpu/tensor/transpose.h | 6 ++--- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.cc b/onnxruntime/core/providers/webgpu/tensor/transpose.cc index 64edfd25f2e88..5dd4c1d61859f 100644 --- a/onnxruntime/core/providers/webgpu/tensor/transpose.cc +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.cc @@ -37,10 +37,9 @@ ONNX_OPERATOR_KERNEL_EX( .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), Transpose); -const std::string permFunctionBody(const std::string& input_name, const std::string& output_name, const gsl::span& perm) { +const std::string AppendPermFunction(std::string_view input_name, std::string_view output_name, gsl::span perm) { std::ostringstream ss; ss.imbue(std::locale::classic()); - ss << "fn perm(i: " << output_name << "_indices_t" << ")->" << input_name << "_indices_t " << "{\n var a: " << input_name << "_indices_t;\n"; @@ -60,9 +59,11 @@ Status TransposeProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& output = shader.AddOutput(output_name, ToProgramVariableDataType(Outputs()[0].tensor->GetElementType()), ShaderVariable::UseUniform | ShaderVariable::UseIndicesTypeAlias); - shader.AppendImplementation(permFunctionBody(input_name, output_name, this->perm_)); - shader.MainFunctionBody(shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.vec_size"), - " let indices = ", output.OffsetToIndices("global_idx"), ";\n", " let x_indices = perm(indices); \n", + shader.AppendImplementation(AppendPermFunction(input_name, output_name, this->perm_)); + shader.MainFunctionBody(shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size"), + " let indices = ", output.OffsetToIndices("global_idx"), + ";\n" + " let x_indices = perm(indices); \n", output.SetByOffset("global_idx", input.GetByIndices("x_indices"))); return Status::OK(); } @@ -75,20 +76,19 @@ Status Transpose::ComputeInternal(ComputeContext& context) const { TensorShapeVector output_dims(rank); InlinedVector default_perm(rank); const InlinedVector* p_perm = nullptr; - const auto& status = ComputeOutputShape(*input_tensor, output_dims, default_perm, p_perm); - if (!status.IsOK()) - return status; + ORT_RETURN_IF_ERROR(ComputeOutputShape(*input_tensor, output_dims, default_perm, p_perm)); TensorShape output_shape(output_dims); auto* output_tensor = context.Output(0, output_shape); - SafeInt vec_size = input_tensor->Shape().Size(); - TransposeProgram program{"Transpose", *p_perm}; + uint32_t output_size = gsl::narrow_cast(input_tensor->Shape().Size()); + TransposeProgram program{*p_perm}; program + .CacheHint(absl::StrJoin(*p_perm, "-")) .Inputs({{input_tensor, ProgramTensorMetadataDependency::Rank}}) .Outputs({output_tensor}) - .DispatchGroupSize((vec_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) + .DispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) .UniformVariables({ - {static_cast(vec_size)}, + {static_cast(output_size)}, }); return context.RunProgram(program); } diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.h b/onnxruntime/core/providers/webgpu/tensor/transpose.h index 3deea90a6212f..3ca5674d5dfab 100644 --- a/onnxruntime/core/providers/webgpu/tensor/transpose.h +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.h @@ -13,13 +13,13 @@ namespace webgpu { class TransposeProgram final : public Program { public: - TransposeProgram(const std::string& kernel_name, const gsl::span& permutations) - : Program{kernel_name}, perm_(permutations.begin(), permutations.end()) { + TransposeProgram(const gsl::span& permutations) + : Program{"Transpose"}, perm_(permutations.begin(), permutations.end()) { } Status GenerateShaderCode(ShaderHelper& sh) const override; - WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"vec_size", ProgramUniformVariableDataType::Uint32}); + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32}); private: InlinedVector perm_; From 92cec7d04457b6245ae798834062289dc8a91fa2 Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Tue, 10 Sep 2024 10:05:00 +0800 Subject: [PATCH 4/7] Rebase --- onnxruntime/core/providers/webgpu/tensor/transpose.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.cc b/onnxruntime/core/providers/webgpu/tensor/transpose.cc index 5dd4c1d61859f..e4ec2c2628026 100644 --- a/onnxruntime/core/providers/webgpu/tensor/transpose.cc +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.cc @@ -54,10 +54,8 @@ Status TransposeProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto input_name{"x"}; const auto output_name{"y"}; const auto& input = shader.AddInput(input_name, - ToProgramVariableDataType(Inputs()[0].tensor->GetElementType()), ShaderVariable::UseUniform | ShaderVariable::UseIndicesTypeAlias); const auto& output = shader.AddOutput(output_name, - ToProgramVariableDataType(Outputs()[0].tensor->GetElementType()), ShaderVariable::UseUniform | ShaderVariable::UseIndicesTypeAlias); shader.AppendImplementation(AppendPermFunction(input_name, output_name, this->perm_)); shader.MainFunctionBody(shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size"), From ed4134fd67759a0960ba69a1aef524888556cebc Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Tue, 10 Sep 2024 12:25:52 +0800 Subject: [PATCH 5/7] Rebase 2 --- .../core/providers/webgpu/tensor/transpose.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.cc b/onnxruntime/core/providers/webgpu/tensor/transpose.cc index e4ec2c2628026..b751d2931e7d9 100644 --- a/onnxruntime/core/providers/webgpu/tensor/transpose.cc +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.cc @@ -58,11 +58,11 @@ Status TransposeProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& output = shader.AddOutput(output_name, ShaderVariable::UseUniform | ShaderVariable::UseIndicesTypeAlias); shader.AppendImplementation(AppendPermFunction(input_name, output_name, this->perm_)); - shader.MainFunctionBody(shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size"), - " let indices = ", output.OffsetToIndices("global_idx"), - ";\n" - " let x_indices = perm(indices); \n", - output.SetByOffset("global_idx", input.GetByIndices("x_indices"))); + shader.SetMainFunctionBody(shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size"), + " let indices = ", output.OffsetToIndices("global_idx"), + ";\n" + " let x_indices = perm(indices); \n", + output.SetByOffset("global_idx", input.GetByIndices("x_indices"))); return Status::OK(); } @@ -82,10 +82,10 @@ Status Transpose::ComputeInternal(ComputeContext& context) const { TransposeProgram program{*p_perm}; program .CacheHint(absl::StrJoin(*p_perm, "-")) - .Inputs({{input_tensor, ProgramTensorMetadataDependency::Rank}}) - .Outputs({output_tensor}) - .DispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) - .UniformVariables({ + .AddInputs({{input_tensor, ProgramTensorMetadataDependency::Rank}}) + .AddOutputs({output_tensor}) + .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) + .AddUniformVariables({ {static_cast(output_size)}, }); return context.RunProgram(program); From 65f549cbbce87bdd1c9c7d1b3098d0b1232301b3 Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Wed, 11 Sep 2024 10:14:22 +0800 Subject: [PATCH 6/7] Merge version --- onnxruntime/core/providers/webgpu/tensor/transpose.cc | 9 +-------- .../core/providers/webgpu/webgpu_execution_provider.cc | 6 ++---- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.cc b/onnxruntime/core/providers/webgpu/tensor/transpose.cc index b751d2931e7d9..7efd79e76e890 100644 --- a/onnxruntime/core/providers/webgpu/tensor/transpose.cc +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.cc @@ -91,12 +91,6 @@ Status Transpose::ComputeInternal(ComputeContext& context) const { return context.RunProgram(program); } -#define WEBGPU_TRANSPOSE_KERNEL(OP_TYPE, VERSION, KERNEL_CLASS, TYPE) \ - ONNX_OPERATOR_KERNEL_EX( \ - OP_TYPE, kOnnxDomain, VERSION, kWebGpuExecutionProvider, \ - KernelDefBuilder().TypeConstraint("T", TYPE), \ - KERNEL_CLASS); - #define WEBGPU_TRANSPOSE_VERSIONED_KERNEL(OP_TYPE, VERSION_FROM, VERSION_TO, KERNEL_CLASS, TYPE) \ ONNX_OPERATOR_VERSIONED_KERNEL_EX( \ OP_TYPE, kOnnxDomain, VERSION_FROM, VERSION_TO, kWebGpuExecutionProvider, \ @@ -104,8 +98,7 @@ Status Transpose::ComputeInternal(ComputeContext& context) const { KERNEL_CLASS); WEBGPU_TRANSPOSE_VERSIONED_KERNEL(Transpose, 1, 12, Transpose, WebGpuSupportedFloatTypes()) -WEBGPU_TRANSPOSE_KERNEL(Transpose, 13, Transpose, WebGpuSupportedFloatTypes()) -WEBGPU_TRANSPOSE_KERNEL(Transpose, 17, Transpose, WebGpuSupportedFloatTypes()) +WEBGPU_TRANSPOSE_VERSIONED_KERNEL(Transpose, 13, 20, Transpose, WebGpuSupportedFloatTypes()) } // namespace webgpu } // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc index 91642ac2b2308..ae5b429fb2301 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc @@ -239,8 +239,7 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxD class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 16, Where); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, 12, Transpose); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, Transpose); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 17, Transpose); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, 20, Transpose); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 11, 12, DepthToSpace); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 13, DepthToSpace); @@ -554,8 +553,7 @@ std::unique_ptr RegisterKernels() { // KERNEL_CREATE_INFO(16, Where), BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, From d77ab42a297e5cef38a9d8682a54ea84f954b9fc Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 11 Sep 2024 00:25:26 -0700 Subject: [PATCH 7/7] fix and revise transpose (naive) --- .../core/providers/webgpu/tensor/transpose.cc | 63 +++++++++---------- .../providers/webgpu/webgpu_supported_types.h | 6 +- 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.cc b/onnxruntime/core/providers/webgpu/tensor/transpose.cc index 7efd79e76e890..68af858d515c2 100644 --- a/onnxruntime/core/providers/webgpu/tensor/transpose.cc +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.cc @@ -6,6 +6,7 @@ #include "core/providers/cpu/tensor/utils.h" #include "core/providers/webgpu/shader_variable.h" #include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/webgpu_supported_types.h" namespace onnxruntime { namespace webgpu { @@ -14,59 +15,66 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( Transpose, kOnnxDomain, 1, 12, - kCudaExecutionProvider, + kWebGpuExecutionProvider, (*KernelDefBuilder::Create()) - .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), + .TypeConstraint("T", WebGpuSupportedNumberTypes()), Transpose); -ONNX_OPERATOR_KERNEL_EX( +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Transpose, + kOnnxDomain, + 13, 20, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", WebGpuSupportedNumberTypes()), + Transpose); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( Transpose, kOnnxDomain, - 13, - kCudaExecutionProvider, + 21, 22, + kWebGpuExecutionProvider, (*KernelDefBuilder::Create()) - .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), + .TypeConstraint("T", WebGpuSupportedNumberTypes()), Transpose); ONNX_OPERATOR_KERNEL_EX( Transpose, kOnnxDomain, - 17, - kCudaExecutionProvider, + 23, + kWebGpuExecutionProvider, (*KernelDefBuilder::Create()) - .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()), + .TypeConstraint("T", WebGpuSupportedNumberTypes()), Transpose); -const std::string AppendPermFunction(std::string_view input_name, std::string_view output_name, gsl::span perm) { +const std::string AppendPermFunction(gsl::span perm) { std::ostringstream ss; ss.imbue(std::locale::classic()); - ss << "fn perm(i: " << output_name << "_indices_t" - << ")->" << input_name << "_indices_t " - << "{\n var a: " << input_name << "_indices_t;\n"; + ss << "fn perm(i: y_indices_t)->x_indices_t {\n" + " var a: x_indices_t;\n"; for (auto i = 0; i < perm.size(); ++i) { ss << " a[" << perm[i] << "] = i[" << i << "];\n"; } - ss << " return a;\n}\n"; + ss << " return a;\n" + "}\n"; return ss.str(); } Status TransposeProgram::GenerateShaderCode(ShaderHelper& shader) const { - const auto input_name{"x"}; - const auto output_name{"y"}; - const auto& input = shader.AddInput(input_name, - ShaderVariable::UseUniform | ShaderVariable::UseIndicesTypeAlias); - const auto& output = shader.AddOutput(output_name, - ShaderVariable::UseUniform | ShaderVariable::UseIndicesTypeAlias); - shader.AppendImplementation(AppendPermFunction(input_name, output_name, this->perm_)); + const auto& input = shader.AddInput("x", ShaderVariable::UseUniform | ShaderVariable::UseIndicesTypeAlias); + const auto& output = shader.AddOutput("y", ShaderVariable::UseUniform | ShaderVariable::UseIndicesTypeAlias); + shader.AppendImplementation(AppendPermFunction(this->perm_)); shader.SetMainFunctionBody(shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size"), " let indices = ", output.OffsetToIndices("global_idx"), ";\n" - " let x_indices = perm(indices); \n", + " let x_indices = perm(indices); \n" + " ", output.SetByOffset("global_idx", input.GetByIndices("x_indices"))); return Status::OK(); } Status Transpose::ComputeInternal(ComputeContext& context) const { + // TODO: there is an optimized version of transpose to port. const auto* input_tensor = context.Input(0); const TensorShape& input_shape = input_tensor->Shape(); int32_t rank = gsl::narrow_cast(input_shape.NumDimensions()); @@ -82,7 +90,7 @@ Status Transpose::ComputeInternal(ComputeContext& context) const { TransposeProgram program{*p_perm}; program .CacheHint(absl::StrJoin(*p_perm, "-")) - .AddInputs({{input_tensor, ProgramTensorMetadataDependency::Rank}}) + .AddInputs({{input_tensor, ProgramTensorMetadataDependency::TypeAndRank}}) .AddOutputs({output_tensor}) .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) .AddUniformVariables({ @@ -91,14 +99,5 @@ Status Transpose::ComputeInternal(ComputeContext& context) const { return context.RunProgram(program); } -#define WEBGPU_TRANSPOSE_VERSIONED_KERNEL(OP_TYPE, VERSION_FROM, VERSION_TO, KERNEL_CLASS, TYPE) \ - ONNX_OPERATOR_VERSIONED_KERNEL_EX( \ - OP_TYPE, kOnnxDomain, VERSION_FROM, VERSION_TO, kWebGpuExecutionProvider, \ - KernelDefBuilder().TypeConstraint("T", TYPE), \ - KERNEL_CLASS); - -WEBGPU_TRANSPOSE_VERSIONED_KERNEL(Transpose, 1, 12, Transpose, WebGpuSupportedFloatTypes()) -WEBGPU_TRANSPOSE_VERSIONED_KERNEL(Transpose, 13, 20, Transpose, WebGpuSupportedFloatTypes()) - } // namespace webgpu } // namespace onnxruntime diff --git a/onnxruntime/core/providers/webgpu/webgpu_supported_types.h b/onnxruntime/core/providers/webgpu/webgpu_supported_types.h index fccaef2c53575..ff66cd535399e 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_supported_types.h +++ b/onnxruntime/core/providers/webgpu/webgpu_supported_types.h @@ -8,7 +8,7 @@ namespace onnxruntime { namespace webgpu { -using SupportedTypes = +using SupportedNumberTypes = TypeList< float, MLFloat16, @@ -20,8 +20,8 @@ using SupportedFloats = float, MLFloat16>; -inline const std::vector& WebGpuSupportedDataTypes() { - static const std::vector supportedDataTypes = BuildKernelDefConstraintsFromTypeList(); +inline const std::vector& WebGpuSupportedNumberTypes() { + static const std::vector supportedDataTypes = BuildKernelDefConstraintsFromTypeList(); return supportedDataTypes; }