From de64e53b496e5aec1c69a0a24e3c5343c936e10b Mon Sep 17 00:00:00 2001 From: Jiajia Qin Date: Wed, 13 Nov 2024 04:37:19 +0800 Subject: [PATCH] [js/webgpu] Optimize Expand (#22752) Use components = 4 if possible. llama3.2-1B becomes 20 tokens/s from 18 tokens/s on my iGPUs. --- js/web/lib/wasm/jsep/webgpu/ops/expand.ts | 18 +++++--- js/web/test/data/ops/expand.jsonc | 50 +++++++++++++++++++++++ 2 files changed, 63 insertions(+), 5 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/expand.ts b/js/web/lib/wasm/jsep/webgpu/ops/expand.ts index 4e2bfa9d89924..3691b5ecb602b 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/expand.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/expand.ts @@ -48,11 +48,18 @@ const createExpandProgramInfo = (inputs: readonly TensorView[]): ProgramInfo => const shape = Array.from(inputs[1].getBigInt64Array(), Number); const outputShape: number[] = calculateOutputShape(inputShape, shape); const dataType = inputs[0].dataType; - const components = dataType === DataType.bool ? 4 : 1; + const isBoolOrScalar = dataType === DataType.bool || ShapeUtil.size(inputShape) === 1; + const iComponents = + dataType === DataType.bool ? 4 : inputShape.length > 0 && inputShape[inputShape.length - 1] % 4 === 0 ? 4 : 1; + const components = isBoolOrScalar + ? 4 + : outputShape.length > 0 && outputShape[outputShape.length - 1] % 4 === 0 + ? 4 + : 1; const outputSize = Math.ceil(ShapeUtil.size(outputShape) / components); const getShaderSource = (shaderHelper: ShaderHelper) => { - const input = inputVariable('input', dataType, inputShape.length, components); + const input = inputVariable('input', dataType, inputShape.length, iComponents); const output = outputVariable('output', dataType, outputShape.length, components); let assignment: string; if (dataType === DataType.bool) { @@ -74,9 +81,10 @@ const createExpandProgramInfo = (inputs: readonly TensorView[]): ProgramInfo => }`; } else { assignment = ` - let outputIndices = ${output.offsetToIndices('global_idx')}; + let outputIndices = ${output.offsetToIndices(`global_idx * ${components}`)}; let inputOffset = ${input.broadcastedIndicesToOffset('outputIndices', output)}; - ${output.setByOffset('global_idx', input.getByOffset('inputOffset'))} + let data = ${output.type.value}(${input.getByOffset(`inputOffset / ${iComponents}`)}); + ${output.setByOffset('global_idx', 'data')} }`; } return ` @@ -92,7 +100,7 @@ const createExpandProgramInfo = (inputs: readonly TensorView[]): ProgramInfo => ]; return { name: 'Expand', - shaderCache: { hint: `${outputShape.length}`, inputDependencies: ['rank'] }, + shaderCache: { hint: `${outputShape.length};${iComponents}${components}`, inputDependencies: ['rank'] }, getShaderSource, getRunData: () => ({ outputs: [{ dims: outputShape, dataType: inputs[0].dataType }], diff --git a/js/web/test/data/ops/expand.jsonc b/js/web/test/data/ops/expand.jsonc index 613b4507b2b15..8fbe9339feb9b 100644 --- a/js/web/test/data/ops/expand.jsonc +++ b/js/web/test/data/ops/expand.jsonc @@ -134,6 +134,56 @@ "type": "float32" } ] + }, + { + "name": "Expand in components = 1, out components = 4", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [3, 2, 1], + "type": "float32" + }, + { + "data": [3, 1, 8], + "dims": [3], + "type": "int64" + } + ], + "outputs": [ + { + "data": [ + 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, + 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6 + ], + "dims": [3, 2, 8], + "type": "float32" + } + ] + }, + { + "name": "Expand in components = 4, out components = 4", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + "dims": [1, 1, 2, 8], + "type": "float32" + }, + { + "data": [2, 1, 8], + "dims": [3], + "type": "int64" + } + ], + "outputs": [ + { + "data": [ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16 + ], + "dims": [1, 2, 2, 8], + "type": "float32" + } + ] } ] },