From d94fababcb00324e5d118fde64f5cb6bc199c6ed Mon Sep 17 00:00:00 2001 From: Jiajia Qin Date: Fri, 30 Sep 2022 12:39:04 +0800 Subject: [PATCH] webgpu: Support AdapterInfo (#6862) Support sequential access mode Use sequential mode for intel device --- tfjs-backend-webgpu/src/adapter_info.ts | 37 ++++ tfjs-backend-webgpu/src/backend_webgpu.ts | 5 +- tfjs-backend-webgpu/src/base.ts | 4 +- tfjs-backend-webgpu/src/conv2d_mm_webgpu.ts | 9 +- .../src/kernels/BatchMatMul_impl.ts | 6 +- .../src/kernels/Conv2D_impl.ts | 4 +- .../src/matmul_packed_webgpu.ts | 201 ++++++++++++------ 7 files changed, 191 insertions(+), 75 deletions(-) create mode 100644 tfjs-backend-webgpu/src/adapter_info.ts diff --git a/tfjs-backend-webgpu/src/adapter_info.ts b/tfjs-backend-webgpu/src/adapter_info.ts new file mode 100644 index 00000000000..e6f40f959bf --- /dev/null +++ b/tfjs-backend-webgpu/src/adapter_info.ts @@ -0,0 +1,37 @@ +/** + * @license + * Copyright 2022 Google LLC. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * ============================================================================= + */ + +// TODO: Remove it once webgpu/types is successfully upgraded. +// https://github.com/tensorflow/tfjs/issues/6869 +export interface GPUAdapterInfo { + vendor: string; + architecture: string; +} + +export class AdapterInfo { + private vendor: string; + + constructor(adapterInfo: GPUAdapterInfo) { + if (adapterInfo) { + this.vendor = adapterInfo.vendor; + } + } + + isIntel(): boolean { + return this.vendor === 'intel'; + } +} diff --git a/tfjs-backend-webgpu/src/backend_webgpu.ts b/tfjs-backend-webgpu/src/backend_webgpu.ts index 4126e7295a5..421261b93e6 100644 --- a/tfjs-backend-webgpu/src/backend_webgpu.ts +++ b/tfjs-backend-webgpu/src/backend_webgpu.ts @@ -19,6 +19,7 @@ import './flags_webgpu'; import {backend_util, buffer, DataStorage, DataType, engine, env, GPUData, KernelBackend, Rank, RecursiveArray, ShapeMap, TensorBuffer, TensorInfo, TimingInfo, TypedArray, util} from '@tensorflow/tfjs-core'; +import {AdapterInfo, GPUAdapterInfo} from './adapter_info'; import {BufferManager} from './buffer_manager'; import {TextureManager} from './texture_manager'; import * as webgpu_program from './webgpu_program'; @@ -107,6 +108,7 @@ const reshapeDispatch = export class WebGPUBackend extends KernelBackend { bufferManager: BufferManager; + adapterInfo: AdapterInfo; device: GPUDevice; queue: GPUQueue; tensorMap: DataStorage; @@ -135,7 +137,7 @@ export class WebGPUBackend extends KernelBackend { return WebGPUBackend.nextDataId++; } - constructor(device: GPUDevice) { + constructor(device: GPUDevice, adapterInfo?: GPUAdapterInfo) { super(); if (!webgpu_util.isWebGPUSupported()) { throw new Error('WebGPU is not supported on this device'); @@ -146,6 +148,7 @@ export class WebGPUBackend extends KernelBackend { this.currentCommandEncoder = null; this.currentComputePass = null; this.supportTimeQuery = device.features.has('timestamp-query'); + this.adapterInfo = new AdapterInfo(adapterInfo); this.bufferManager = new BufferManager(this.device); this.textureManager = new TextureManager(this.device); diff --git a/tfjs-backend-webgpu/src/base.ts b/tfjs-backend-webgpu/src/base.ts index 4e8df2a72b9..9802b583d2f 100644 --- a/tfjs-backend-webgpu/src/base.ts +++ b/tfjs-backend-webgpu/src/base.ts @@ -50,7 +50,9 @@ if (isWebGPUSupported()) { deviceDescriptor.requiredFeatures = ['timestamp-query']; } const device: GPUDevice = await adapter.requestDevice(deviceDescriptor); - return new WebGPUBackend(device); + // tslint:disable-next-line:no-any + const adapterInfo = await (adapter as any).requestAdapterInfo(); + return new WebGPUBackend(device, adapterInfo); }, 3 /*priority*/); } diff --git a/tfjs-backend-webgpu/src/conv2d_mm_webgpu.ts b/tfjs-backend-webgpu/src/conv2d_mm_webgpu.ts index 077bece398a..55a3c3e1df8 100644 --- a/tfjs-backend-webgpu/src/conv2d_mm_webgpu.ts +++ b/tfjs-backend-webgpu/src/conv2d_mm_webgpu.ts @@ -176,12 +176,13 @@ export class Conv2DMMProgram implements WebGPUProgram { tileInner: number; innerElementSize: number; isVec4?: boolean; + private sequentialAccessByThreads: boolean; constructor( convInfo: backend_util.Conv2DInfo, dimAOuter: number, dimBOuter: number, dimInner: number, addBias = false, activation: backend_util.Activation = null, - hasPreluActivationWeights = false) { + hasPreluActivationWeights = false, sequentialAccessByThreads = false) { this.outputShape = convInfo.outShape; this.isChannelsLast = convInfo.dataFormat === 'channelsLast'; this.isVec4 = @@ -229,6 +230,7 @@ export class Conv2DMMProgram implements WebGPUProgram { } } + this.sequentialAccessByThreads = sequentialAccessByThreads; this.addBias = addBias; this.activation = activation; this.hasPreluActivationWeights = hasPreluActivationWeights; @@ -244,7 +246,8 @@ export class Conv2DMMProgram implements WebGPUProgram { this.shaderKey = `conv2DMM_${this.elementsPerThread}_${this.activation}}_${ this.fitAOuter}_${this.fitBOuter}_${this.fitInner}_${this.isVec4}_${ - this.innerElementSize}_${this.isChannelsLast}`; + this.innerElementSize}_${this.isChannelsLast}_${ + this.sequentialAccessByThreads}`; } getUserCode(): string { @@ -254,7 +257,7 @@ export class Conv2DMMProgram implements WebGPUProgram { this.tileInner) : makeMatMulPackedSource( this.elementsPerThread, this.workGroupSize, !this.isChannelsLast, - this.tileInner); + this.tileInner, false, null, this.sequentialAccessByThreads); const elementsSize = this.isVec4 ? [this.innerElementSize, 4, 4] : [1, 1, 1]; const userCode = ` diff --git a/tfjs-backend-webgpu/src/kernels/BatchMatMul_impl.ts b/tfjs-backend-webgpu/src/kernels/BatchMatMul_impl.ts index eca42437746..366e59aefab 100644 --- a/tfjs-backend-webgpu/src/kernels/BatchMatMul_impl.ts +++ b/tfjs-backend-webgpu/src/kernels/BatchMatMul_impl.ts @@ -184,9 +184,13 @@ export function batchMatMulImpl({ activation, preluActivationWeights); break; case MatMulProgramType.MatMulPackedProgram: + // Experiments show that sequential access is more friendly for Intel + // GPUs. + const sequentialAccessByThreads = backend.adapterInfo.isIntel(); program = new MatMulPackedProgram( a3dShape, outputShape, batchAEqualOne, batchBEqualOne, transposeA, - transposeB, bias, activation, preluActivationWeights); + transposeB, bias, activation, preluActivationWeights, + sequentialAccessByThreads); break; default: throw new Error(`Unsupported MatMulProgramType ${matmulProgramType}.`); diff --git a/tfjs-backend-webgpu/src/kernels/Conv2D_impl.ts b/tfjs-backend-webgpu/src/kernels/Conv2D_impl.ts index cb6552abff2..e83859f0a09 100644 --- a/tfjs-backend-webgpu/src/kernels/Conv2D_impl.ts +++ b/tfjs-backend-webgpu/src/kernels/Conv2D_impl.ts @@ -229,9 +229,11 @@ export function conv2DImpl({ {type: 'int32', data: [dimAOuter]}, {type: 'int32', data: [dimBOuter]}, {type: 'int32', data: [dimInner]}); + // Experiments show that sequential access is more friendly for Intel GPUs. + const sequentialAccessByThreads = backend.adapterInfo.isIntel(); program = new Conv2DMMProgram( convInfo, dimAOuter, dimBOuter, dimInner, hasBias, activation, - hasPreluActivationWeights); + hasPreluActivationWeights, sequentialAccessByThreads); } const intermediates: TensorInfo[] = []; diff --git a/tfjs-backend-webgpu/src/matmul_packed_webgpu.ts b/tfjs-backend-webgpu/src/matmul_packed_webgpu.ts index 9ed926f29f7..8322a7fbbde 100644 --- a/tfjs-backend-webgpu/src/matmul_packed_webgpu.ts +++ b/tfjs-backend-webgpu/src/matmul_packed_webgpu.ts @@ -39,8 +39,7 @@ export function matMulReadFnSource( `value = getB(batch, row, col);`; return ` - fn mm_readA(batchIn: i32, row: i32, colIn: i32) -> ${ - typeSnippet(component)} { + fn mm_readA(batchIn: i32, row: i32, colIn: i32) -> ${typeSnippet(component)} { var value = ${typeSnippet(component)}(0.0); let col = colIn * ${component}; ${ @@ -58,8 +57,7 @@ export function matMulReadFnSource( return value; } - fn mm_readB(batchIn: i32, row: i32, colIn: i32) -> ${ - typeSnippet(component)} { + fn mm_readB(batchIn: i32, row: i32, colIn: i32) -> ${typeSnippet(component)} { let col = colIn * ${component}; let batch = ${batchBEqualOne ? '0' : 'batchIn'}; var value = ${typeSnippet(component)}(0.0); @@ -270,10 +268,12 @@ const readDataFromSubASnippet = (transposeA: boolean) => { 'let ACached = mm_Asub[tileRow + innerRow][k];'; }; +// sequentialAccessByThreads means sequential data in memory is accessed by +// threads, instead of a single thread (default behavior). export function makeMatMulPackedSource( workPerThread: number[], workGroupSize: [number, number, number], - transposeA = false, tileInner = 32, splitK = false, - splitedDimInner = 32): string { + transposeA = false, tileInner = 32, splitK = false, splitedDimInner = 32, + sequentialAccessByThreads = false): string { const tileAOuter = workPerThread[1] * workGroupSize[1]; const tileBOuter = workPerThread[0] * workGroupSize[0]; const tileAWidth = transposeA ? tileAOuter : tileInner; @@ -290,69 +290,31 @@ export function makeMatMulPackedSource( const rowPerThreadA = tileAHight / workGroupSize[1]; const colPerThreadA = tileAWidth / workGroupSize[0]; const rowPerThreadB = tileInner / workGroupSize[1]; - return ` - var mm_Asub : array, ${tileAHight}>; - var mm_Bsub : array, ${tileInner}>; - const RowPerThread = ${workPerThread[1]}; - const ColPerThread = ${workPerThread[0]}; - const TileInner = ${tileInner}; - - @compute @workgroup_size(workGroupSizeX, workGroupSizeY, workGroupSizeZ) - fn _start(@builtin(local_invocation_id) LocalId : vec3, - @builtin(global_invocation_id) GlobalId : vec3, - @builtin(num_workgroups) NumWorkgroups: vec3, - @builtin(workgroup_id) workgroupId: vec3) { - localId = LocalId; - globalId = GlobalId; - numWorkgroups = NumWorkgroups; - - let tileRow = i32(localId.y) * RowPerThread; - let tileCol = i32(localId.x) * ColPerThread; - - let globalRow = i32(globalId.y) * RowPerThread; - let globalCol = i32(globalId.x) * ColPerThread; - let batch = ${splitK ? '0' : 'i32(globalId.z)'}; + const matmulSnippet = sequentialAccessByThreads ? + ` + let localRow = i32(localId.y); + let localCol = i32(localId.x); let globalRowStart = i32(workgroupId.y) * ${tileAOuter}; + let globalColStart = i32(workgroupId.x) * ${tileBOuter}; - let numTiles = ${ - splitK ? `${Math.ceil(splitedDimInner / tileInner)}` : - '(uniforms.dimInner - 1) / TileInner + 1'}; - var kStart = ${splitK ? `i32(globalId.z) * ${splitedDimInner}` : '0'}; - - var acc : array, RowPerThread>; - - // Without this initialization strange values show up in acc. - for (var innerRow = 0; innerRow < RowPerThread; innerRow = innerRow + 1) { - for (var innerCol = 0; innerCol < ColPerThread; innerCol = innerCol + 1) { - acc[innerRow][innerCol] = 0.0; - } - } - - let tileRowA = i32(localId.y) * ${rowPerThreadA}; - let tileColA = i32(localId.x) * ${colPerThreadA}; - let tileRowB = i32(localId.y) * ${rowPerThreadB}; // Loop over shared dimension. for (var t = 0; t < numTiles; t = t + 1) { // Load one tile of A into local memory. - for (var innerRow = 0; innerRow < ${ - rowPerThreadA}; innerRow = innerRow + 1) { - for (var innerCol = 0; innerCol < ${ - colPerThreadA}; innerCol = innerCol + 1) { - let inputRow = tileRowA + innerRow; - let inputCol = tileColA + innerCol; + for (var inputRow = localRow; inputRow < ${ + tileAHight}; inputRow = inputRow + ${workGroupSize[1]}) { + for (var inputCol = localCol; inputCol < ${ + tileAWidth}; inputCol = inputCol + ${workGroupSize[0]}) { ${writeDataToSubASnippet(transposeA)} } } - // Load one tile of B into local memory. - for (var innerRow = 0; innerRow < ${ - rowPerThreadB}; innerRow = innerRow + 1) { - for (var innerCol = 0; innerCol < ColPerThread; innerCol = innerCol + 1) { - let inputRow = tileRowB + innerRow; - let inputCol = tileCol + innerCol; + for (var inputRow = localRow; inputRow < ${ + tileInner}; inputRow = inputRow + ${workGroupSize[1]}) { + for (var inputCol = localCol; inputCol < ${ + tileBOuter}; inputCol = inputCol + ${workGroupSize[0]}) { mm_Bsub[inputRow][inputCol] = mm_readB(batch, kStart + inputRow, - globalCol + innerCol); + globalColStart + inputCol); } } kStart = kStart + TileInner; @@ -362,26 +324,124 @@ export function makeMatMulPackedSource( var BCached : array; for (var k = 0; k < TileInner; k = k + 1) { for (var inner = 0; inner < ColPerThread; inner = inner + 1) { - BCached[inner] = mm_Bsub[k][tileCol + inner]; + BCached[inner] = mm_Bsub[k][localCol + inner * ${workGroupSize[0]}]; } - for (var innerRow = 0; innerRow < RowPerThread; innerRow = innerRow + 1) { - ${readDataFromSubASnippet(transposeA)} + let ACached = ${ + transposeA ? + `mm_Asub[k][localRow + innerRow * ${workGroupSize[1]}];` : + `mm_Asub[localRow + innerRow * ${workGroupSize[1]}][k];`} for (var innerCol = 0; innerCol < ColPerThread; innerCol = innerCol + 1) { - acc[innerRow][innerCol] = acc[innerRow][innerCol] + ACached * BCached[innerCol]; + acc[innerRow][innerCol] = acc[innerRow][innerCol] + + ACached * BCached[innerCol]; } } } - workgroupBarrier(); } + for (var innerRow = 0; innerRow < RowPerThread; innerRow = innerRow + 1) { + let gRow = globalRowStart + localRow + innerRow * ${workGroupSize[1]}; + for (var innerCol = 0; innerCol < ColPerThread; innerCol = innerCol + 1) { + let gCol = globalColStart + localCol + innerCol * ${workGroupSize[0]}; + mm_write(batch, gRow, gCol, acc[innerRow][innerCol]); + } + } + ` : + ` + let tileRow = i32(localId.y) * RowPerThread; + let tileCol = i32(localId.x) * ColPerThread; + + let globalRow = i32(globalId.y) * RowPerThread; + let globalCol = i32(globalId.x) * ColPerThread; + let globalRowStart = i32(workgroupId.y) * ${tileAOuter}; + + let tileRowA = i32(localId.y) * ${rowPerThreadA}; + let tileColA = i32(localId.x) * ${colPerThreadA}; + let tileRowB = i32(localId.y) * ${rowPerThreadB}; + // Loop over shared dimension. + for (var t = 0; t < numTiles; t = t + 1) { + // Load one tile of A into local memory. + for (var innerRow = 0; innerRow < ${ + rowPerThreadA}; innerRow = innerRow + 1) { + for (var innerCol = 0; innerCol < ${ + colPerThreadA}; innerCol = innerCol + 1) { + let inputRow = tileRowA + innerRow; + let inputCol = tileColA + innerCol; + ${writeDataToSubASnippet(transposeA)} + } + } + + // Load one tile of B into local memory. + for (var innerRow = 0; innerRow < ${ + rowPerThreadB}; innerRow = innerRow + 1) { + for (var innerCol = 0; innerCol < ColPerThread; innerCol = innerCol + 1) { + let inputRow = tileRowB + innerRow; + let inputCol = tileCol + innerCol; + mm_Bsub[inputRow][inputCol] = mm_readB(batch, + kStart + inputRow, + globalCol + innerCol); + } + } + kStart = kStart + TileInner; + workgroupBarrier(); + + // Compute acc values for a single thread. + var BCached : array; + for (var k = 0; k < TileInner; k = k + 1) { + for (var inner = 0; inner < ColPerThread; inner = inner + 1) { + BCached[inner] = mm_Bsub[k][tileCol + inner]; + } + + for (var innerRow = 0; innerRow < RowPerThread; innerRow = innerRow + 1) { + ${readDataFromSubASnippet(transposeA)} + for (var innerCol = 0; innerCol < ColPerThread; innerCol = innerCol + 1) { + acc[innerRow][innerCol] = acc[innerRow][innerCol] + ACached * BCached[innerCol]; + } + } + } + + workgroupBarrier(); + } + + for (var innerRow = 0; innerRow < RowPerThread; innerRow = innerRow + 1) { + for (var innerCol = 0; innerCol < ColPerThread; innerCol = innerCol + 1) { + mm_write(batch, globalRow + innerRow, globalCol + innerCol, + acc[innerRow][innerCol]); + } + } + `; + + return ` + var mm_Asub : array, ${tileAHight}>; + var mm_Bsub : array, ${tileInner}>; + const RowPerThread = ${workPerThread[1]}; + const ColPerThread = ${workPerThread[0]}; + const TileInner = ${tileInner}; + + @compute @workgroup_size(workGroupSizeX, workGroupSizeY, workGroupSizeZ) + fn _start(@builtin(local_invocation_id) LocalId : vec3, + @builtin(global_invocation_id) GlobalId : vec3, + @builtin(num_workgroups) NumWorkgroups: vec3, + @builtin(workgroup_id) workgroupId: vec3) { + localId = LocalId; + globalId = GlobalId; + numWorkgroups = NumWorkgroups; + let batch = ${splitK ? '0' : 'i32(globalId.z)'}; + let numTiles = ${ + splitK ? `${Math.ceil(splitedDimInner / tileInner)}` : + '(uniforms.dimInner - 1) / TileInner + 1'}; + var kStart = ${splitK ? `i32(globalId.z) * ${splitedDimInner}` : '0'}; + + var acc : array, RowPerThread>; + + // Without this initialization strange values show up in acc. for (var innerRow = 0; innerRow < RowPerThread; innerRow = innerRow + 1) { for (var innerCol = 0; innerCol < ColPerThread; innerCol = innerCol + 1) { - mm_write(batch, globalRow + innerRow, globalCol + innerCol, - acc[innerRow][innerCol]); + acc[innerRow][innerCol] = 0.0; } } + ${matmulSnippet} } `; } @@ -469,13 +529,15 @@ export class MatMulPackedProgram implements WebGPUProgram { tileInner: number; isVectorA: boolean; isVec4: boolean; + private sequentialAccessByThreads: boolean; constructor( aShape: [number, number, number], outputShape: [number, number, number], batchAEqualOne: boolean, batchBEqualOne: boolean, transposeA = false, transposeB = false, bias: TensorInfo = null, activation: backend_util.Activation = null, - preluActivationWeights: TensorInfo = null) { + preluActivationWeights: TensorInfo = null, + sequentialAccessByThreads = false) { this.outputShape = outputShape; this.dispatchLayout = {x: [2], y: [1], z: [0]}; const dimInner = transposeA ? aShape[1] : aShape[2]; @@ -509,6 +571,7 @@ export class MatMulPackedProgram implements WebGPUProgram { this.variableNames.push('preluActivationWeights'); } + this.sequentialAccessByThreads = sequentialAccessByThreads; this.transposeA = transposeA; this.transposeB = transposeB; this.addBias = addBias; @@ -521,7 +584,8 @@ export class MatMulPackedProgram implements WebGPUProgram { this.shaderKey = `matMulPacked_${this.elementsPerThread}_${transposeA}_${ transposeB}_${this.activation}_${this.fitAOuter}_${this.fitBOuter}_${ this.fitInner}_${this.isVec4}_${this.isVectorA}_${ - this.batchAEqualOne}_${this.batchBEqualOne}`; + this.batchAEqualOne}_${this.batchBEqualOne}_${ + this.sequentialAccessByThreads}`; } getShapeFit(dimAOuter: number, dimBOuter: number, dimInner: number): @@ -563,7 +627,8 @@ export class MatMulPackedProgram implements WebGPUProgram { this.workGroupSize, this.transposeA) : makeMatMulPackedSource( this.elementsPerThread, this.workGroupSize, - this.transposeA, this.tileInner))} + this.transposeA, this.tileInner, false, null, + this.sequentialAccessByThreads))} `; return userCode; }