Skip to content

Commit

Permalink
Add flag WEBGPU_TENSOR_FROM_BUFFER_WITH_ZERO_COPY
Browse files Browse the repository at this point in the history
  • Loading branch information
axinging committed Nov 21, 2022
1 parent 0d7a38e commit 6e0937c
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 33 deletions.
39 changes: 30 additions & 9 deletions tfjs-backend-webgpu/src/backend_webgpu.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,12 @@ type TensorData = {
shape: number[],
refCount: number,
resourceInfo?: BufferInfo|TextureInfo,
// Indicate the tensor is created from an external GPU resource.
external?: boolean,
// zeroCopy is used for creating tensor from GPUBuffer. When zeroCopy is false
// or undefined (default), this GPUBuffer will be copied to the tensor's
// resource buffer. When zeroCopy is true, tensor will use this GPUBuffer as
// tensor's resource buffer, so the user should not destroy this GPUBuffer
// until all accesses are done.
zeroCopy?: boolean,
// For complex numbers, the real and imaginary parts are stored as their own
// individual tensors, with a parent joining the two with the
// complexTensorInfos field.
Expand Down Expand Up @@ -244,8 +248,9 @@ export class WebGPUBackend extends KernelBackend {
if (!tensorData || !tensorData.resourceInfo) {
return;
}
// If tensor data is from external resource, do not release.
if (tensorData.external) {
// If tensor's resource buffer is from a zero copy GPUBuffer, do not
// release.
if (tensorData.zeroCopy) {
tensorData.resourceInfo = null;
return;
}
Expand Down Expand Up @@ -445,18 +450,31 @@ export class WebGPUBackend extends KernelBackend {
return vals;
}

// The source GPUBuffer and destination GPUBuffer have the same size and
// usage.
// Copies `size` bytes from `srcBuffer` into a freshly acquired GPUBuffer of
// the same size and usage, submits the copy to the queue, and returns the
// new buffer. The returned buffer comes from this.bufferManager, so it is
// tracked/released through the normal buffer-manager lifecycle.
private copyBuffer(srcBuffer: GPUBuffer, size: number, usage: number) {
const dstBuffer = this.bufferManager.acquireBuffer(size, usage);
// A copy command cannot be encoded while a compute pass is open.
this.ensureCommandEncoderReady();
this.ensureComputePassEnded();
this.currentCommandEncoder.copyBufferToBuffer(
srcBuffer, 0, dstBuffer, 0, size);
// Submit right away; afterwards the caller may safely destroy srcBuffer
// (the non-zero-copy tests rely on this — see backend_webgpu_test.ts).
this.submitQueue();
return dstBuffer;
}

/**
* Create a TF.js tensor out of an existing WebGPU buffer.
*/
override createTensorFromGPUData(
values: WebGPUData, shape: number[], dtype: DataType): Tensor {
const buffer = values.buffer;
let buffer = values.buffer;
if (dtype === 'complex64') {
throw new Error(`Cannot write to a complex64 dtype. `);
}
const dataId = {id: this.nextDataId()};
const zeroCopy = env().getBool('WEBGPU_TENSOR_FROM_BUFFER_WITH_ZERO_COPY');
this.tensorMap.set(
dataId, {dtype, shape, values: null, refCount: 1, external: true});
dataId, {dtype, shape, values: null, refCount: 1, zeroCopy});
const tensorData = this.tensorMap.get(dataId);
const size = webgpu_util.GPUBytesPerElement(tensorData.dtype) *
util.sizeFromShape(tensorData.shape);
Expand All @@ -469,6 +487,10 @@ export class WebGPUBackend extends KernelBackend {
throw new Error('GPUBuffer.usage should include GPUBufferUsage.STORAGE!');
}

// Do buffer copy by default.
if (zeroCopy === false) {
buffer = this.copyBuffer(buffer, size, buffer.usage);
}
tensorData.resourceInfo = {size: buffer.size, usage: buffer.usage, buffer};
return engine().makeTensorFromDataId(dataId, shape, dtype, this);
}
Expand Down Expand Up @@ -659,9 +681,8 @@ export class WebGPUBackend extends KernelBackend {
// TODO: WebGPU doesn't support read data synchronously from GPU to CPU.
// So it will report error when switching backend from WebGPU to others.
// There are two situations: 1) switching the backend after running a
// model; 2) switching the backend within the model. Temporarily keep the
// values on CPU to solve the first issue.
// tensorData.values = null;
// model; 2) switching the backend within the model. Temporarily keep
// the values on CPU to solve the first issue. tensorData.values = null;
}
}

Expand Down
63 changes: 49 additions & 14 deletions tfjs-backend-webgpu/src/backend_webgpu_test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,8 @@ describeWebGPU('keeping data on gpu ', () => {

function createReadonlyGPUBufferFromData(
device: GPUDevice, data: number[], dtype: tf.DataType,
bufferUsage = GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE) {
bufferUsage = GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE |
GPUBufferUsage.COPY_SRC) {
const bytesPerElement = 4;
const sizeInBytes = data.length * bytesPerElement;

Expand Down Expand Up @@ -427,7 +428,7 @@ function createStagingGPUBufferFromData(
}

async function testCreateTensorFromGPUBuffer(
dtype: tf.DataType, useDefaultShapeAndType = false) {
dtype: tf.DataType, useDefaultShapeAndType = false, zeroCopy = false) {
const webGPUBackend = tf.backend() as WebGPUBackend;
const device = webGPUBackend.device;
const aData = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
Expand All @@ -439,6 +440,9 @@ async function testCreateTensorFromGPUBuffer(
const startNumTensors = tf.memory().numTensors;
const a = useDefaultShapeAndType ? tf.tensor({buffer: aBuffer}) :
tf.tensor({buffer: aBuffer}, shape, dtype);
if (zeroCopy !== true) {
aBuffer.destroy();
}
const b = tf.tensor(bData, shape, dtype);
const result = tf.add(a, b);
tf.test_util.expectArraysClose(await result.data(), expected);
Expand All @@ -449,35 +453,39 @@ async function testCreateTensorFromGPUBuffer(
const endNumTensors = tf.memory().numTensors;
expect(endNumBytes - startNumBytes).toEqual(0);
expect(endNumTensors - startNumTensors).toEqual(0);
aBuffer.destroy();
if (zeroCopy === true) {
aBuffer.destroy();
}
}

describeWebGPU('create tensor from GPUBuffer', () => {
function createTensorFromGPUTest(zeroCopy = false) {
it('use default shape and data type(float32)', async () => {
await testCreateTensorFromGPUBuffer('float32', true);
await testCreateTensorFromGPUBuffer('float32', true, zeroCopy);
});

it('work for float32', async () => {
await testCreateTensorFromGPUBuffer('float32');
await testCreateTensorFromGPUBuffer('float32', false, zeroCopy);
});

it('work for int32', async () => {
await testCreateTensorFromGPUBuffer('int32');
await testCreateTensorFromGPUBuffer('int32', false, zeroCopy);
});

it('work for read', async () => {
const webGPUBackend = tf.backend() as WebGPUBackend;
const device = webGPUBackend.device;
const aData = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
const dtype = 'float32';
const aBuffer = createReadonlyGPUBufferFromData(
device, aData, dtype,
GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE |
GPUBufferUsage.COPY_SRC);
const aBuffer = createReadonlyGPUBufferFromData(device, aData, dtype);
const shape: number[] = [aData.length];
const a = tf.tensor({buffer: aBuffer}, shape, dtype);
if (zeroCopy !== true) {
aBuffer.destroy();
}
await a.data();
aBuffer.destroy();
if (zeroCopy === true) {
aBuffer.destroy();
}
});

it('two tensors share the same GPUBuffer', async () => {
Expand All @@ -491,6 +499,9 @@ describeWebGPU('create tensor from GPUBuffer', () => {
const shape: number[] = [aData.length];
const a = tf.tensor({buffer: aBuffer}, shape, dtype);
const b = tf.tensor({buffer: aBuffer}, shape, dtype);
if (zeroCopy !== true) {
aBuffer.destroy();
}
const result = tf.add(a, b);
const expected =
[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32];
Expand All @@ -502,7 +513,9 @@ describeWebGPU('create tensor from GPUBuffer', () => {
const endNumTensors = tf.memory().numTensors;
expect(endNumBytes - startNumBytes).toEqual(0);
expect(endNumTensors - startNumTensors).toEqual(0);
aBuffer.destroy();
if (zeroCopy === true) {
aBuffer.destroy();
}
});

it('GPUBuffer size is bigger than tensor size', async () => {
Expand All @@ -517,6 +530,9 @@ describeWebGPU('create tensor from GPUBuffer', () => {
const shape: number[] = [aData.length - 1];
const a = tf.tensor({buffer: aBuffer}, shape, dtype);
const b = tf.tensor({buffer: aBuffer}, shape, dtype);
if (zeroCopy !== true) {
aBuffer.destroy();
}
const result = tf.add(a, b);
const expected = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30];
tf.test_util.expectArraysClose(await result.data(), expected);
Expand All @@ -527,7 +543,9 @@ describeWebGPU('create tensor from GPUBuffer', () => {
const endNumTensors = tf.memory().numTensors;
expect(endNumBytes - startNumBytes).toEqual(0);
expect(endNumTensors - startNumTensors).toEqual(0);
aBuffer.destroy();
if (zeroCopy === true) {
aBuffer.destroy();
}
});

it('throw when GPUBuffer size is smaller than tensor size', async () => {
Expand Down Expand Up @@ -556,4 +574,21 @@ describeWebGPU('create tensor from GPUBuffer', () => {
expect(a).toThrowError();
aBuffer.destroy();
});
}

describeWebGPU('create tensor from GPUBuffer', () => {
createTensorFromGPUTest();
});

describeWebGPU('create tensor from GPUBuffer with zero copy', () => {
let savedZeroCopyFlag = false;
beforeAll(() => {
savedZeroCopyFlag =
tf.env().get('WEBGPU_TENSOR_FROM_BUFFER_WITH_ZERO_COPY') as boolean;
tf.env().set('WEBGPU_TENSOR_FROM_BUFFER_WITH_ZERO_COPY', true);
});
afterAll(() => {
tf.env().set('WEBGPU_TENSOR_FROM_BUFFER_WITH_ZERO_COPY', savedZeroCopyFlag);
});
createTensorFromGPUTest(true);
});
8 changes: 6 additions & 2 deletions tfjs-backend-webgpu/src/flags_webgpu.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,14 @@ ENV.registerFlag('WEBGPU_USE_NAIVE_CONV2D_DEBUG', () => false);
* are dispatched, it means the hardware may be in low occupancy.
* 0 means it's not set by the user. A default strategy will be applied.
*/
ENV.registerFlag(
'WEBGPU_THRESHOLD_TO_INCREASE_WORKGROUPS_FOR_MATMUL', () => 0);
ENV.registerFlag('WEBGPU_THRESHOLD_TO_INCREASE_WORKGROUPS_FOR_MATMUL', () => 0);

/**
* Whether we will run im2col as a separate shader for convolution.
*/
ENV.registerFlag('WEBGPU_CONV_SEPARATE_IM2COL_SHADER', () => false);

/**
* Whether to use zero copy when creating a tensor from a GPUBuffer.
*/
ENV.registerFlag('WEBGPU_TENSOR_FROM_BUFFER_WITH_ZERO_COPY', () => false);
14 changes: 6 additions & 8 deletions tfjs-core/src/ops/tensor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,7 @@ import {makeTensor} from './tensor_ops_util';
* // This makes it possible for TF.js applications to avoid GPU / CPU sync.
* // For example, if your application includes a preprocessing step on the GPU,
* // you could upload the GPU output directly to TF.js, rather than first
* // downloading the values. Unlike WebGL, to support zero copy, this GPUBuffer
* // is bound directly by the tensor. So donot destroy this GPUBuffer until all
* // access are done.
* // downloading the values.
*
* // Example for WebGPU:
* function createReadonlyGPUBufferFromData(device, data, dtype) {
Expand All @@ -128,7 +126,8 @@ import {makeTensor} from './tensor_ops_util';
* const gpuReadBuffer = device.createBuffer({
* mappedAtCreation: false,
* size: sizeInBytes,
* usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE
* usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE |
* GPUBufferUsage.COPY_SRC
* });
*
* const copyEncoder = device.createCommandEncoder();
Expand Down Expand Up @@ -171,10 +170,9 @@ import {makeTensor} from './tensor_ops_util';
* size, zeros will be padded at the rear.). If the values is a `WebGPUData`
* object, the dtype could only be 'float32' or 'int32' and the object has to
* have: buffer, a `GPUBuffer`. The buffer must: 1. share the same `GPUDevice`
* with TFJS's WebGPU backend; 2.buffer.usage should at least support
* GPUBufferUsage.STORAGE, to support tensor.data, GPUBufferUsage.COPY_SRC is
* also required; 3. buffer.size should not be smaller than the byte size of
* tensor shape.
* with TFJS's WebGPU backend; 2. buffer.usage should at least support
* GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC; 3. buffer.size should not
* be smaller than the byte size of tensor shape.
* @param shape The shape of the tensor. Optional. If not provided,
* it is inferred from `values`.
* @param dtype The data type.
Expand Down

0 comments on commit 6e0937c

Please sign in to comment.