Skip to content

Commit

Permalink
Add flag WEBGPU_TENSOR_FROM_BUFFER_WITH_ZERO_COPY
Browse files Browse the repository at this point in the history
  • Loading branch information
axinging committed Nov 21, 2022
1 parent 0d7a38e commit 6e0937c
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 33 deletions.
39 changes: 30 additions & 9 deletions tfjs-backend-webgpu/src/backend_webgpu.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,12 @@ type TensorData = {
shape: number[],
refCount: number,
resourceInfo?: BufferInfo|TextureInfo,
// Indicate the tensor is created from an external GPU resource.
external?: boolean,
// zeroCopy is used for creating tensor from GPUBuffer. When zeroCopy is false
// or undefined (default), this GPUBuffer will be copied to the tensor's
// resource buffer. When zeroCopy is true, tensor will use this GPUBuffer as
// tensor's resource buffer, so the user should not destroy this GPUBuffer
// until all accesses are done.
zeroCopy?: boolean,
// For complex numbers, the real and imaginary parts are stored as their own
// individual tensors, with a parent joining the two with the
// complexTensorInfos field.
Expand Down Expand Up @@ -244,8 +248,9 @@ export class WebGPUBackend extends KernelBackend {
if (!tensorData || !tensorData.resourceInfo) {
return;
}
// If tensor data is from external resource, do not release.
if (tensorData.external) {
// If tensor's resource buffer is from a zero copy GPUBuffer, do not
// release.
if (tensorData.zeroCopy) {
tensorData.resourceInfo = null;
return;
}
Expand Down Expand Up @@ -445,18 +450,31 @@ export class WebGPUBackend extends KernelBackend {
return vals;
}

// The source GPUBuffer and destination GPUBuffer have the same size and
// usage.
// Copies `size` bytes from `srcBuffer` into a freshly acquired GPUBuffer of
// the same size and usage, submits the copy to the queue, and returns the
// new buffer. The returned buffer comes from this.bufferManager, so it is
// tracked/released through the normal buffer-manager lifecycle.
private copyBuffer(srcBuffer: GPUBuffer, size: number, usage: number) {
const dstBuffer = this.bufferManager.acquireBuffer(size, usage);
// A copy command cannot be encoded while a compute pass is open.
this.ensureCommandEncoderReady();
this.ensureComputePassEnded();
this.currentCommandEncoder.copyBufferToBuffer(
srcBuffer, 0, dstBuffer, 0, size);
// Submit right away; afterwards the caller may safely destroy srcBuffer
// (the non-zero-copy tests rely on this — see backend_webgpu_test.ts).
this.submitQueue();
return dstBuffer;
}

/**
* Create a TF.js tensor out of an existing WebGPU buffer.
*/
override createTensorFromGPUData(
values: WebGPUData, shape: number[], dtype: DataType): Tensor {
const buffer = values.buffer;
let buffer = values.buffer;
if (dtype === 'complex64') {
throw new Error(`Cannot write to a complex64 dtype. `);
}
const dataId = {id: this.nextDataId()};
const zeroCopy = env().getBool('WEBGPU_TENSOR_FROM_BUFFER_WITH_ZERO_COPY');
this.tensorMap.set(
dataId, {dtype, shape, values: null, refCount: 1, external: true});
dataId, {dtype, shape, values: null, refCount: 1, zeroCopy});
const tensorData = this.tensorMap.get(dataId);
const size = webgpu_util.GPUBytesPerElement(tensorData.dtype) *
util.sizeFromShape(tensorData.shape);
Expand All @@ -469,6 +487,10 @@ export class WebGPUBackend extends KernelBackend {
throw new Error('GPUBuffer.usage should include GPUBufferUsage.STORAGE!');
}

// Do buffer copy by default.
if (zeroCopy === false) {
buffer = this.copyBuffer(buffer, size, buffer.usage);
}
tensorData.resourceInfo = {size: buffer.size, usage: buffer.usage, buffer};
return engine().makeTensorFromDataId(dataId, shape, dtype, this);
}
Expand Down Expand Up @@ -659,9 +681,8 @@ export class WebGPUBackend extends KernelBackend {
// TODO: WebGPU doesn't support read data synchronously from GPU to CPU.
// So it will report error when switching backend from WebGPU to others.
// There are two situations: 1) switching the backend after running a
// model; 2) switching the backend within the model. Temporarily keep the
// values on CPU to solve the first issue.
// tensorData.values = null;
// model; 2) switching the backend within the model. Temporarily keep
// the values on CPU to solve the first issue. tensorData.values = null;
}
}

Expand Down
63 changes: 49 additions & 14 deletions tfjs-backend-webgpu/src/backend_webgpu_test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,8 @@ describeWebGPU('keeping data on gpu ', () => {

function createReadonlyGPUBufferFromData(
device: GPUDevice, data: number[], dtype: tf.DataType,
bufferUsage = GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE) {
bufferUsage = GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE |
GPUBufferUsage.COPY_SRC) {
const bytesPerElement = 4;
const sizeInBytes = data.length * bytesPerElement;

Expand Down Expand Up @@ -427,7 +428,7 @@ function createStagingGPUBufferFromData(
}

async function testCreateTensorFromGPUBuffer(
dtype: tf.DataType, useDefaultShapeAndType = false) {
dtype: tf.DataType, useDefaultShapeAndType = false, zeroCopy = false) {
const webGPUBackend = tf.backend() as WebGPUBackend;
const device = webGPUBackend.device;
const aData = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
Expand All @@ -439,6 +440,9 @@ async function testCreateTensorFromGPUBuffer(
const startNumTensors = tf.memory().numTensors;
const a = useDefaultShapeAndType ? tf.tensor({buffer: aBuffer}) :
tf.tensor({buffer: aBuffer}, shape, dtype);
if (zeroCopy !== true) {
aBuffer.destroy();
}
const b = tf.tensor(bData, shape, dtype);
const result = tf.add(a, b);
tf.test_util.expectArraysClose(await result.data(), expected);
Expand All @@ -449,35 +453,39 @@ async function testCreateTensorFromGPUBuffer(
const endNumTensors = tf.memory().numTensors;
expect(endNumBytes - startNumBytes).toEqual(0);
expect(endNumTensors - startNumTensors).toEqual(0);
aBuffer.destroy();
if (zeroCopy === true) {
aBuffer.destroy();
}
}

describeWebGPU('create tensor from GPUBuffer', () => {
function createTensorFromGPUTest(zeroCopy = false) {
it('use default shape and data type(float32)', async () => {
await testCreateTensorFromGPUBuffer('float32', true);
await testCreateTensorFromGPUBuffer('float32', true, zeroCopy);
});

it('work for float32', async () => {
await testCreateTensorFromGPUBuffer('float32');
await testCreateTensorFromGPUBuffer('float32', false, zeroCopy);
});

it('work for int32', async () => {
await testCreateTensorFromGPUBuffer('int32');
await testCreateTensorFromGPUBuffer('int32', false, zeroCopy);
});

it('work for read', async () => {
const webGPUBackend = tf.backend() as WebGPUBackend;
const device = webGPUBackend.device;
const aData = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
const dtype = 'float32';
const aBuffer = createReadonlyGPUBufferFromData(
device, aData, dtype,
GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE |
GPUBufferUsage.COPY_SRC);
const aBuffer = createReadonlyGPUBufferFromData(device, aData, dtype);
const shape: number[] = [aData.length];
const a = tf.tensor({buffer: aBuffer}, shape, dtype);
if (zeroCopy !== true) {
aBuffer.destroy();
}
await a.data();
aBuffer.destroy();
if (zeroCopy === true) {
aBuffer.destroy();
}
});

it('two tensors share the same GPUBuffer', async () => {
Expand All @@ -491,6 +499,9 @@ describeWebGPU('create tensor from GPUBuffer', () => {
const shape: number[] = [aData.length];
const a = tf.tensor({buffer: aBuffer}, shape, dtype);
const b = tf.tensor({buffer: aBuffer}, shape, dtype);
if (zeroCopy !== true) {
aBuffer.destroy();
}
const result = tf.add(a, b);
const expected =
[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32];
Expand All @@ -502,7 +513,9 @@ describeWebGPU('create tensor from GPUBuffer', () => {
const endNumTensors = tf.memory().numTensors;
expect(endNumBytes - startNumBytes).toEqual(0);
expect(endNumTensors - startNumTensors).toEqual(0);
aBuffer.destroy();
if (zeroCopy === true) {
aBuffer.destroy();
}
});

it('GPUBuffer size is bigger than tensor size', async () => {
Expand All @@ -517,6 +530,9 @@ describeWebGPU('create tensor from GPUBuffer', () => {
const shape: number[] = [aData.length - 1];
const a = tf.tensor({buffer: aBuffer}, shape, dtype);
const b = tf.tensor({buffer: aBuffer}, shape, dtype);
if (zeroCopy !== true) {
aBuffer.destroy();
}
const result = tf.add(a, b);
const expected = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30];
tf.test_util.expectArraysClose(await result.data(), expected);
Expand All @@ -527,7 +543,9 @@ describeWebGPU('create tensor from GPUBuffer', () => {
const endNumTensors = tf.memory().numTensors;
expect(endNumBytes - startNumBytes).toEqual(0);
expect(endNumTensors - startNumTensors).toEqual(0);
aBuffer.destroy();
if (zeroCopy === true) {
aBuffer.destroy();
}
});

it('throw when GPUBuffer size is smaller than tensor size', async () => {
Expand Down Expand Up @@ -556,4 +574,21 @@ describeWebGPU('create tensor from GPUBuffer', () => {
expect(a).toThrowError();
aBuffer.destroy();
});
}

describeWebGPU('create tensor from GPUBuffer', () => {
createTensorFromGPUTest();
});

describeWebGPU('create tensor from GPUBuffer with zero copy', () => {
let savedZeroCopyFlag = false;
beforeAll(() => {
savedZeroCopyFlag =
tf.env().get('WEBGPU_TENSOR_FROM_BUFFER_WITH_ZERO_COPY') as boolean;
tf.env().set('WEBGPU_TENSOR_FROM_BUFFER_WITH_ZERO_COPY', true);
});
afterAll(() => {
tf.env().set('WEBGPU_TENSOR_FROM_BUFFER_WITH_ZERO_COPY', savedZeroCopyFlag);
});
createTensorFromGPUTest(true);
});
8 changes: 6 additions & 2 deletions tfjs-backend-webgpu/src/flags_webgpu.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,10 +76,14 @@ ENV.registerFlag('WEBGPU_USE_NAIVE_CONV2D_DEBUG', () => false);
* are dispatched, it means the hardware may be in low occupancy.
* 0 means it's not set by the user. A default strategy will be applied.
*/
ENV.registerFlag(
'WEBGPU_THRESHOLD_TO_INCREASE_WORKGROUPS_FOR_MATMUL', () => 0);
ENV.registerFlag('WEBGPU_THRESHOLD_TO_INCREASE_WORKGROUPS_FOR_MATMUL', () => 0);

/**
* Whether we will run im2col as a separate shader for convolution.
*/
ENV.registerFlag('WEBGPU_CONV_SEPARATE_IM2COL_SHADER', () => false);

/**
* Whether to use zero copy when creating a tensor from a GPUBuffer.
*/
ENV.registerFlag('WEBGPU_TENSOR_FROM_BUFFER_WITH_ZERO_COPY', () => false);
14 changes: 6 additions & 8 deletions tfjs-core/src/ops/tensor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,9 +99,7 @@ import {makeTensor} from './tensor_ops_util';
* // This makes it possible for TF.js applications to avoid GPU / CPU sync.
* // For example, if your application includes a preprocessing step on the GPU,
* // you could upload the GPU output directly to TF.js, rather than first
* // downloading the values. Unlike WebGL, to support zero copy, this GPUBuffer
* // is bound directly by the tensor. So donot destroy this GPUBuffer until all
* // access are done.
* // downloading the values.
*
* // Example for WebGPU:
* function createReadonlyGPUBufferFromData(device, data, dtype) {
Expand All @@ -128,7 +126,8 @@ import {makeTensor} from './tensor_ops_util';
* const gpuReadBuffer = device.createBuffer({
* mappedAtCreation: false,
* size: sizeInBytes,
* usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE
* usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.STORAGE |
* GPUBufferUsage.COPY_SRC
* });
*
* const copyEncoder = device.createCommandEncoder();
Expand Down Expand Up @@ -171,10 +170,9 @@ import {makeTensor} from './tensor_ops_util';
* size, zeros will be padded at the rear.). If the values is a `WebGPUData`
* object, the dtype could only be 'float32' or 'int32' and the object has to
* have: buffer, a `GPUBuffer`. The buffer must: 1. share the same `GPUDevice`
* with TFJS's WebGPU backend; 2.buffer.usage should at least support
* GPUBufferUsage.STORAGE, to support tensor.data, GPUBufferUsage.COPY_SRC is
* also required; 3. buffer.size should not be smaller than the byte size of
* tensor shape.
* with TFJS's WebGPU backend; 2. buffer.usage should at least support
* GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC; 3. buffer.size should not
* be smaller than the byte size of tensor shape.
* @param shape The shape of the tensor. Optional. If not provided,
* it is inferred from `values`.
* @param dtype The data type.
Expand Down

0 comments on commit 6e0937c

Please sign in to comment.