From 52a8c1cae8cfd82f952dd27707f61b3b980bb843 Mon Sep 17 00:00:00 2001 From: Enrico Galli Date: Fri, 27 Sep 2024 17:24:21 -0700 Subject: [PATCH] [WebNN EP] Enable IO Bindings with MLTensor (#21301) ### Description Enables using the MLTensor to pass data between models. ### Motivation and Context Using MLTensor instead of ArrayBuffers reduces the number of copies between the CPU and devices as well as the renderer and GPU process in Chromium. --- .../onnxruntime/core/framework/allocator.h | 1 + js/common/lib/tensor-factory-impl.ts | 12 + js/common/lib/tensor-factory.ts | 46 +++ js/common/lib/tensor-impl.ts | 59 +++- js/common/lib/tensor-utils-impl.ts | 8 + js/common/lib/tensor.ts | 30 +- js/web/lib/wasm/jsep/backend-webnn.ts | 169 ++++++++++ js/web/lib/wasm/jsep/init.ts | 19 +- js/web/lib/wasm/jsep/webnn/tensor-manager.ts | 303 ++++++++++++++++++ js/web/lib/wasm/jsep/webnn/webnn.d.ts | 43 ++- js/web/lib/wasm/proxy-messages.ts | 10 +- js/web/lib/wasm/session-handler-inference.ts | 12 +- js/web/lib/wasm/wasm-common.ts | 18 +- js/web/lib/wasm/wasm-core-impl.ts | 62 +++- js/web/lib/wasm/wasm-types.ts | 87 ++++- js/web/script/test-runner-cli-args.ts | 6 +- js/web/script/test-runner-cli.ts | 2 +- js/web/test/test-runner.ts | 107 ++++++- js/web/test/test-types.ts | 6 +- onnxruntime/core/framework/allocator.cc | 3 +- onnxruntime/core/providers/webnn/allocator.cc | 41 +++ onnxruntime/core/providers/webnn/allocator.h | 32 ++ .../core/providers/webnn/builders/helper.cc | 18 ++ .../core/providers/webnn/builders/helper.h | 4 + .../core/providers/webnn/builders/model.cc | 53 ++- .../core/providers/webnn/builders/model.h | 10 +- .../providers/webnn/builders/model_builder.cc | 2 +- .../core/providers/webnn/data_transfer.cc | 44 +++ .../core/providers/webnn/data_transfer.h | 21 ++ .../webnn/webnn_execution_provider.cc | 71 +++- .../webnn/webnn_execution_provider.h | 2 + onnxruntime/wasm/api.cc | 26 +- onnxruntime/wasm/pre-jsep.js | 33 ++ 33 files changed, 1287 insertions(+), 73 deletions(-) create mode 100644 js/web/lib/wasm/jsep/backend-webnn.ts create mode 100644 js/web/lib/wasm/jsep/webnn/tensor-manager.ts create mode 100644 onnxruntime/core/providers/webnn/allocator.cc create mode 100644 onnxruntime/core/providers/webnn/allocator.h create mode 100644 onnxruntime/core/providers/webnn/data_transfer.cc create mode 100644 onnxruntime/core/providers/webnn/data_transfer.h diff --git a/include/onnxruntime/core/framework/allocator.h b/include/onnxruntime/core/framework/allocator.h index abab118efd04f..57b332ce65b93 100644 --- a/include/onnxruntime/core/framework/allocator.h +++ b/include/onnxruntime/core/framework/allocator.h @@ -53,6 +53,7 @@ constexpr const char* OpenVINO_GPU = "OpenVINO_GPU"; constexpr const char* OpenVINO_RT = "OpenVINO_RT"; constexpr const char* OpenVINO_RT_NPU = "OpenVINO_RT_NPU"; constexpr const char* WEBGPU_BUFFER = "WebGPU_Buffer"; +constexpr const char* WEBNN_TENSOR = "WebNN_Tensor"; constexpr size_t kAllocAlignment = 256; diff --git a/js/common/lib/tensor-factory-impl.ts b/js/common/lib/tensor-factory-impl.ts index 5eb7ba4793b32..cbc0270091818 100644 --- a/js/common/lib/tensor-factory-impl.ts +++ b/js/common/lib/tensor-factory-impl.ts @@ -11,6 +11,7 @@ import { TensorFromImageBitmapOptions, TensorFromImageDataOptions, TensorFromImageElementOptions, + TensorFromMLTensorOptions, TensorFromTextureOptions, TensorFromUrlOptions, } from './tensor-factory.js'; @@ -310,6 +311,17 @@ export const tensorFromGpuBuffer = ( + mlTensor: TensorInterface.MLTensorType, + options: 
TensorFromMLTensorOptions, +): Tensor => { + const { dataType, dims, download, dispose } = options; + return new Tensor({ location: 'ml-tensor', type: dataType ?? 'float32', mlTensor, dims, download, dispose }); +}; + /** * implementation of Tensor.fromPinnedBuffer(). */ diff --git a/js/common/lib/tensor-factory.ts b/js/common/lib/tensor-factory.ts index 7938b4a4eb927..f66684112623e 100644 --- a/js/common/lib/tensor-factory.ts +++ b/js/common/lib/tensor-factory.ts @@ -86,6 +86,20 @@ export interface GpuBufferConstructorParameters + extends CommonConstructorParameters, + GpuResourceConstructorParameters { + /** + * Specify the location of the data to be 'ml-tensor'. + */ + readonly location: 'ml-tensor'; + + /** + * Specify the WebNN MLTensor that holds the tensor data. + */ + readonly mlTensor: Tensor.MLTensorType; +} + // #endregion // the following region contains type definitions of each individual options. @@ -219,6 +233,15 @@ export interface TensorFromGpuBufferOptions dataType?: T; } +export interface TensorFromMLTensorOptions + extends Pick, + GpuResourceConstructorParameters { + /** + * Describes the data type of the tensor. + */ + dataType?: T; +} + // #endregion /** @@ -336,6 +359,29 @@ export interface TensorFactory { options: TensorFromGpuBufferOptions, ): TypedTensor; + /** + * create a tensor from a WebNN MLTensor + * + * @param tensor - the MLTensor object to create tensor from + * @param options - An optional object representing options for creating tensor from a WebNN MLTensor. + * + * The options include following properties: + * - `dataType`: the data type of the tensor. If omitted, assume 'float32'. + * - `dims`: the dimension of the tensor. Required. + * - `download`: an optional function to download the tensor data from the MLTensor to CPU. If omitted, the MLTensor + * data will not be able to download. Usually, this is provided by the WebNN backend for the inference outputs. + * Users don't need to provide this function. + * - `dispose`: an optional function to dispose the tensor data on the WebNN MLTensor. If omitted, the MLTensor will + * not be disposed. Usually, this is provided by the WebNN backend for the inference outputs. Users don't need to + * provide this function. + * + * @returns a tensor object + */ + fromMLTensor( + tensor: Tensor.MLTensorType, + options: TensorFromMLTensorOptions, + ): TypedTensor; + /** * create a tensor from a pre-allocated buffer. The buffer will be used as a pinned buffer. 
* diff --git a/js/common/lib/tensor-impl.ts b/js/common/lib/tensor-impl.ts index 342f5e3a467eb..c0e1582c17de5 100644 --- a/js/common/lib/tensor-impl.ts +++ b/js/common/lib/tensor-impl.ts @@ -6,16 +6,19 @@ import { TensorToDataUrlOptions, TensorToImageDataOptions } from './tensor-conve import { tensorFromGpuBuffer, tensorFromImage, + tensorFromMLTensor, tensorFromPinnedBuffer, tensorFromTexture, } from './tensor-factory-impl.js'; import { CpuPinnedConstructorParameters, GpuBufferConstructorParameters, + MLTensorConstructorParameters, TensorFromGpuBufferOptions, TensorFromImageBitmapOptions, TensorFromImageDataOptions, TensorFromImageElementOptions, + TensorFromMLTensorOptions, TensorFromTextureOptions, TensorFromUrlOptions, TextureConstructorParameters, @@ -37,6 +40,7 @@ type TensorDataType = TensorInterface.DataType; type TensorDataLocation = TensorInterface.DataLocation; type TensorTextureType = TensorInterface.TextureType; type TensorGpuBufferType = TensorInterface.GpuBufferType; +type TensorMLTensorType = TensorInterface.MLTensorType; /** * the implementation of Tensor interface. @@ -86,6 +90,15 @@ export class Tensor implements TensorInterface { */ constructor(params: GpuBufferConstructorParameters); + /** + * Construct a new tensor object from the WebNN MLTensor with the given type and dims. + * + * Tensor's location will be set to 'ml-tensor'. + * + * @param params - Specify the parameters to construct the tensor. + */ + constructor(params: MLTensorConstructorParameters); + /** * implementation. */ @@ -98,7 +111,8 @@ export class Tensor implements TensorInterface { | readonly boolean[] | CpuPinnedConstructorParameters | TextureConstructorParameters - | GpuBufferConstructorParameters, + | GpuBufferConstructorParameters + | MLTensorConstructorParameters, arg1?: TensorDataType | Uint8ClampedArray | readonly number[] | readonly string[] | readonly boolean[], arg2?: readonly number[], ) { @@ -155,6 +169,25 @@ export class Tensor implements TensorInterface { this.disposer = arg0.dispose; break; } + case 'ml-tensor': { + if ( + type !== 'float32' && + type !== 'float16' && + type !== 'int32' && + type !== 'int64' && + type !== 'uint32' && + type !== 'uint64' && + type !== 'int8' && + type !== 'uint8' && + type !== 'bool' + ) { + throw new TypeError(`unsupported type "${type}" to create tensor from MLTensor`); + } + this.mlTensorData = arg0.mlTensor; + this.downloader = arg0.download; + this.disposer = arg0.dispose; + break; + } default: throw new Error(`Tensor constructor: unsupported location '${this.dataLocation}'`); } @@ -325,6 +358,13 @@ export class Tensor implements TensorInterface { return tensorFromGpuBuffer(gpuBuffer, options); } + static fromMLTensor( + mlTensor: TensorMLTensorType, + options: TensorFromMLTensorOptions, + ): TensorInterface { + return tensorFromMLTensor(mlTensor, options); + } + static fromPinnedBuffer( type: T, buffer: TensorInterface.DataTypeMap[T], @@ -373,6 +413,11 @@ export class Tensor implements TensorInterface { */ private gpuBufferData?: TensorGpuBufferType; + /** + * stores the underlying WebNN MLTensor when location is 'ml-tensor'. otherwise empty. + */ + private mlTensorData?: TensorMLTensorType; + /** * stores an optional downloader function to download data from GPU to CPU. 
*/ @@ -420,6 +465,14 @@ export class Tensor implements TensorInterface { } return this.gpuBufferData; } + + get mlTensor(): TensorMLTensorType { + this.ensureValid(); + if (!this.mlTensorData) { + throw new Error('The data is not stored as a WebNN MLTensor.'); + } + return this.mlTensorData; + } // #endregion // #region methods @@ -431,7 +484,8 @@ export class Tensor implements TensorInterface { case 'cpu-pinned': return this.data; case 'texture': - case 'gpu-buffer': { + case 'gpu-buffer': + case 'ml-tensor': { if (!this.downloader) { throw new Error('The current tensor is not created with a specified data downloader.'); } @@ -472,6 +526,7 @@ export class Tensor implements TensorInterface { this.cpuData = undefined; this.gpuTextureData = undefined; this.gpuBufferData = undefined; + this.mlTensorData = undefined; this.downloader = undefined; this.isDownloading = undefined; diff --git a/js/common/lib/tensor-utils-impl.ts b/js/common/lib/tensor-utils-impl.ts index 9c633cd95fac3..97b1735e6eac5 100644 --- a/js/common/lib/tensor-utils-impl.ts +++ b/js/common/lib/tensor-utils-impl.ts @@ -4,6 +4,7 @@ import { CpuPinnedConstructorParameters, GpuBufferConstructorParameters, + MLTensorConstructorParameters, TextureConstructorParameters, } from './tensor-factory.js'; import { Tensor } from './tensor-impl.js'; @@ -56,6 +57,13 @@ export const tensorReshape = (tensor: Tensor, dims: readonly number[]): Tensor = type: tensor.type as GpuBufferConstructorParameters['type'], dims, }); + case 'ml-tensor': + return new Tensor({ + location: 'ml-tensor', + mlTensor: tensor.mlTensor, + type: tensor.type as MLTensorConstructorParameters['type'], + dims, + }); default: throw new Error(`tensorReshape: tensor location ${tensor.location} is not supported`); } diff --git a/js/common/lib/tensor.ts b/js/common/lib/tensor.ts index 8a1197994393b..17e2f4d37c91f 100644 --- a/js/common/lib/tensor.ts +++ b/js/common/lib/tensor.ts @@ -42,6 +42,13 @@ interface TypedTensorBase { */ readonly gpuBuffer: Tensor.GpuBufferType; + /** + * Get the WebNN MLTensor that holds the tensor data. + * + * If the data is not in a WebNN MLTensor, throw error. + */ + readonly mlTensor: Tensor.MLTensorType; + /** * Get the buffer data of the tensor. * @@ -136,15 +143,36 @@ export declare namespace Tensor { */ export type GpuBufferType = { size: number; mapState: 'unmapped' | 'pending' | 'mapped' }; + /** + * type alias for WebNN MLTensor + * + * The specification for WebNN's MLTensor is currently in flux. + */ + export type MLTensorType = unknown; + /** * supported data types for constructing a tensor from a WebGPU buffer */ export type GpuBufferDataTypes = 'float32' | 'float16' | 'int32' | 'int64' | 'uint32' | 'uint8' | 'bool'; + /** + * supported data types for constructing a tensor from a WebNN MLTensor + */ + export type MLTensorDataTypes = + | 'float32' + | 'float16' + | 'int8' + | 'uint8' + | 'int32' + | 'uint32' + | 'int64' + | 'uint64' + | 'bool'; + /** * represent where the tensor data is stored */ - export type DataLocation = 'none' | 'cpu' | 'cpu-pinned' | 'texture' | 'gpu-buffer'; + export type DataLocation = 'none' | 'cpu' | 'cpu-pinned' | 'texture' | 'gpu-buffer' | 'ml-tensor'; /** * represent the data type of a tensor diff --git a/js/web/lib/wasm/jsep/backend-webnn.ts b/js/web/lib/wasm/jsep/backend-webnn.ts new file mode 100644 index 0000000000000..685f3dc019461 --- /dev/null +++ b/js/web/lib/wasm/jsep/backend-webnn.ts @@ -0,0 +1,169 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +// WebNN API currently does not have a TypeScript definition file. This file is a workaround with types generated from +// WebNN API specification. +// https://github.com/webmachinelearning/webnn/issues/677 +/// + +import { Env, Tensor } from 'onnxruntime-common'; + +import { DataType } from '../wasm-common'; +import { getInstance } from '../wasm-factory'; + +import { createView } from './tensor-view'; +import { TensorId, createTensorManager } from './webnn/tensor-manager'; +import { configureLogger, LOG_DEBUG } from './log'; + +/* + * TensorProto::data_type to WebNN OperandType mapping. + */ +const onnxDataTypeToWebnnDataType = new Map([ + [DataType.float, 'float32'], + [DataType.float16, 'float16'], + [DataType.int32, 'int32'], + [DataType.uint32, 'uint32'], + [DataType.int64, 'int64'], + [DataType.uint64, 'uint64'], + [DataType.int8, 'int8'], + [DataType.uint8, 'uint8'], + [DataType.bool, 'uint8'], +]); + +/** + * WebNN backend implementation. This class is used to keep track of the MLTensors created by the backend and keep track + * of the current MLContext being used by the sessions. + */ +export class WebNNBackend { + /** + * Tensor managers for each session. + */ + private tensorManager = createTensorManager(this); + /** + * Maps from session id to MLContexts. + */ + private mlContextBySessionId = new Map(); + /** + * Maps from MLContext to session ids. + */ + private sessionIdsByMLContext = new Map>(); + /** + * Current session id. + */ + private activeSessionId?: number; + + constructor(env: Env) { + configureLogger(env.logLevel!, !!env.debug); + } + + public get currentSessionId(): number { + if (this.activeSessionId === undefined) { + throw new Error('No active session'); + } + return this.activeSessionId; + } + + public onRunStart(sessionId: number): void { + this.activeSessionId = sessionId; + } + + public get currentContext(): MLContext { + const mlContext = this.getMLContext(this.currentSessionId); + if (!mlContext) { + throw new Error(`No MLContext found for session ${this.currentSessionId}`); + } + return mlContext; + } + + public registerMLContext(sessionId: number, mlContext: MLContext): void { + this.mlContextBySessionId.set(sessionId, mlContext); + let sessionIds = this.sessionIdsByMLContext.get(mlContext); + if (!sessionIds) { + sessionIds = new Set(); + this.sessionIdsByMLContext.set(mlContext, sessionIds); + } + sessionIds.add(sessionId); + } + + public onReleaseSession(sessionId: number): void { + const mlContext = this.mlContextBySessionId.get(sessionId)!; + if (!mlContext) { + // Current session is not a WebNN session. 
+ return; + } + this.mlContextBySessionId.delete(sessionId); + const sessionIds = this.sessionIdsByMLContext.get(mlContext)!; + sessionIds.delete(sessionId); + if (sessionIds.size === 0) { + this.sessionIdsByMLContext.delete(mlContext); + this.tensorManager.releaseTensorsForContext(mlContext); + } + } + + public getMLContext(sessionId: number): MLContext | undefined { + return this.mlContextBySessionId.get(sessionId); + } + + public reserveTensorId(): TensorId { + return this.tensorManager.reserveTensorId(); + } + + public releaseTensorId(tensorId: TensorId): void { + LOG_DEBUG('verbose', () => `[WebNN] releaseTensorId {tensorId: ${tensorId}}`); + this.tensorManager.releaseTensorId(tensorId); + } + + public async ensureTensor( + tensorId: TensorId, + onnxDataType: DataType, + dimensions: number[], + copyOld: boolean, + ): Promise { + const webnnDataType = onnxDataTypeToWebnnDataType.get(onnxDataType); + if (!webnnDataType) { + throw new Error(`Unsupported ONNX data type: ${onnxDataType}`); + } + return this.tensorManager.ensureTensor(tensorId, webnnDataType, dimensions, copyOld); + } + + public uploadTensor(tensorId: TensorId, data: Uint8Array): void { + const wasm = getInstance(); + if (!wasm.shouldTransferToMLTensor) { + throw new Error('Trying to upload to a MLTensor while shouldTransferToMLTensor is false'); + } + LOG_DEBUG('verbose', () => `[WebNN] uploadTensor {tensorId: ${tensorId}, data: ${data.byteLength}}`); + this.tensorManager.upload(tensorId, data); + } + + public async downloadTensor(tensorId: TensorId, dstBuffer: ArrayBufferView | ArrayBuffer): Promise { + return this.tensorManager.download(tensorId, dstBuffer); + } + + public createMLTensorDownloader(tensorId: TensorId, type: Tensor.MLTensorDataTypes): () => Promise { + return async () => { + const data = await this.tensorManager.download(tensorId); + return createView(data, type); + }; + } + + public registerMLTensor(tensor: MLTensor, onnxDataType: DataType, dimensions: number[]): TensorId { + const webnnDataType = onnxDataTypeToWebnnDataType.get(onnxDataType); + if (!webnnDataType) { + throw new Error(`Unsupported ONNX data type: ${onnxDataType}`); + } + + const id = this.tensorManager.registerTensor(this.currentContext, tensor, webnnDataType, dimensions); + LOG_DEBUG( + 'verbose', + () => + `[WebNN] registerMLTensor {tensor: ${tensor}, dataType: ${webnnDataType}, dimensions: ${ + dimensions + }} -> {tensorId: ${id}}`, + ); + return id; + } + + public flush(): void { + // Unlike the WebGPU backend, the WebNN backend does not need to flush any pending operations. 
+ } +} diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 2f0e5da2b3f27..7bce5ff9390e8 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -12,6 +12,7 @@ import { LOG_DEBUG } from './log'; import { TensorView } from './tensor-view'; import { ShapeUtil } from './util'; import { AdapterInfo, ComputeContext, ComputeContextInputsOutputsMapping, ProgramInfo } from './webgpu/types'; +import { WebNNBackend } from './backend-webnn'; /* eslint-disable no-bitwise */ @@ -266,6 +267,22 @@ export const init = async ( () => backend.replay(), ]); } else { - jsepInit('webnn'); + const backend = new WebNNBackend(env); + jsepInit('webnn', [ + backend, + // jsepReserveTensorId + () => backend.reserveTensorId(), + // jsepReleaseTensorId, + (tensorId: number) => backend.releaseTensorId(tensorId), + // jsepEnsureTensor + async (tensorId: number, onnxDataType: number, shape: number[], copyOld) => + backend.ensureTensor(tensorId, onnxDataType, shape, copyOld), + // jsepUploadTensor + (tensorId: number, data: Uint8Array) => { + backend.uploadTensor(tensorId, data); + }, + // jsepDownloadTensor + async (tensorId: number, dstBuffer: ArrayBufferView | ArrayBuffer) => backend.downloadTensor(tensorId, dstBuffer), + ]); } }; diff --git a/js/web/lib/wasm/jsep/webnn/tensor-manager.ts b/js/web/lib/wasm/jsep/webnn/tensor-manager.ts new file mode 100644 index 0000000000000..9475de019ed1d --- /dev/null +++ b/js/web/lib/wasm/jsep/webnn/tensor-manager.ts @@ -0,0 +1,303 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +import { WebNNBackend } from '../backend-webnn'; +import { LOG_DEBUG } from '../log'; + +// WebNN API currently does not have a TypeScript definition file. This file is a workaround with types generated from +// WebNN API specification. +// https://github.com/webmachinelearning/webnn/issues/677 +/// + +export type TensorId = number; + +/** + * Manages TensorId to MLTensor mapping. + */ +export interface TensorManager { + /** + * Reserve a new TensorId. + */ + reserveTensorId(): TensorId; + /** + * Release a TensorId. + */ + releaseTensorId(tensorId: TensorId): void; + /** + * Ensure a MLTensor is created for the TensorId. + */ + ensureTensor( + tensorId: TensorId, + dataType: MLOperandDataType, + shape: readonly number[], + copyOld: boolean, + ): Promise; + /** + * Upload data to a MLTensor. + */ + upload(tensorId: TensorId, data: Uint8Array): void; + /** + * Download data from a MLTensor. + */ + download(tensorId: TensorId): Promise; + download(tensorId: TensorId, dstTensor: ArrayBufferView | ArrayBuffer): Promise; + /** + * Release all tensors for a MLContext. + */ + releaseTensorsForContext(mlContext: MLContext): void; + /** + * Register an externally created MLTensor with a given MLContext and return a TensorId. + */ + registerTensor(mlContext: MLContext, mlTensor: MLTensor, dataType: MLOperandDataType, shape: number[]): TensorId; +} + +let tensorGuid = 1; +const createNewTensorId = (): TensorId => tensorGuid++; + +export type MLTensorEntry = [MLTensor, MLOperandDataType, readonly number[]]; + +/** + * TensorTracker tracks the MLTensor and pending upload data. + * + * We need to track the MLTensor and pending upload data because we delay the creation of MLTensor until + * we know the data type and shape. This is because future implementations of WebNN will only support creating + * MLTensors with dataTypes and shape. 
+ */ +class TensorTracker { + private tensorEntry?: MLTensorEntry; + private activeUpload?: Uint8Array; + private tensorCache: MLTensorEntry[]; + + constructor( + private mlContext?: MLContext, + tensorEntry?: MLTensorEntry, + ) { + this.tensorEntry = tensorEntry; + this.tensorCache = tensorEntry ? [tensorEntry] : []; + } + + public get tensor(): MLTensor | undefined { + return this.tensorEntry?.[0]; + } + + public get context(): MLContext { + if (!this.mlContext) { + throw new Error('MLContext has not been set.'); + } + return this.mlContext; + } + + public set context(mlContext: MLContext) { + if (this.mlContext && this.mlContext !== mlContext) { + throw new Error('MLTensor in use in a different MLContext.'); + } + this.mlContext = mlContext; + } + + public destroy(): void { + for (const [mlTensor] of this.tensorCache) { + mlTensor.destroy(); + } + this.tensorCache = []; + this.tensorEntry = undefined; + } + + public trySelectTensor(context: MLContext, tryMLTensor: MLTensor): boolean { + for (const [mlTensor, dataType, shape] of this.tensorCache) { + if (tryMLTensor === mlTensor) { + if (this.context !== context) { + throw new Error('MLTensor cannot be registered with a different MLContext.'); + } + this.tensorEntry = [mlTensor, dataType, shape]; + return true; + } + } + return false; + } + + public async ensureTensor( + dataType: MLOperandDataType, + shape: readonly number[], + copyOld: boolean, + ): Promise { + if (this.tensorEntry) { + const [mlTensor, existingDataType, existingShape] = this.tensorEntry; + if (existingDataType === dataType && existingShape.every((v, i) => v === shape[i])) { + return mlTensor; + } + } + + for (const [mlTensor, existingDataType, existingShape] of this.tensorCache) { + if (existingDataType === dataType && existingShape.every((v, i) => v === shape[i])) { + if (copyOld && this.tensorEntry) { + // WebNN does not support copyTensorToTensor, so we need to read and write the tensors. + LOG_DEBUG( + 'verbose', + () => `[WebNN] Slowdown may occur, having to copy existing tensor {dataType: ${dataType}, shape: ${shape}}`, + ); + const data = await this.context.readTensor(this.tensorEntry[0]); + this.context.writeTensor(mlTensor, data); + } + this.tensorEntry = [mlTensor, existingDataType, existingShape]; + return mlTensor; + } + } + LOG_DEBUG('verbose', () => `[WebNN] MLContext.createTensor {dataType: ${dataType}, shape: ${shape}}`); + // eslint-disable-next-line no-bitwise + const usage = MLTensorUsage.READ | MLTensorUsage.WRITE; + const tensor = await this.context.createTensor({ + dataType, + shape, + // Assign both shape and dimensions while transitioning to new API. 
+ dimensions: shape, + usage, + }); + this.tensorEntry = [tensor, dataType, shape]; + this.tensorCache.push(this.tensorEntry); + + if (this.activeUpload) { + this.mlContext?.writeTensor(tensor, this.activeUpload); + this.activeUpload = undefined; + } + + return tensor; + } + + public upload(data: Uint8Array): void { + if (!this.tensorEntry) { + this.activeUpload = new Uint8Array(data); + return; + } + this.mlContext?.writeTensor(this.tensorEntry[0], data); + } + + public async download(dstBuffer?: ArrayBufferView | ArrayBuffer): Promise { + if (this.activeUpload) { + if (dstBuffer) { + if (dstBuffer instanceof ArrayBuffer) { + new Uint8Array(dstBuffer).set(this.activeUpload); + } else { + new Uint8Array(dstBuffer.buffer, dstBuffer.byteOffset, dstBuffer.byteLength).set(this.activeUpload); + } + + return; + } else { + return this.activeUpload.buffer; + } + } + if (!this.tensorEntry) { + throw new Error('Tensor has not been created.'); + } + if (dstBuffer) { + return this.context.readTensor(this.tensorEntry[0], dstBuffer); + } + return this.context.readTensor(this.tensorEntry[0]); + } +} + +class TensorManagerImpl implements TensorManager { + private tensorsById = new Map(); + private tensorIdsByContext = new Map>(); + + constructor(private backend: WebNNBackend) {} + + public reserveTensorId(): TensorId { + const tensorId = createNewTensorId(); + this.tensorsById.set(tensorId, new TensorTracker()); + return tensorId; + } + + public releaseTensorId(tensorId: TensorId): void { + const tensorTracker = this.tensorsById.get(tensorId); + if (!tensorTracker) { + return; + } + tensorTracker.destroy(); + this.tensorsById.delete(tensorId); + for (const [mlContext, tensors] of this.tensorIdsByContext) { + if (tensors.has(tensorId)) { + tensors.delete(tensorId); + if (tensors.size === 0) { + this.tensorIdsByContext.delete(mlContext); + } + break; + } + } + } + + public async ensureTensor( + tensorId: TensorId, + dataType: MLOperandDataType, + shape: number[], + copyOld: boolean, + ): Promise { + LOG_DEBUG( + 'verbose', + () => + `[WebNN] TensorManager.ensureTensor {tensorId: ${tensorId}, dataType: ${ + dataType + }, shape: ${shape}, copyOld: ${copyOld}}`, + ); + const tensor = this.tensorsById.get(tensorId); + if (!tensor) { + throw new Error('Tensor not found.'); + } + tensor.context = this.backend.currentContext; + if (!this.tensorIdsByContext.has(this.backend.currentContext)) { + this.tensorIdsByContext.set(this.backend.currentContext, new Set()); + } + this.tensorIdsByContext.get(this.backend.currentContext)?.add(tensorId); + return tensor.ensureTensor(dataType, shape, copyOld); + } + + public upload(tensorId: TensorId, data: Uint8Array): void { + this.tensorsById.get(tensorId)!.upload(data); + } + + public async download(tensorId: TensorId): Promise; + public async download(tensorId: TensorId, dstBuffer: ArrayBufferView | ArrayBuffer): Promise; + async download(tensorId: TensorId, dstBuffer?: ArrayBufferView | ArrayBuffer): Promise { + LOG_DEBUG( + 'verbose', + () => `[WebNN] TensorManager.download {tensorId: ${tensorId}, dstBuffer: ${dstBuffer?.byteLength}}`, + ); + return this.tensorsById.get(tensorId)!.download(dstBuffer); + } + + public releaseTensorsForContext(mlContext: MLContext): void { + const tensors = this.tensorIdsByContext.get(mlContext); + if (!tensors) { + return; + } + for (const tensorId of tensors) { + this.tensorsById.get(tensorId)!.destroy(); + this.tensorsById.delete(tensorId); + } + this.tensorIdsByContext.delete(mlContext); + } + + public registerTensor( + mlContext: 
MLContext, + mlTensor: MLTensor, + dataType: MLOperandDataType, + shape: readonly number[], + ): TensorId { + for (const [tensorId, tensorTracker] of this.tensorsById) { + if (tensorTracker.trySelectTensor(mlContext, mlTensor)) { + return tensorId; + } + } + const tensorId = createNewTensorId(); + this.tensorsById.set(tensorId, new TensorTracker(mlContext, [mlTensor, dataType, shape])); + let tensors = this.tensorIdsByContext.get(mlContext); + if (!tensors) { + tensors = new Set(); + this.tensorIdsByContext.set(mlContext, tensors); + } + tensors.add(tensorId); + return tensorId; + } +} + +export const createTensorManager = (...args: ConstructorParameters): TensorManager => + new TensorManagerImpl(...args); diff --git a/js/web/lib/wasm/jsep/webnn/webnn.d.ts b/js/web/lib/wasm/jsep/webnn/webnn.d.ts index f8a1e1966fd4c..5cb0f4e74c3df 100644 --- a/js/web/lib/wasm/jsep/webnn/webnn.d.ts +++ b/js/web/lib/wasm/jsep/webnn/webnn.d.ts @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +/* eslint-disable @typescript-eslint/naming-convention */ + interface NavigatorML { readonly ml: ML; } @@ -30,7 +32,9 @@ type MLInputOperandLayout = 'nchw'|'nhwc'; type MLOperandDataType = 'float32'|'float16'|'int32'|'uint32'|'int64'|'uint64'|'int8'|'uint8'; interface MLOperandDescriptor { dataType: MLOperandDataType; - dimensions?: number[]; + shape?: readonly number[]; + /** @deprecated Use shape instead of dimensions */ + dimensions?: readonly number[]; } interface MLOperand { dataType(): MLOperandDataType; @@ -379,23 +383,32 @@ interface MLGraphBuilder { where(condition: MLOperand, input: MLOperand, other: MLOperand): MLOperand; } -// Experimental MLBuffer interface +// Experimental MLTensor interface -type MLSize64Out = number; -interface MLBuffer { - readonly size: MLSize64Out; +interface MLTensor { destroy(): void; } -type MLSize64 = number; -interface MLBufferDescriptor { - size: MLSize64; + +type MLNamedTensor = Record; + +type MLTensorUsageFlags = number; + +declare const MLTensorUsage: { + readonly WEBGPU_INTEROP: MLTensorUsageFlags; + readonly READ: MLTensorUsageFlags; + readonly WRITE: MLTensorUsageFlags; +}; + +interface MLTensorDescriptor extends MLOperandDescriptor { + usage: MLTensorUsageFlags; } -type MLNamedBuffers = Record; + interface MLContext { - createBuffer(descriptor: MLBufferDescriptor): MLBuffer; - writeBuffer( - dstBuffer: MLBuffer, srcData: ArrayBufferView|ArrayBuffer, srcElementOffset?: MLSize64, - srcElementSize?: MLSize64): void; - readBuffer(srcBuffer: MLBuffer): Promise; - dispatch(graph: MLGraph, inputs: MLNamedBuffers, outputs: MLNamedBuffers): void; + createTensor(descriptor: MLTensorDescriptor): Promise; + writeTensor( + destinationTensor: MLTensor, sourceData: ArrayBufferView|ArrayBuffer, sourceElementOffset?: number, + sourceElementSize?: number): void; + readTensor(sourceTensor: MLTensor): Promise; + readTensor(sourceTensor: MLTensor, destinationData: ArrayBufferView|ArrayBuffer): Promise; + dispatch(graph: MLGraph, inputs: MLNamedTensor, outputs: MLNamedTensor): void; } diff --git a/js/web/lib/wasm/proxy-messages.ts b/js/web/lib/wasm/proxy-messages.ts index 8f3acdd582445..559f319a10f66 100644 --- a/js/web/lib/wasm/proxy-messages.ts +++ b/js/web/lib/wasm/proxy-messages.ts @@ -19,11 +19,18 @@ export type GpuBufferMetadata = { dispose?: () => void; }; +export type MLTensorMetadata = { + mlTensor: Tensor.MLTensorType; + download?: () => Promise; + dispose?: () => void; +}; + /** - * Tensors on location 'cpu-pinned' and 
'gpu-buffer' are not serializable. + * Tensors on location 'cpu-pinned', 'gpu-buffer', and 'ml-tensor' are not serializable. */ export type UnserializableTensorMetadata = | [dataType: Tensor.Type, dims: readonly number[], data: GpuBufferMetadata, location: 'gpu-buffer'] + | [dataType: Tensor.Type, dims: readonly number[], data: MLTensorMetadata, location: 'ml-tensor'] | [dataType: Tensor.Type, dims: readonly number[], data: Tensor.DataType, location: 'cpu-pinned']; /** @@ -34,6 +41,7 @@ export type UnserializableTensorMetadata = * - cpu: Uint8Array * - cpu-pinned: Uint8Array * - gpu-buffer: GpuBufferMetadata + * - ml-tensor: MLTensorMetadata * - location: tensor data location */ export type TensorMetadata = SerializableTensorMetadata | UnserializableTensorMetadata; diff --git a/js/web/lib/wasm/session-handler-inference.ts b/js/web/lib/wasm/session-handler-inference.ts index eff3e91389c98..c19043cc3637f 100644 --- a/js/web/lib/wasm/session-handler-inference.ts +++ b/js/web/lib/wasm/session-handler-inference.ts @@ -12,7 +12,7 @@ import { import { SerializableInternalBuffer, TensorMetadata } from './proxy-messages'; import { copyFromExternalBuffer, createSession, endProfiling, releaseSession, run } from './proxy-wrapper'; -import { isGpuBufferSupportedType } from './wasm-common'; +import { isGpuBufferSupportedType, isMLTensorSupportedType } from './wasm-common'; import { isNode } from './wasm-utils-env'; import { loadFile } from './wasm-utils-load-file'; @@ -22,6 +22,8 @@ export const encodeTensorMetadata = (tensor: Tensor, getName: () => string): Ten return [tensor.type, tensor.dims, tensor.data, 'cpu']; case 'gpu-buffer': return [tensor.type, tensor.dims, { gpuBuffer: tensor.gpuBuffer }, 'gpu-buffer']; + case 'ml-tensor': + return [tensor.type, tensor.dims, { mlTensor: tensor.mlTensor }, 'ml-tensor']; default: throw new Error(`invalid data location: ${tensor.location} for ${getName()}`); } @@ -39,6 +41,14 @@ export const decodeTensorMetadata = (tensor: TensorMetadata): Tensor => { const { gpuBuffer, download, dispose } = tensor[2]; return Tensor.fromGpuBuffer(gpuBuffer, { dataType, dims: tensor[1], download, dispose }); } + case 'ml-tensor': { + const dataType = tensor[0]; + if (!isMLTensorSupportedType(dataType)) { + throw new Error(`not supported data type: ${dataType} for deserializing MLTensor tensor`); + } + const { mlTensor, download, dispose } = tensor[2]; + return Tensor.fromMLTensor(mlTensor, { dataType, dims: tensor[1], download, dispose }); + } default: throw new Error(`invalid data location: ${tensor[3]}`); } diff --git a/js/web/lib/wasm/wasm-common.ts b/js/web/lib/wasm/wasm-common.ts index 78ff14540d8cb..ad2ff62587252 100644 --- a/js/web/lib/wasm/wasm-common.ts +++ b/js/web/lib/wasm/wasm-common.ts @@ -240,6 +240,20 @@ export const isGpuBufferSupportedType = (type: Tensor.Type): type is Tensor.GpuB type === 'uint4' || type === 'int4'; +/** + * Check whether the given tensor type is supported by WebNN MLTensor + */ +export const isMLTensorSupportedType = (type: Tensor.Type): type is Tensor.MLTensorDataTypes => + type === 'float32' || + type === 'float16' || + type === 'int32' || + type === 'int64' || + type === 'uint32' || + type === 'uint64' || + type === 'int8' || + type === 'uint8' || + type === 'bool'; + /** * Map string data location to integer value */ @@ -255,6 +269,8 @@ export const dataLocationStringToEnum = (location: Tensor.DataLocation): number return 3; case 'gpu-buffer': return 4; + case 'ml-tensor': + return 5; default: throw new Error(`unsupported data location: 
${location}`); } @@ -264,4 +280,4 @@ export const dataLocationStringToEnum = (location: Tensor.DataLocation): number * Map integer data location to string value */ export const dataLocationEnumToString = (location: number): Tensor.DataLocation | undefined => - (['none', 'cpu', 'cpu-pinned', 'texture', 'gpu-buffer'] as const)[location]; + (['none', 'cpu', 'cpu-pinned', 'texture', 'gpu-buffer', 'ml-tensor'] as const)[location]; diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index ed001cfa90f59..0668ac1931988 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -20,6 +20,7 @@ import { calculateTensorSizeInBytes, dataLocationStringToEnum, isGpuBufferSupportedType, + isMLTensorSupportedType, logLevelStringToEnum, tensorDataTypeEnumToString, tensorDataTypeStringToEnum, @@ -162,7 +163,7 @@ export const initEp = async (env: Env, epName: string): Promise => { /** * valid data locations for input/output tensors. */ -type SupportedTensorDataLocationForInputOutput = 'cpu' | 'cpu-pinned' | 'gpu-buffer'; +type SupportedTensorDataLocationForInputOutput = 'cpu' | 'cpu-pinned' | 'gpu-buffer' | 'ml-tensor'; type IOBindingState = { /** @@ -173,7 +174,7 @@ type IOBindingState = { /** * the preferred location for each output tensor. * - * value is one of 'cpu', 'cpu-pinned', 'gpu-buffer'. + * value is one of 'cpu', 'cpu-pinned', 'gpu-buffer', 'ml-tensor'. */ readonly outputPreferredLocations: readonly SupportedTensorDataLocationForInputOutput[]; @@ -287,6 +288,7 @@ export const createSession = async ( for (const provider of options?.executionProviders ?? []) { const providerName = typeof provider === 'string' ? provider : provider.name; if (providerName === 'webnn') { + wasm.shouldTransferToMLTensor = false; if (wasm.currentContext) { throw new Error('WebNN execution provider is already set.'); } @@ -318,7 +320,9 @@ export const createSession = async ( // clear current MLContext after session creation if (wasm.currentContext) { + wasm.jsepRegisterMLContext!(sessionHandle, wasm.currentContext); wasm.currentContext = undefined; + wasm.shouldTransferToMLTensor = true; } const [inputCount, outputCount] = getSessionInputOutputCount(sessionHandle); @@ -354,7 +358,7 @@ export const createSession = async ( typeof options?.preferredOutputLocation === 'string' ? options.preferredOutputLocation : (options?.preferredOutputLocation?.[nameString] ?? 'cpu'); - if (location !== 'cpu' && location !== 'cpu-pinned' && location !== 'gpu-buffer') { + if (location !== 'cpu' && location !== 'cpu-pinned' && location !== 'gpu-buffer' && location !== 'ml-tensor') { throw new Error(`Not supported preferred output location: ${location}.`); } if (enableGraphCapture && location !== 'gpu-buffer') { @@ -366,9 +370,9 @@ export const createSession = async ( } } - // use IO binding only when at least one output is preffered to be on GPU. + // use IO binding only when at least one output is preferred to be on GPU. 
let bindingState: IOBindingState | null = null; - if (!BUILD_DEFS.DISABLE_JSEP && outputPreferredLocations.some((l) => l === 'gpu-buffer')) { + if (!BUILD_DEFS.DISABLE_JSEP && outputPreferredLocations.some((l) => l === 'gpu-buffer' || l === 'ml-tensor')) { ioBindingHandle = wasm._OrtCreateBinding(sessionHandle); if (ioBindingHandle === 0) { checkLastError("Can't create IO binding."); @@ -459,7 +463,7 @@ export const prepareInputOutputTensor = ( let rawData: number; let dataByteLength: number; - if (dataType === 'string' && location === 'gpu-buffer') { + if (dataType === 'string' && (location === 'gpu-buffer' || location === 'ml-tensor')) { throw new Error('String tensor is not supported on GPU.'); } @@ -478,6 +482,15 @@ export const prepareInputOutputTensor = ( throw new Error('Tensor location "gpu-buffer" is not supported without using WebGPU.'); } rawData = registerBuffer(sessionId, index, gpuBuffer, dataByteLength); + } else if (location === 'ml-tensor') { + const mlTensor = tensor[2].mlTensor as MLTensor; + dataByteLength = calculateTensorSizeInBytes(tensorDataTypeStringToEnum(dataType), dims)!; + + const registerMLTensor = wasm.jsepRegisterMLTensor; + if (!registerMLTensor) { + throw new Error('Tensor location "ml-tensor" is not supported without using WebNN.'); + } + rawData = registerMLTensor(mlTensor, tensorDataTypeStringToEnum(dataType), dims); } else { const data = tensor[2]; @@ -563,6 +576,9 @@ export const run = async ( const outputNamesOffset = wasm.stackAlloc(outputCount * 4); try { + // WebNN backend needs the active session to check MLTensors with the current context. + wasm.jsepOnRunStart?.(sessionHandle); + [runOptionsHandle, runOptionsAllocs] = setRunOptions(options); // create input tensors @@ -654,7 +670,6 @@ export const run = async ( ]); } - wasm.jsepOnRunStart?.(sessionHandle); let errorCode: number; if (!BUILD_DEFS.DISABLE_JSEP && ioBindingState) { errorCode = await wasm._OrtRunWithBinding( @@ -726,7 +741,7 @@ export const run = async ( const preferredLocation = ioBindingState?.outputPreferredLocations[outputIndices[i]]; if (type === 'string') { - if (preferredLocation === 'gpu-buffer') { + if (preferredLocation === 'gpu-buffer' || preferredLocation === 'ml-tensor') { throw new Error('String tensor is not supported on GPU.'); } const stringData: string[] = []; @@ -766,6 +781,37 @@ export const run = async ( }, 'gpu-buffer', ]); + } else if (preferredLocation === 'ml-tensor' && size > 0) { + const ensureTensor = wasm.jsepEnsureTensor; + if (!ensureTensor) { + throw new Error('preferredLocation "ml-tensor" is not supported without using WebNN.'); + } + const tensorSize = calculateTensorSizeInBytes(dataType, size); + if (tensorSize === undefined || !isMLTensorSupportedType(type)) { + throw new Error(`Unsupported data type: ${type}`); + } + + // If the graph has been partitioned, the output tensor may have not been created. For this reason, we use + // ensureTensor to get/create the MLTensor. In which case, we don't need to copy the data if a new tensor + // has been created. + const mlTensor = await ensureTensor(dataOffset, dataType, dims, false); + + // do not release the tensor right now. it will be released when user calls tensor.dispose(). 
+ keepOutputTensor = true; + + output.push([ + type, + dims, + { + mlTensor, + download: wasm.jsepCreateMLTensorDownloader!(dataOffset, type), + dispose: () => { + wasm.jsepReleaseTensorId!(dataOffset); + wasm._OrtReleaseTensor(tensor); + }, + }, + 'ml-tensor', + ]); } else { const typedArrayConstructor = tensorTypeToTypedArrayConstructor(type); const data = new typedArrayConstructor(size); diff --git a/js/web/lib/wasm/wasm-types.ts b/js/web/lib/wasm/wasm-types.ts index 828cd3cfd94fa..3e08fe97f559d 100644 --- a/js/web/lib/wasm/wasm-types.ts +++ b/js/web/lib/wasm/wasm-types.ts @@ -7,6 +7,7 @@ /// import type { Tensor } from 'onnxruntime-common'; +import { DataType } from './wasm-common'; /* eslint-disable @typescript-eslint/naming-convention */ @@ -27,6 +28,16 @@ export declare namespace JSEP { type CaptureBeginFunction = () => void; type CaptureEndFunction = () => void; type ReplayFunction = () => void; + type ReserveTensorIdFunction = () => number; + type ReleaseTensorIdFunction = (tensorId: number) => void; + type EnsureTensorFunction = ( + tensorId: number, + dataType: DataType, + shape: readonly number[], + copyOld: boolean, + ) => Promise; + type UploadTensorFunction = (tensorId: number, data: Uint8Array) => void; + type DownloadTensorFunction = (tensorId: number, dstBuffer: ArrayBufferView | ArrayBuffer) => Promise; export interface Module extends WebGpuModule, WebNnModule { /** @@ -62,7 +73,17 @@ export declare namespace JSEP { replay: ReplayFunction, ], ): void; - jsepInit(name: 'webnn', initParams?: never): void; + jsepInit( + name: 'webnn', + initParams: [ + backend: BackendType, + reserveTensorId: ReserveTensorIdFunction, + releaseTensorId: ReleaseTensorIdFunction, + ensureTensor: EnsureTensorFunction, + uploadTensor: UploadTensorFunction, + downloadTensor: DownloadTensorFunction, + ], + ): void; } export interface WebGpuModule { @@ -134,6 +155,70 @@ export declare namespace JSEP { * Active MLContext used to create WebNN EP. */ currentContext: MLContext; + + /** + * Disables creating MLTensors. This is used to avoid creating MLTensors for graph initializers. + */ + shouldTransferToMLTensor: boolean; + + /** + * [exported from pre-jsep.js] Register MLContext for a session. + * @param sessionId - specify the session ID. + * @param context - specify the MLContext. + * @returns + */ + jsepRegisterMLContext: (sessionId: number, context: MLContext) => void; + /** + * [exported from pre-jsep.js] Reserve a MLTensor ID attached to the current session. + * @returns the MLTensor ID. + */ + jsepReserveTensorId: () => number; + /** + * [exported from pre-jsep.js] Release an MLTensor ID from use and destroys underlying MLTensor if no longer in use. + * @param tensorId - specify the MLTensor ID. + * @returns + */ + jsepReleaseTensorId: (tensorId: number) => void; + /** + * [exported from pre-jsep.js] Ensure that an MLTensor of a given type and shape exists for a MLTensor ID. + * @param tensorId - specify the MLTensor ID. + * @param onnxDataType - specify the data type. + * @param shape - specify the dimensions (WebNN shape) of the tensor. + * @param copyOld - specify whether to copy the old tensor if a new tensor was created. + * @returns the MLTensor associated with the tensor ID. + */ + jsepEnsureTensor: (tensorId: number, dataType: DataType, shape: number[], copyOld: boolean) => Promise; + /** + * [exported from pre-jsep.js] Upload data to an MLTensor. + * @param tensorId - specify the MLTensor ID. + * @param data - specify the data to upload. 
It can be a TensorProto::data_type or a WebNN MLOperandDataType. + * @returns + */ + jsepUploadTensor: (tensorId: number, data: Uint8Array) => void; + /** + * [exported from pre-jsep.js] Download data from an MLTensor. + * @param tensorId - specify the MLTensor ID. + * @returns the downloaded data. + */ + jsepDownloadTensor: (tensorId: number, dstBuffer: ArrayBufferView | ArrayBuffer) => Promise; + /** + * [exported from pre-jsep.js] Creates a downloader function to download data from an MLTensor. + * @param tensorId - specify the MLTensor ID. + * @param type - specify the data type. + * @returns the downloader function. + */ + jsepCreateMLTensorDownloader: ( + tensorId: number, + type: Tensor.MLTensorDataTypes, + ) => () => Promise; + /** + * [exported from pre-jsep.js] Registers an external MLTensor to a session. + * @param tensor - specify the MLTensor. + * @param dataType - specify the data type. + * @param dimensions - specify the dimensions. + * @returns the MLTensor ID for the external MLTensor. + */ + jsepRegisterMLTensor: (tensor: MLTensor, onnxDataType: DataType, dimensions: readonly number[]) => number; } } diff --git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts index d237293dbb192..e94e11d0ace56 100644 --- a/js/web/script/test-runner-cli-args.ts +++ b/js/web/script/test-runner-cli-args.ts @@ -62,6 +62,8 @@ Options: none (default) gpu-tensor use pre-allocated GPU tensors for inputs and outputs gpu-location use pre-allocated GPU tensors for inputs and set preferredOutputLocation to 'gpu-buffer' + ml-tensor use pre-allocated MLTensor tensors for inputs and outputs + ml-location use pre-allocated MLTensor tensors for inputs and set preferredOutputLocation to 'ml-tensor' *** Logging Options *** @@ -133,7 +135,7 @@ export declare namespace TestRunnerCliArgs { type Backend = 'cpu' | 'webgl' | 'webgpu' | 'wasm' | 'onnxruntime' | 'webnn'; type Environment = 'chrome' | 'chromecanary' | 'edge' | 'firefox' | 'electron' | 'safari' | 'node' | 'bs'; type BundleMode = 'dev' | 'perf'; - type IOBindingMode = 'none' | 'gpu-tensor' | 'gpu-location'; + type IOBindingMode = 'none' | 'gpu-tensor' | 'gpu-location' | 'ml-tensor' | 'ml-location'; } export interface TestRunnerCliArgs { @@ -455,7 +457,7 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs // Option: -i=<...>, --io-binding=<...> const ioBindingArg = args['io-binding'] || args.i; const ioBindingMode = typeof ioBindingArg !== 'string' ? 'none' : ioBindingArg; - if (['none', 'gpu-tensor', 'gpu-location'].indexOf(ioBindingMode) === -1) { + if (['none', 'gpu-tensor', 'gpu-location', 'ml-tensor', 'ml-location'].indexOf(ioBindingMode) === -1) { throw new Error(`not supported io binding mode ${ioBindingMode}`); } diff --git a/js/web/script/test-runner-cli.ts b/js/web/script/test-runner-cli.ts index a9fcd7b876b2f..68ee58dab7094 100644 --- a/js/web/script/test-runner-cli.ts +++ b/js/web/script/test-runner-cli.ts @@ -380,7 +380,7 @@ async function main() { } let ioBinding: Test.IOBindingMode; - if (backend !== 'webgpu' && args.ioBindingMode !== 'none') { + if (!['webgpu', 'webnn'].includes(backend) && args.ioBindingMode !== 'none') { npmlog.warn( 'TestRunnerCli.Init.Model', `Ignoring IO Binding Mode "${args.ioBindingMode}" for backend "${backend}".`, diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index aa9555c191501..2176a776a0192 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -1,6 +1,11 @@ // Copyright (c) Microsoft Corporation. 
All rights reserved. // Licensed under the MIT License. +// WebNN API currently does not have a TypeScript definition file. This file is a workaround with types generated from +// WebNN API specification. +// https://github.com/webmachinelearning/webnn/issues/677 +/// + import { Float16Array as Float16ArrayPolyfill } from '@petamoriken/float16'; import { expect } from 'chai'; import * as ort from 'onnxruntime-common'; @@ -19,6 +24,7 @@ import { createView } from '../lib/wasm/jsep/tensor-view'; import { calculateTensorSizeInBytes, isGpuBufferSupportedType, + isMLTensorSupportedType, tensorDataTypeStringToEnum, } from '../lib/wasm/wasm-common'; @@ -170,13 +176,20 @@ async function initializeSession( }`, ); + let preferredOutputLocation: ort.Tensor.DataLocation | undefined; + if (ioBindingMode === 'gpu-location') { + preferredOutputLocation = 'gpu-buffer'; + } else if (ioBindingMode === 'ml-location') { + preferredOutputLocation = 'ml-tensor'; + } + const profilerConfig = profile ? { maxNumberEvents: 65536 } : undefined; const sessionConfig = { ...sessionOptions, executionProviders: [backendHint], profiler: profilerConfig, enableProfiling: profile, - preferredOutputLocation: ioBindingMode === 'gpu-location' ? ('gpu-buffer' as const) : undefined, + preferredOutputLocation, externalData, }; @@ -219,6 +232,7 @@ export class ModelTestContext { readonly perfData: ModelTestContext.ModelTestPerfData, readonly ioBinding: Test.IOBindingMode, private readonly profile: boolean, + public readonly mlContext?: MLContext, ) {} /** @@ -272,7 +286,24 @@ export class ModelTestContext { const initStart = now(); const executionProviderConfig = - modelTest.backend === 'webnn' ? testOptions?.webnnOptions || 'webnn' : modelTest.backend!; + modelTest.backend === 'webnn' ? testOptions?.webnnOptions || { name: 'webnn' } : modelTest.backend!; + let mlContext: MLContext | undefined; + if (['ml-tensor', 'ml-location'].includes(modelTest.ioBinding)) { + const webnnOptions = executionProviderConfig as ort.InferenceSession.WebNNExecutionProviderOption; + const deviceType = (webnnOptions as ort.InferenceSession.WebNNContextOptions)?.deviceType; + const numThreads = (webnnOptions as ort.InferenceSession.WebNNContextOptions)?.numThreads; + const powerPreference = (webnnOptions as ort.InferenceSession.WebNNContextOptions)?.powerPreference; + + mlContext = await navigator.ml.createContext({ + deviceType, + numThreads, + powerPreference, + }); + (executionProviderConfig as ort.InferenceSession.WebNNExecutionProviderOption).context = mlContext; + if (!deviceType) { + (executionProviderConfig as ort.InferenceSession.WebNNContextOptions).deviceType = deviceType; + } + } const session = await initializeSession( modelTest.modelUrl, executionProviderConfig, @@ -295,6 +326,7 @@ export class ModelTestContext { { init: initEnd - initStart, firstRun: -1, runs: [], count: 0 }, modelTest.ioBinding, profile, + mlContext, ); } finally { this.initializing = false; @@ -622,30 +654,82 @@ function createGpuTensorForOutput(type: ort.Tensor.Type, dims: readonly number[] }); } +async function createMLTensorForOutput(mlContext: MLContext, type: ort.Tensor.Type, dims: readonly number[]) { + if (!isMLTensorSupportedType(type)) { + throw new Error(`createMLTensorForOutput can not work with ${type} tensor`); + } + + const dataType = type === 'bool' ? 'uint8' : type; + + const mlTensor = await mlContext.createTensor({ + dataType, + shape: dims as number[], + // Assign both shape and dimensions while transitioning to new API. 
+ dimensions: dims as number[], + usage: MLTensorUsage.READ, + }); + + return ort.Tensor.fromMLTensor(mlTensor, { + dataType: type, + dims, + dispose: () => mlTensor.destroy(), + download: async () => { + const arrayBuffer = await mlContext.readTensor(mlTensor); + return createView(arrayBuffer, type) as ort.Tensor.DataTypeMap[ort.Tensor.MLTensorDataTypes]; + }, + }); +} + +async function createMLTensorForInput(mlContext: MLContext, cpuTensor: ort.Tensor): Promise { + if (!isMLTensorSupportedType(cpuTensor.type) || Array.isArray(cpuTensor.data)) { + throw new Error(`createMLTensorForInput can not work with ${cpuTensor.type} tensor`); + } + const dataType = cpuTensor.type === 'bool' ? 'uint8' : cpuTensor.type; + const mlTensor = await mlContext.createTensor({ + dataType, + shape: cpuTensor.dims as number[], + // Assign both shape and dimensions while transitioning to new API. + dimensions: cpuTensor.dims as number[], + usage: MLTensorUsage.WRITE, + }); + mlContext.writeTensor(mlTensor, cpuTensor.data); + return ort.Tensor.fromMLTensor(mlTensor, { + dataType: cpuTensor.type, + dims: cpuTensor.dims, + dispose: () => mlTensor.destroy(), + }); +} + export async function sessionRun(options: { session: ort.InferenceSession; feeds: Record; outputsMetaInfo: Record>; ioBinding: Test.IOBindingMode; + mlContext?: MLContext; }): Promise<[number, number, ort.InferenceSession.OnnxValueMapType]> { const session = options.session; const feeds = options.feeds; const fetches: Record = {}; - // currently we only support IO Binding for WebGPU + // currently we only support IO Binding for WebGPU and WebNN // - // For inputs, we create GPU tensors on both 'gpu-tensor' and 'gpu-location' binding testing mode. - // For outputs, we create GPU tensors on 'gpu-tensor' binding testing mode only. + // For inputs, we create tensors on 'gpu-tensor', 'gpu-location', 'ml-tensor', and 'ml-location' binding testing + // modes. + // For outputs, we create tensors on 'gpu-tensor' and 'ml-tensor' binding testing modes. // in 'gpu-device' binding mode, outputs are not pre-allocated. 
- const shouldUploadInput = options.ioBinding === 'gpu-tensor' || options.ioBinding === 'gpu-location'; - const shouldUploadOutput = options.ioBinding === 'gpu-tensor'; + const shouldUploadInput = ['gpu-tensor', 'gpu-location', 'ml-location', 'ml-tensor'].includes(options.ioBinding); + const shouldUploadOutput = options.ioBinding === 'gpu-tensor' || options.ioBinding === 'ml-tensor'; try { if (shouldUploadInput) { // replace the CPU tensors in feeds into GPU tensors for (const name in feeds) { if (Object.hasOwnProperty.call(feeds, name)) { if (feeds[name].size > 0) { - feeds[name] = createGpuTensorForInput(feeds[name]); + if (options.ioBinding === 'ml-location' || options.ioBinding === 'ml-tensor') { + feeds[name] = await createMLTensorForInput(options.mlContext!, feeds[name]); + } else { + feeds[name] = createGpuTensorForInput(feeds[name]); + } } } } @@ -658,7 +742,11 @@ export async function sessionRun(options: { if (dims.some((d) => d === 0)) { fetches[name] = new ort.Tensor(type, [], dims); } else { - fetches[name] = createGpuTensorForOutput(type, dims); + if (options.ioBinding === 'ml-tensor') { + fetches[name] = await createMLTensorForOutput(options.mlContext!, type, dims); + } else { + fetches[name] = createGpuTensorForOutput(type, dims); + } } } } @@ -714,6 +802,7 @@ export async function runModelTestSet( feeds, outputsMetaInfo, ioBinding: context.ioBinding, + mlContext: context.mlContext, }); if (context.perfData.count === 0) { context.perfData.firstRun = end - start; diff --git a/js/web/test/test-types.ts b/js/web/test/test-types.ts index be1e56485ec5a..29a11f969ffea 100644 --- a/js/web/test/test-types.ts +++ b/js/web/test/test-types.ts @@ -52,8 +52,12 @@ export declare namespace Test { * `preferredOutputLocation` will be set to `gpu-buffer`. * - gpu-tensor: inputs and outputs will all be pre-allocated as GPU tensors. `preferredOutputLocation` * will not be set. + * - ml-location: inputs will be pre-allocated as ML tensors; no output will be pre-allocated; + * `preferredOutputLocation` will be set to `ml-tensor`. + * - ml-tensor: inputs and outputs will all be pre-allocated as MLTensor tensors. `preferredOutputLocation` + * will not be set. */ - export type IOBindingMode = 'none' | 'gpu-tensor' | 'gpu-location'; + export type IOBindingMode = 'none' | 'gpu-tensor' | 'gpu-location' | 'ml-tensor' | 'ml-location'; export interface ModelTestCase { name: string; diff --git a/onnxruntime/core/framework/allocator.cc b/onnxruntime/core/framework/allocator.cc index 5e66f2b99fded..b6dc8ad56f257 100644 --- a/onnxruntime/core/framework/allocator.cc +++ b/onnxruntime/core/framework/allocator.cc @@ -141,7 +141,8 @@ ORT_API_STATUS_IMPL(OrtApis::CreateMemoryInfo, _In_ const char* name1, enum OrtA strcmp(name1, onnxruntime::OpenVINO_GPU) == 0 || strcmp(name1, onnxruntime::DML) == 0 || strcmp(name1, onnxruntime::HIP) == 0 || - strcmp(name1, onnxruntime::WEBGPU_BUFFER) == 0) { + strcmp(name1, onnxruntime::WEBGPU_BUFFER) == 0 || + strcmp(name1, onnxruntime::WEBNN_TENSOR) == 0) { *out = new OrtMemoryInfo( name1, type, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast(id1)), id1, mem_type1); diff --git a/onnxruntime/core/providers/webnn/allocator.cc b/onnxruntime/core/providers/webnn/allocator.cc new file mode 100644 index 0000000000000..9c5cd651e1f00 --- /dev/null +++ b/onnxruntime/core/providers/webnn/allocator.cc @@ -0,0 +1,41 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+
+#include "core/providers/webnn/allocator.h"
+
+#include "core/common/safeint.h"
+
+namespace onnxruntime {
+namespace webnn {
+
+void* WebNNTensorAllocator::Alloc(size_t size) {
+  if (size == 0) {
+    return nullptr;
+  }
+  if (!emscripten::val::module_property("shouldTransferToMLTensor").as<bool>()) {
+    // We don't need to transfer the tensor to an MLTensor, so we don't need to allocate an MLTensor id.
+    return nullptr;
+  }
+  void* p = EM_ASM_PTR({ return Module.jsepReserveTensorId(); });
+  allocations_[p] = size;
+  stats_.num_allocs++;
+  stats_.bytes_in_use += SafeInt<size_t>(size);
+  return p;
+}
+
+void WebNNTensorAllocator::Free(void* p) {
+  if (p == nullptr) {
+    return;
+  }
+  EM_ASM({ Module.jsepReleaseTensorId($0); }, p);
+  size_t size = allocations_[p];
+  stats_.bytes_in_use -= size;
+  allocations_.erase(p);
+}
+
+void WebNNTensorAllocator::GetStats(AllocatorStats* stats) {
+  *stats = stats_;
+}
+
+}  // namespace webnn
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webnn/allocator.h b/onnxruntime/core/providers/webnn/allocator.h
new file mode 100644
index 0000000000000..c06da909801cc
--- /dev/null
+++ b/onnxruntime/core/providers/webnn/allocator.h
@@ -0,0 +1,32 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <emscripten.h>
+#include <emscripten/val.h>
+
+#include "core/common/inlined_containers.h"
+#include "core/framework/allocator.h"
+#include "core/framework/ortdevice.h"
+
+namespace onnxruntime {
+namespace webnn {
+
+class WebNNTensorAllocator : public IAllocator {
+ public:
+  WebNNTensorAllocator() : IAllocator(OrtMemoryInfo(WEBNN_TENSOR, OrtAllocatorType::OrtDeviceAllocator, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0), 0, OrtMemTypeDefault)) {}
+
+  void* Alloc(size_t size) override;
+
+  void Free(void* p) override;
+
+  void GetStats(AllocatorStats* stats) override;
+
+ private:
+  AllocatorStats stats_;
+  InlinedHashMap<void*, size_t> allocations_;
+};
+
+}  // namespace webnn
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webnn/builders/helper.cc b/onnxruntime/core/providers/webnn/builders/helper.cc
index c4a633fcc92bb..b90c7d76a6507 100644
--- a/onnxruntime/core/providers/webnn/builders/helper.cc
+++ b/onnxruntime/core/providers/webnn/builders/helper.cc
@@ -12,6 +12,19 @@
 namespace onnxruntime {
 namespace webnn {
 
+WebnnDeviceType DeviceTypeFromString(const std::string_view& device_type) {
+  if (device_type == "gpu") {
+    return WebnnDeviceType::GPU;
+  }
+  if (device_type == "cpu") {
+    return WebnnDeviceType::CPU;
+  }
+  if (device_type == "npu") {
+    return WebnnDeviceType::NPU;
+  }
+  ORT_THROW("Unknown WebNN deviceType.");
+}
+
 InitializedTensorSet CollectAllInitializedTensors(const GraphViewer& graph_viewer) {
   InitializedTensorSet all_initializers;
   if (graph_viewer.IsSubgraph()) {
@@ -243,5 +256,10 @@ bool SetWebnnDataType(emscripten::val& desc, const int32_t data_type) {
   }
 }
 
+bool IsMLTensorSupported() {
+  static bool is_supported = !emscripten::val::global("MLTensor").isUndefined();
+  return is_supported;
+}
+
 }  // namespace webnn
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h
index 7ba1d18fa1a76..529463f0808ad 100644
--- a/onnxruntime/core/providers/webnn/builders/helper.h
+++ b/onnxruntime/core/providers/webnn/builders/helper.h
@@ -31,6 +31,8 @@ enum class WebnnDeviceType {
   NPU,
 };
 
+WebnnDeviceType DeviceTypeFromString(const std::string_view& device_type);
+
 // Collects all the initializer tensors in the
subGraph and its ancestor graphs. InitializedTensorSet CollectAllInitializedTensors(const GraphViewer& graph_viewer); @@ -292,5 +294,7 @@ bool GetBidirectionalBroadcastShape(std::vector& shape_a, bool SetWebnnDataType(emscripten::val& desc, const int32_t data_type); +bool IsMLTensorSupported(); + } // namespace webnn } // namespace onnxruntime diff --git a/onnxruntime/core/providers/webnn/builders/model.cc b/onnxruntime/core/providers/webnn/builders/model.cc index 8cd2e8d0ffad3..fcfdb146bff34 100644 --- a/onnxruntime/core/providers/webnn/builders/model.cc +++ b/onnxruntime/core/providers/webnn/builders/model.cc @@ -11,21 +11,30 @@ #include "core/common/safeint.h" #include "core/graph/onnx_protobuf.h" #include "core/providers/common.h" -#include "core/providers/webnn/builders/helper.h" #include "model.h" namespace onnxruntime { namespace webnn { -Model::Model(const emscripten::val& context, const emscripten::val& graph, const logging::Logger& logger) +Model::Model(const emscripten::val& context, const emscripten::val& graph, const logging::Logger& logger, bool use_dispatch) : wnn_context_(context), wnn_graph_(graph), - logger_(logger) {} + logger_(logger), + use_dispatch_(use_dispatch) {} Model::~Model() {} Status Model::Predict(const InlinedHashMap& inputs, const InlinedHashMap& outputs) { + if (use_dispatch_) { + return Dispatch(inputs, outputs); + } else { + return Compute(inputs, outputs); + } +} + +onnxruntime::common::Status Model::Compute(const InlinedHashMap& inputs, + const InlinedHashMap& outputs) { for (const auto& input : inputs) { const std::string& name = input.first; const struct OnnxTensorData tensor = input.second; @@ -142,6 +151,40 @@ Status Model::Predict(const InlinedHashMap& inputs, return Status::OK(); } +onnxruntime::common::Status Model::Dispatch(const InlinedHashMap& inputs, + const InlinedHashMap& outputs) { + auto jsepEnsureTensor = emscripten::val::module_property("jsepEnsureTensor"); + auto promises = emscripten::val::array(); + for (const auto& [_, tensor] : inputs) { + emscripten::val shape = emscripten::val::array(); + for (const auto& dim : tensor.tensor_info.shape) { + uint32_t dim_val = SafeInt(dim); + shape.call("push", dim_val); + } + auto ml_tensor = jsepEnsureTensor(reinterpret_cast(tensor.buffer), tensor.tensor_info.data_type, shape, true); + promises.call("push", ml_tensor); + } + for (const auto& [_, tensor] : outputs) { + emscripten::val shape = emscripten::val::array(); + for (const auto& dim : tensor.tensor_info.shape) { + uint32_t dim_val = SafeInt(dim); + shape.call("push", dim_val); + } + auto ml_tensor = jsepEnsureTensor(reinterpret_cast(tensor.buffer), tensor.tensor_info.data_type, shape, false); + promises.call("push", ml_tensor); + } + auto ml_tensors = emscripten::val::global("Promise").call("all", promises).await(); + for (const auto& [name, _] : inputs) { + wnn_inputs_.set(name, ml_tensors.call("shift")); + } + for (const auto& [name, _] : outputs) { + wnn_outputs_.set(name, ml_tensors.call("shift")); + } + wnn_context_.call("dispatch", wnn_graph_, wnn_inputs_, wnn_outputs_); + + return Status::OK(); +} + const OnnxTensorInfo& Model::GetInputOutputInfo(const std::string& name) const { return input_output_info_.at(name); } @@ -156,6 +199,10 @@ void Model::SetOutputMap(InlinedHashMap&& output_map) { // Pre-allocate the input and output buffers for the WebNN graph. void Model::AllocateInputOutputBuffers() { + // We don't need to allocate JS ArrayBuffers if the WebNN API supports MLTensor. 
+ if (use_dispatch_) { + return; + } for (const auto& input : inputs_) { const auto& input_info = input_output_info_.at(input); const auto input_shape = input_info.shape; diff --git a/onnxruntime/core/providers/webnn/builders/model.h b/onnxruntime/core/providers/webnn/builders/model.h index 5119dbbbc9858..c554dcb6f6877 100644 --- a/onnxruntime/core/providers/webnn/builders/model.h +++ b/onnxruntime/core/providers/webnn/builders/model.h @@ -56,6 +56,12 @@ class Model { size_t GetMappedOutputIdx(const std::string& name) const; private: + onnxruntime::common::Status Dispatch(const InlinedHashMap& inputs, + const InlinedHashMap& outputs); + + onnxruntime::common::Status Compute(const InlinedHashMap& inputs, + const InlinedHashMap& outputs); + emscripten::val wnn_context_ = emscripten::val::object(); emscripten::val wnn_graph_ = emscripten::val::object(); const logging::Logger& logger_; @@ -73,7 +79,9 @@ class Model { OrtMutex mutex_; - Model(const emscripten::val& context, const emscripten::val& path, const logging::Logger& logger); + bool use_dispatch_; + + Model(const emscripten::val& context, const emscripten::val& path, const logging::Logger& logger, bool use_dispatch); void SetInputOutputInfo(InlinedHashMap&& input_output_info) { input_output_info_ = std::move(input_output_info); diff --git a/onnxruntime/core/providers/webnn/builders/model_builder.cc b/onnxruntime/core/providers/webnn/builders/model_builder.cc index f92fda8c74717..044baa738e8c4 100644 --- a/onnxruntime/core/providers/webnn/builders/model_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/model_builder.cc @@ -340,7 +340,7 @@ Status ModelBuilder::Compile(std::unique_ptr& model) { } // Explicitly release the WebNN builder to free memory. wnn_builder_ = emscripten::val::undefined(); - model.reset(new Model(std::move(wnn_context_), std::move(wnn_graph), logger_)); + model.reset(new Model(std::move(wnn_context_), std::move(wnn_graph), logger_, IsMLTensorSupported())); model->SetInputs(std::move(input_names_)); model->SetOutputs(std::move(output_names_)); model->SetInputOutputInfo(std::move(input_output_info_)); diff --git a/onnxruntime/core/providers/webnn/data_transfer.cc b/onnxruntime/core/providers/webnn/data_transfer.cc new file mode 100644 index 0000000000000..44e9bf9edf3d9 --- /dev/null +++ b/onnxruntime/core/providers/webnn/data_transfer.cc @@ -0,0 +1,44 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/webnn/data_transfer.h" + +#include +#include "core/framework/tensor.h" + +namespace onnxruntime { +namespace webnn { + +bool DataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const { + // Copying data between MLTensors is not supported by WebNN. + return (dst_device.Type() == OrtDevice::GPU && src_device.Type() == OrtDevice::CPU) || + (dst_device.Type() == OrtDevice::CPU && src_device.Type() == OrtDevice::GPU); +} + +common::Status DataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const { + if (!emscripten::val::module_property("shouldTransferToMLTensor").as()) { + // We don't need to transfer the tensor to an MLTensor, so we don't need to copy the data. 
+ return Status::OK(); + } + + size_t bytes = src.SizeInBytes(); + if (bytes > 0) { + const void* src_data = src.DataRaw(); + void* dst_data = dst.MutableDataRaw(); + + const auto& dst_device = dst.Location().device; + + if (dst_device.Type() == OrtDevice::GPU) { + EM_ASM({ Module.jsepUploadTensor($0, HEAPU8.subarray($1, $1 + $2)); }, dst_data, reinterpret_cast(src_data), bytes); + } else { + auto jsepDownloadTensor = emscripten::val::module_property("jsepDownloadTensor"); + auto subarray = emscripten::typed_memory_view(bytes, static_cast(dst_data)); + jsepDownloadTensor(reinterpret_cast(src_data), subarray).await(); + } + } + + return Status::OK(); +} + +} // namespace webnn +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webnn/data_transfer.h b/onnxruntime/core/providers/webnn/data_transfer.h new file mode 100644 index 0000000000000..03cfada46d1a0 --- /dev/null +++ b/onnxruntime/core/providers/webnn/data_transfer.h @@ -0,0 +1,21 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include + +#include "core/framework/data_transfer.h" + +namespace onnxruntime { +namespace webnn { + +class DataTransfer : public IDataTransfer { + public: + bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override; + + common::Status CopyTensor(const Tensor& src, Tensor& dst) const override; +}; + +} // namespace webnn +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc index b729623c5d3d8..2258d1ac1cd8f 100644 --- a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc +++ b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc @@ -5,11 +5,14 @@ #include "webnn_execution_provider.h" #include "core/framework/compute_capability.h" +#include "core/framework/data_transfer_manager.h" #include "core/framework/memcpy.h" #include "core/framework/kernel_registry.h" #include "core/graph/graph_viewer.h" #include "core/session/onnxruntime_cxx_api.h" #include "core/common/safeint.h" +#include "core/providers/webnn/allocator.h" +#include "core/providers/webnn/data_transfer.h" #include "builders/model.h" #include "builders/helper.h" @@ -18,20 +21,14 @@ namespace onnxruntime { WebNNExecutionProvider::WebNNExecutionProvider(const std::string& webnn_device_flags) - : IExecutionProvider{onnxruntime::kWebNNExecutionProvider} { - // WebNN EP uses NHWC layout for CPU XNNPACK backend and NCHW for GPU DML backend. - if (webnn_device_flags.compare("cpu") == 0) { - wnn_device_type_ = webnn::WebnnDeviceType::CPU; - } else { - if (webnn_device_flags.compare("gpu") == 0) { - wnn_device_type_ = webnn::WebnnDeviceType::GPU; - } else if (webnn_device_flags.compare("npu") == 0) { - wnn_device_type_ = webnn::WebnnDeviceType::NPU; - } else { - ORT_THROW("Unknown WebNN deviceType."); - } - } - + : IExecutionProvider{ + onnxruntime::kWebNNExecutionProvider, + // If MLTensor is supported, we force all the tensors to be allocated as MLTensor. + OrtDevice( + webnn::IsMLTensorSupported() ? 
OrtDevice::GPU : OrtDevice::CPU,
+            OrtDevice::MemType::DEFAULT,
+            0)},
+      wnn_device_type_(webnn::DeviceTypeFromString(webnn_device_flags)) {
   wnn_context_ = emscripten::val::module_property("currentContext");
   if (!wnn_context_.as<bool>()) {
     ORT_THROW("Failed to create WebNN context.");
@@ -322,6 +319,32 @@ common::Status WebNNExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs,
   return Status::OK();
 }
 
+class WebNNMemcpy : public OpKernel {
+ public:
+  explicit WebNNMemcpy(const OpKernelInfo& info) : OpKernel(info) {}
+
+  Status Compute(OpKernelContext* context) const override {
+    auto jsepEnsureTensor = emscripten::val::module_property("jsepEnsureTensor");
+    const auto* X = context->Input<Tensor>(0);
+    ORT_ENFORCE(X != nullptr, "Memcpy: input tensor is null");
+    auto* Y = context->Output(0, X->Shape());
+    ORT_ENFORCE(Y != nullptr, "Memcpy: output tensor is null");
+    emscripten::val shape = emscripten::val::array();
+    for (auto dim : X->Shape().GetDims()) {
+      shape.call<void>("push", SafeInt<uint32_t>(dim).Ref());
+    }
+
+    jsepEnsureTensor(reinterpret_cast<intptr_t>(Y->MutableDataRaw()),
+                     Y->GetElementType(),
+                     shape, false)
+        .await();
+
+    const auto* data_transfer = Info().GetDataTransferManager().GetDataTransfer(X->Location().device, Y->Location().device);
+
+    return data_transfer->CopyTensor(*X, *Y);
+  }
+};
+
 ONNX_OPERATOR_KERNEL_EX(
     MemcpyFromHost,
     kOnnxDomain,
@@ -330,7 +353,7 @@ ONNX_OPERATOR_KERNEL_EX(
     KernelDefBuilder()
         .InputMemoryType(OrtMemTypeCPUInput, 0)
         .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
-    Memcpy);
+    WebNNMemcpy);
 
 ONNX_OPERATOR_KERNEL_EX(
     MemcpyToHost,
@@ -373,4 +396,22 @@ WebNNExecutionProvider::GetKernelRegistry() const {
   return kernel_registry;
 }
 
+std::unique_ptr<onnxruntime::IDataTransfer> WebNNExecutionProvider::GetDataTransfer() const {
+  if (!webnn::IsMLTensorSupported()) {
+    return nullptr;
+  }
+  return std::make_unique<webnn::DataTransfer>();
+}
+
+std::vector<AllocatorPtr> WebNNExecutionProvider::CreatePreferredAllocators() {
+  if (!webnn::IsMLTensorSupported()) {
+    return {};
+  }
+  AllocatorCreationInfo customAllocatorCreationInfo([&](OrtDevice::DeviceId) {
+    return std::make_unique<webnn::WebNNTensorAllocator>();
+  },
+                                                    0, false);
+  return {CreateAllocator(customAllocatorCreationInfo)};
+}
+
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webnn/webnn_execution_provider.h b/onnxruntime/core/providers/webnn/webnn_execution_provider.h
index 8ea8cedf04300..26c5e476bcc4f 100644
--- a/onnxruntime/core/providers/webnn/webnn_execution_provider.h
+++ b/onnxruntime/core/providers/webnn/webnn_execution_provider.h
@@ -40,6 +40,8 @@ class WebNNExecutionProvider : public IExecutionProvider {
 #endif
 
   std::shared_ptr<KernelRegistry> GetKernelRegistry() const override;
+  std::unique_ptr<onnxruntime::IDataTransfer> GetDataTransfer() const override;
+  std::vector<AllocatorPtr> CreatePreferredAllocators() override;
 
  private:
   emscripten::val wnn_context_ = emscripten::val::undefined();
diff --git a/onnxruntime/wasm/api.cc b/onnxruntime/wasm/api.cc
index 0e58bb4f93f7f..5173125cb8634 100644
--- a/onnxruntime/wasm/api.cc
+++ b/onnxruntime/wasm/api.cc
@@ -23,7 +23,8 @@ enum DataLocation {
   DATA_LOCATION_CPU = 1,
   DATA_LOCATION_CPU_PINNED = 2,
   DATA_LOCATION_TEXTURE = 3,
-  DATA_LOCATION_GPU_BUFFER = 4
+  DATA_LOCATION_GPU_BUFFER = 4,
+  DATA_LOCATION_ML_TENSOR = 5
 };
 
 static_assert(sizeof(const char*) == sizeof(size_t), "size of a pointer and a size_t value should be the same.");
@@ -235,7 +236,8 @@ void OrtFree(void* ptr) {
 OrtValue* OrtCreateTensor(int data_type, void* data, size_t data_length, size_t* dims, size_t dims_length,
                           int data_location) {
   if (data_location != DATA_LOCATION_CPU && data_location != DATA_LOCATION_CPU_PINNED &&
-      data_location != DATA_LOCATION_GPU_BUFFER) {
+      data_location != DATA_LOCATION_GPU_BUFFER &&
+      data_location != DATA_LOCATION_ML_TENSOR) {
     std::ostringstream ostr;
     ostr << "Invalid data location: " << data_location;
     CheckStatus(Ort::GetApi().CreateStatus(ORT_INVALID_ARGUMENT, ostr.str().c_str()));
@@ -264,10 +266,15 @@
OrtValue* OrtCreateTensor(int data_type, void* data, size_t data_length, size_t* return UNREGISTER_AUTO_RELEASE(value); } else { OrtMemoryInfo* memory_info = nullptr; - if (data_location != DATA_LOCATION_GPU_BUFFER) { - RETURN_NULLPTR_IF_ERROR(CreateCpuMemoryInfo, OrtDeviceAllocator, OrtMemTypeDefault, &memory_info); - } else { - RETURN_NULLPTR_IF_ERROR(CreateMemoryInfo, "WebGPU_Buffer", OrtDeviceAllocator, 0, OrtMemTypeDefault, &memory_info); + switch (data_location) { + case DATA_LOCATION_GPU_BUFFER: + RETURN_NULLPTR_IF_ERROR(CreateMemoryInfo, "WebGPU_Buffer", OrtDeviceAllocator, 0, OrtMemTypeDefault, &memory_info); + break; + case DATA_LOCATION_ML_TENSOR: + RETURN_NULLPTR_IF_ERROR(CreateMemoryInfo, "WebNN_Tensor", OrtDeviceAllocator, 0, OrtMemTypeDefault, &memory_info); + break; + default: + RETURN_NULLPTR_IF_ERROR(CreateCpuMemoryInfo, OrtDeviceAllocator, OrtMemTypeDefault, &memory_info); } REGISTER_AUTO_RELEASE_HANDLE(MemoryInfo, memory_info); @@ -418,15 +425,18 @@ int EMSCRIPTEN_KEEPALIVE OrtBindOutput(OrtIoBinding* io_binding, if (output_location != DATA_LOCATION_NONE && output_location != DATA_LOCATION_CPU && output_location != DATA_LOCATION_CPU_PINNED && - output_location != DATA_LOCATION_GPU_BUFFER) { + output_location != DATA_LOCATION_GPU_BUFFER && + output_location != DATA_LOCATION_ML_TENSOR) { std::ostringstream ostr; ostr << "Invalid data location (" << output_location << ") for output: \"" << name << "\"."; return CheckStatus(Ort::GetApi().CreateStatus(ORT_INVALID_ARGUMENT, ostr.str().c_str())); } OrtMemoryInfo* memory_info = nullptr; - if (output_location != DATA_LOCATION_GPU_BUFFER) { + if (output_location != DATA_LOCATION_GPU_BUFFER && output_location != DATA_LOCATION_ML_TENSOR) { RETURN_ERROR_CODE_IF_ERROR(CreateCpuMemoryInfo, OrtDeviceAllocator, OrtMemTypeDefault, &memory_info); + } else if (output_location == DATA_LOCATION_ML_TENSOR) { + RETURN_ERROR_CODE_IF_ERROR(CreateMemoryInfo, "WebNN_Tensor", OrtDeviceAllocator, 0, OrtMemTypeDefault, &memory_info); } else { RETURN_ERROR_CODE_IF_ERROR(CreateMemoryInfo, "WebGPU_Buffer", OrtDeviceAllocator, 0, OrtMemTypeDefault, &memory_info); } diff --git a/onnxruntime/wasm/pre-jsep.js b/onnxruntime/wasm/pre-jsep.js index 70ed295887994..68332d07a9782 100644 --- a/onnxruntime/wasm/pre-jsep.js +++ b/onnxruntime/wasm/pre-jsep.js @@ -202,5 +202,38 @@ Module['jsepInit'] = (name, params) => { Module.jsepUploadExternalBuffer = (dataId, buffer) => { backend['upload'](dataId, buffer); }; + } else if (name === 'webnn') { + // Functions called from EM_ASM need to be assigned in a way that can be minified. + // Functions called via emscripten::val::module_property need to be assigned by name so that the minifier doesn't + // change the name. + + [Module.jsepBackend, + Module.jsepReserveTensorId, + Module.jsepReleaseTensorId, + Module['jsepEnsureTensor'], + Module.jsepUploadTensor, + Module['jsepDownloadTensor'], + ] = params; + + // This function is called from both JS and an EM_ASM block, it needs both a minifiable name and an explicit name. + Module['jsepReleaseTensorId'] = Module.jsepReleaseTensorId; + + // Functions called from JS also need to have explicit names. 
+    const backend = Module.jsepBackend;
+    Module['jsepOnRunStart'] = sessionId => {
+      return backend['onRunStart'](sessionId);
+    };
+    Module['jsepRegisterMLContext'] = (sessionId, mlContext) => {
+      backend['registerMLContext'](sessionId, mlContext);
+    };
+    Module['jsepOnReleaseSession'] = sessionId => {
+      backend['onReleaseSession'](sessionId);
+    };
+    Module['jsepCreateMLTensorDownloader'] = (tensorId, type) => {
+      return backend['createMLTensorDownloader'](tensorId, type);
+    };
+    Module['jsepRegisterMLTensor'] = (tensor, dataType, shape) => {
+      return backend['registerMLTensor'](tensor, dataType, shape);
+    };
   }
 };
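The bindings above assume a WebNN backend object with roughly the surface sketched below. This is an approximation inferred from the calls in this file; the actual implementation is js/web/lib/wasm/jsep/backend-webnn.ts (added by this change), and the exact parameter types may differ.

interface WebNNBackend {
  onRunStart(sessionId: number): void;
  onReleaseSession(sessionId: number): void;
  registerMLContext(sessionId: number, mlContext: MLContext): void;
  // Returns a function that downloads the MLTensor identified by tensorId into CPU memory.
  createMLTensorDownloader(tensorId: number, type: string): () => Promise<ArrayBuffer>;
  // Registers an externally created MLTensor and returns the id used by the WASM side.
  registerMLTensor(tensor: MLTensor, dataType: number, shape: readonly number[]): number;
}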