<pre class='metadata'>
Title: Model Loader API
Shortname: model-loader
Level: 1
Status: w3c/CG-DRAFT
Status Text: <strong>This incubation is on pause, see <a href="https://github.com/webmachinelearning/model-loader/issues/36">discussion</a> for the latest updates.</strong>
Group: webml
URL: https://webmachinelearning.github.io/model-loader/
!Explainer: <a href="https://github.com/webmachinelearning/model-loader/blob/master/explainer.md">explainer.md</a>
Editor: Jonathan Bingham 114606, Google Inc. https://google.com
Abstract: This document describes an API to load a custom pre-trained machine learning model.
Logo: https://webmachinelearning.github.io/webmachinelearning-logo.png
</pre>
<pre class="anchors">
urlPrefix: https://webmachinelearning.github.io/webnn/; url: dom-navigator-ml; type: interface; text: ML
</pre>
<pre class="anchors">
urlPrefix: https://webmachinelearning.github.io/webnn/; spec: webnn
type: interface
text: ML; url: ml
text: MLContextOptions; url: dictdef-mlcontextoptions
text: MLContext; url: mlcontext
text: MLNamedInputs; url: typedefdef-mlnamedinputs
text: MLNamedOutputs; url: typedefdef-mlnamedoutputs
</pre>
<pre class="link-defaults">
spec: webnn; type: interface; text: ML
</pre>
Introduction {#intro}
=====================
For the introduction and use cases, please see the <a href="https://github.com/webmachinelearning/model-loader/blob/master/explainer.md">explainer.md</a>.
For illustration purposes, the API and examples use the <a href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema.fbs">TF Lite flatbuffer</a> format.
API {#api}
==========
<pre class="idl">
enum MLModelFormat {
// TensorFlow Lite flatbuffer.
"tflite"
};
enum MLDevicePreference {
// Let the backend select the most suitable device.
"auto",
// The backend will use the GPU for model inference. If an operator is not
// supported by the GPU, it will fall back to the CPU.
"gpu",
// The backend will use the CPU for model inference.
"cpu"
};
enum MLPowerPreference {
// Let the backend select the most suitable behavior.
"auto",
// Prioritizes execution speed over power consumption.
"high-performance",
// Prioritizes power consumption over other considerations such as execution
// speed.
"low-power",
};
dictionary MLContextOptions {
// Preferred kind of device to use.
MLDevicePreference devicePreference = "auto";
// Preference as related to power consumption.
MLPowerPreference powerPreference = "auto";
// Model format for the model loader API.
MLModelFormat modelFormat = "tflite";
// Number of threads to use.
// A value of 0 means the backend determines it automatically.
unsigned long numThreads = 0;
};
[Exposed=Window]
interface ML {
Promise<MLContext> createContext(optional MLContextOptions options = {});
};
enum MLDataType {
// "Unknown" doesn't mean "unsupported". The background can support more types
// than which are explicitly listed here (e.g., TfLite has complex numbers).
// We treat them as "unknown" to avoid exposing too many details of the
// backends from the beginning.
"unknown",
"int64",
"uint64",
"float64",
"int32",
"uint32",
"float32",
"int16",
"uint16",
"float16",
"int8",
"uint8",
"bool",
};
dictionary MLTensor {
required ArrayBufferView data;
required sequence<unsigned long> dimensions;
};
dictionary MLTensorInfo {
required DOMString name;
required MLDataType type;
required sequence<unsigned long> dimensions;
};
[SecureContext, Exposed=Window]
interface MLModel {
Promise<record<DOMString, MLTensor>> compute(record<DOMString, MLTensor> inputs);
sequence<MLTensorInfo> inputs();
sequence<MLTensorInfo> outputs();
};
[Exposed=Window]
interface MLModelLoader {
constructor(MLContext context);
Promise<MLModel> load(ArrayBuffer modelBuffer);
};
</pre>
Examples {#examples}
==================
<pre highlight="js">
// First, create an MLContext. This is consistent with the WebNN API, and we
// add two new fields, "numThreads" and "modelFormat".
const context = await navigator.ml.createContext(
{ devicePreference: "cpu",
powerPreference: "low-power",
numThreads: 0, // the default 0 means
// "decide automatically".
modelFormat: "tflite" });
// Then create the model loader using the ML context.
const loader = new MLModelLoader(context);
// In the first version, we only support loading models from ArrayBuffers. We
// believe this covers most use cases. Web developers can download the model,
// e.g., with the fetch API. We can add new "load" functions in the future if
// they are needed.
const modelUrl = 'https://path/to/model/file';
const modelBuffer = await fetch(modelUrl)
.then(response => response.arrayBuffer());
// Load the model.
const model = await loader.load(modelBuffer);
// Use the `model.compute` function to get the output of the model from some
// inputs. Example ways of using this function include:
// 1. When the model has only one input tensor, one can simply pass in the
// tensor without specifying its name (the user can still designate this
// input tensor by name if they like).
z = await model.compute({ data: new Float32Array([10]),
dimensions: [1] });
// 2. When there are multiple input tensors, the user has to designate the
// input tensors by their names.
z = await model.compute({ x: { data: new Float32Array([10]),
dimensions: [1] },
y: { data: new Float32Array([20]),
dimensions: [1] } });
// 3. The client can also specify the output tensor. This is consistent with the
// WebNN API and can be useful, e.g., when the output tensor is a GPU buffer. In
// this case, the function will return an empty promise. The dimensions of the
// specified output tensor must match the dimensions of the model's output
// tensor.
z_buffer = ml.tensor({data: new Float64Array(1),
dimensions: [1] });
await model.compute({ data: new Float32Array([10]),
dimensions: [1] },
z_buffer);
// For the output tensor(s):
// Similar to the input arguments, if there is only one output tensor, the
// `compute` function returns a single tensor in cases 1 and 2, and there is no
// need to specify the name of the output tensor in case 3. But if there are
// multiple output tensors, the output in cases 1 and 2 will be a map from
// tensor names to tensors, and in case 3, the output argument must be a map
// from tensor names to tensors too.
// For cases 1 and 2, where the actual output data is located depends on the
// context: with a CPU context, the output tensors' buffers will be RAM
// buffer(s), and with a GPU context, they will be GPU buffer(s).
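// As a non-normative sketch of the multiple-output case: assuming the model
// has two output tensors named "out1" and "out2" (hypothetical names), the
// resolved value of `compute` is a map keyed by those names.
results = await model.compute({ x: { data: new Float32Array([10]),
dimensions: [1] } });
out1 = results["out1"]; // an MLTensor: { data, dimensions }
out2 = results["out2"];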
</pre>
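The `MLModel` interface also exposes `inputs()` and `outputs()`, which describe
each of the model's input and output tensors as an `MLTensorInfo` (name, data
type, and dimensions). The following is a minimal, non-normative sketch of
inspecting the model loaded above; the printed names, types, and dimensions
depend entirely on the model.
<pre highlight="js">
// Enumerate the model's input and output tensor metadata, e.g., to check that
// the tensors passed to `compute` have the expected types and shapes.
for (const info of model.inputs()) {
  console.log("input", info.name, info.type, info.dimensions);
}
for (const info of model.outputs()) {
  console.log("output", info.name, info.type, info.dimensions);
}
</pre>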