🧞 Fix genai exporter arguments bug and add Genai phi2 example #1061

Merged · 3 commits · Apr 9, 2024

Changes from 2 commits
31 changes: 19 additions & 12 deletions examples/llama2/README.md
````diff
@@ -54,23 +54,30 @@ Run the following command to execute the workflow:
 python -m olive.workflows.run --config lamma2_genai.json
 ```
 Snippet below shows an example run of generated llama2 model.
-```
-import onnxruntime_genai as ortgenai
+```python
+import onnxruntime_genai as og

-model = ortgenai.Model("llama2-7b-chat-int4-cpu", ortgenai.DeviceType.CPU)
-tokenizer = model.create_tokenizer()
+model = og.Model("model_path")
+tokenizer = og.Tokenizer(model)
+tokenizer_stream = tokenizer.create_stream()
+
+prompt = '''def print_prime(n):
+"""
+Print all primes between 1 and n
+"""'''

-while True:
-    prompt = input("Input: ")
-    input_tokens = tokenizer.encode(prompt)
+tokens = tokenizer.encode(prompt)

-    params = ortgenai.GeneratorParams(model)
-    params.max_length = 64
-    params.input_ids = input_tokens
+params = og.GeneratorParams(model)
+params.set_search_options({"max_length":200})
+params.input_ids = tokens

-    output_tokens = model.generate(params)[0]
+output_tokens = model.generate(params)

-    print("Output: ", tokenizer.decode(output_tokens))
+text = tokenizer.decode(output_tokens)
+
+print("Output:")
+print(text)
 ```
````
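If you want to keep the interactive prompt loop from the removed snippet while using the new `onnxruntime_genai` calls, a minimal sketch (illustrative only, reusing the same API as the new snippet above; `"model_path"` is a placeholder for the exported model folder) could look like:

```python
import onnxruntime_genai as og

model = og.Model("model_path")  # folder produced by the llama2 GenAI workflow
tokenizer = og.Tokenizer(model)

while True:
    prompt = input("Input: ")
    params = og.GeneratorParams(model)
    params.set_search_options({"max_length": 200})
    params.input_ids = tokenizer.encode(prompt)
    output_tokens = model.generate(params)
    print("Output:", tokenizer.decode(output_tokens))
```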

### Quantization using GPTQ and text generation using ONNX Runtime with Optimum
72 changes: 72 additions & 0 deletions examples/phi2/README.md
@@ -49,6 +49,78 @@ cuda_int4
python phi2.py --model_type cuda_int4
```

### GenAI Optimization
To optimize with ONNX Runtime GenAI, follow the build and installation instructions [here](https://github.com/microsoft/onnxruntime-genai).
Run the following command to execute the workflow:
```bash
python -m olive.workflows.run --config phi2_genai.json
```
The `phi2_genai.json` config generates optimized models for both the `cpu_int4` and `cuda_int4` model types, since onnxruntime-gpu supports both the CPU and CUDA execution providers.
If you only want the CPU or the CUDA model, modify the config file by removing the unwanted execution providers:
```json
# CPU
"accelerators": [
    {
        "device": "CPU",
        "execution_providers": [
            "CPUExecutionProvider"
        ]
    }
]
# CPU: same effect as above, since onnxruntime-gpu also supports the CPU EP
"accelerators": [
    {
        "device": "GPU",
        "execution_providers": [
            "CPUExecutionProvider"
        ]
    }
]
# CUDA
"accelerators": [
    {
        "device": "GPU",
        "execution_providers": [
            "CUDAExecutionProvider"
        ]
    }
]
```

Alternatively, you can use `phi2.py` to generate the optimized models separately by running the following commands:
```bash
python phi2.py --model_type cpu_int4 --genai_optimization
python phi2.py --model_type cuda_int4 --genai_optimization
```

The snippet below shows an example run of the generated phi2 model.
```python
import onnxruntime_genai as og

model = og.Model("model_path")
tokenizer = og.Tokenizer(model)
tokenizer_stream = tokenizer.create_stream()

prompt = '''def print_prime(n):
"""
Print all primes between 1 and n
"""'''

tokens = tokenizer.encode(prompt)

params = og.GeneratorParams(model)
params.set_search_options({"max_length":200})
params.input_ids = tokens

output_tokens = model.generate(params)

text = tokenizer.decode(output_tokens)

print("Output:")
print(text)
```
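The snippet above creates `tokenizer_stream` but never uses it. With the generator-level API, tokens can be printed as they are produced instead of waiting for `model.generate` to finish. The sketch below assumes the `og.Generator` / `compute_logits` / `generate_next_token` interface that onnxruntime-genai shipped around this time, so treat it as illustrative rather than the exact API:

```python
import onnxruntime_genai as og

model = og.Model("model_path")
tokenizer = og.Tokenizer(model)
tokenizer_stream = tokenizer.create_stream()

params = og.GeneratorParams(model)
params.set_search_options({"max_length": 200})
params.input_ids = tokenizer.encode("def print_prime(n):")

generator = og.Generator(model, params)
while not generator.is_done():
    generator.compute_logits()
    generator.generate_next_token()
    # decode and print each new token as soon as it is generated
    print(tokenizer_stream.decode(generator.get_next_tokens()[0]), end="", flush=True)
print()
```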

### Optimum Optimization
The above commands generate optimized models for the given `model_type` and save them in the `phi2` folder. These optimized models can be wrapped by ONNX Runtime for inference.
In addition, for a better generation experience, this example also lets you use [Optimum](https://huggingface.co/docs/optimum/v1.2.1/en/onnxruntime/modeling_ort) to generate optimized models.
You can then call `model.generate` to run inference with the optimized model.
163 changes: 94 additions & 69 deletions examples/phi2/phi2.py
```diff
@@ -80,6 +80,11 @@ def get_args(raw_args):
         action="store_true",
         help="Use optimum optimization",
     )
+    parser.add_argument(
+        "--genai_optimization",
+        action="store_true",
+        help="Use GenAI optimization",
+    )
     parser.add_argument(
         "--export_mlflow_format",
         action="store_true",
```
@@ -115,84 +120,104 @@ def main(raw_args=None):

This hunk reworks the body of `main()`: a new `args.genai_optimization` branch is added, the pre-existing optimization path moves under `else:` (with the `args.model_type` checks switched to the new `model_type` variable), the generated config is written to `phi2_genai_{device}.json` or `phi2_{model_type}.json` instead of `phi2_optimize.json`, and `olive_run` is invoked on that generated file. In outline, the updated `main()` does the following:

```python
    if not args.model_type and not args.finetune_method:
        raise ValueError("Please specify either model_type or finetune_method")

    model_type = str(args.model_type) or ""

    if args.genai_optimization:
        json_file_template = "phi2_genai.json"
        with open(json_file_template) as f:
            template_json = json.load(f)
        ep_str, precision = model_type.split("_")
        device = "GPU" if ep_str == "cuda" else "CPU"
        template_json["passes"]["genai_exporter"]["config"]["precision"] = precision
        template_json["systems"]["local_system"]["config"]["accelerators"] = [
            {"device": device, "execution_providers": [DEVICE_TO_EP[device.lower()]]}
        ]

        new_json_file = f"phi2_genai_{device.lower()}.json"
        with open(new_json_file, "w") as f:
            json.dump(template_json, f, indent=4)

    else:
        if not args.optimum_optimization and version.parse(OrtVersion) < version.parse("1.18.0"):
            # Check if onnxruntime version is supported
            # in linux, it requires the
            # 1. model_type as `phi`
            # 2. "optimization_options": {"attention_op_type": "MultiHeadAttention"}
            # in windows, it requires the
            # 1. model_type as `gpt2`
            # 2. "optimization_options": {"attention_op_type": "MultiHeadAttention"}
            # and `phi` and `MultiHeadAttention` requires ort-nightly version >= 1.18.0
            raise ValueError(
                "Please use onnxruntime>=1.18.0 for phi2 optimization in Linux, you can refer to "
                "https://onnxruntime.ai/docs/install/#inference-install-table-for-all-languages "
                "for ort-nightly installation. If you are optimizing phi2 model in GPU, only cuda11 "
                "is supported in onnxruntime>=1.18.0"
            )

        json_file_template = "phi2_optimize_template.json"
        with open(json_file_template) as f:
            template_json = json.load(f)

        if platform.system() == "Windows":
            legacy_optimization_setting(template_json)

        # add pass flows
        pass_flows = [[]]
        if args.finetune_method:
            pass_flows[0].append(args.finetune_method)
            template_json["systems"]["local_system"]["config"]["accelerators"][0]["device"] = "gpu"
            # torch fine tuning does not require execution provider, just set it to CUDAExecutionProvider
            template_json["systems"]["local_system"]["config"]["accelerators"][0]["execution_providers"] = [
                "CUDAExecutionProvider"
            ]
        if model_type:
            pass_flows[0].extend(SUPPORTED_WORKFLOWS[model_type][0])
            template_json["pass_flows"] = pass_flows
            if args.optimum_optimization:
                legacy_optimization_setting(template_json)
                for pass_flow in template_json["pass_flows"]:
                    pass_flow[0] = "optimum_convert"
                    if "perf_tuning" in pass_flow:
                        pass_flow.remove("perf_tuning")

            if "cuda" in model_type:
                template_json["systems"]["local_system"]["config"]["accelerators"][0]["device"] = "gpu"
                template_json["systems"]["local_system"]["config"]["accelerators"][0]["execution_providers"] = [
                    "CUDAExecutionProvider"
                ]
            if "cpu" in model_type:
                # no need to set device for CPU since default it is CPU
                template_json["systems"]["local_system"]["config"]["accelerators"][0]["execution_providers"] = [
                    "CPUExecutionProvider"
                ]
        if args.optimum_optimization or (args.finetune_method and not model_type):
            # set evaluator as None:
            template_json["engine"]["evaluate_input_model"] = False
            template_json["engine"]["evaluator"] = None
        used_passes = {pass_name for pass_flow in pass_flows for pass_name in pass_flow}
        for pass_name in list(template_json["passes"].keys()):
            if pass_name not in used_passes:
                del template_json["passes"][pass_name]
                continue

        if args.export_mlflow_format:
            template_json["engine"]["packaging_config"] = [
                {
                    "type": "Zipfile",
                    "name": "mlflow_model",
                    "config": {"export_in_mlflow_format": True},
                }
            ]

        new_json_file = f"phi2_{model_type}.json"
        with open(new_json_file, "w") as f:
            json.dump(template_json, f, indent=4)

    # only evaluate onnx generate model
    footprints = olive_run(new_json_file)  # pylint: disable=not-callable
    if args.genai_optimization and args.inference:
        print("GenAI optimization does not support inference")  # noqa: T201
    elif model_type:
        output_model_path = get_output_model_path(footprints)
        if args.inference and model_type in SUPPORTED_INFERENCE_CONFIG:
            from generate import run as generate_run
```
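`DEVICE_TO_EP` is referenced by the new `genai_optimization` branch but is defined elsewhere in `phi2.py`, outside this diff. Based on how it is used above, it presumably maps a lowercase device name to an execution provider, roughly like this sketch (an assumption, not part of the diff):

```python
# Hypothetical reconstruction of the mapping used by the genai_optimization branch.
DEVICE_TO_EP = {
    "cpu": "CPUExecutionProvider",
    "gpu": "CUDAExecutionProvider",
}
```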
42 changes: 42 additions & 0 deletions examples/phi2/phi2_genai.json
@@ -0,0 +1,42 @@
```json
{
    "input_model":{
        "type": "PyTorchModel",
        "config": {
            "hf_config": {
                "model_name": "microsoft/phi-2",
                "task": "text-generation"
            }
        }
    },
    "systems": {
        "local_system": {
            "type": "LocalSystem",
            "config": {
                "accelerators": [
                    {
                        "device": "GPU",
                        "execution_providers": [
                            "CPUExecutionProvider",
                            "CUDAExecutionProvider"
                        ]
                    }
                ]
            }
        }
    },
    "passes": {
        "genai_exporter": {
            "type": "GenAIModelExporter",
            "config": {
                "precision": "int4"
            }
        }
    },
    "engine": {
        "log_severity_level": 0,
        "host": "local_system",
        "target": "local_system",
        "cache_dir": "cache",
        "output_dir": "models/genai"
    }
}
```
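Once this workflow finishes, the exported model is written under the `output_dir` from the engine section above (`models/genai`). A minimal sketch of pointing the earlier snippet at that output, assuming the exporter places the GenAI model files directly in that folder (the exact subfolder layout is not shown in this PR):

```python
import onnxruntime_genai as og

# "models/genai" is the engine output_dir from phi2_genai.json above; adjust the path
# if the exporter nests the model in a subfolder (an assumption, not shown in this diff).
model = og.Model("models/genai")
tokenizer = og.Tokenizer(model)
```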