This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

[Neural Speed] Improvements to run.py script #87

Merged
merged 7 commits on Feb 21, 2024
scripts/convert.py: 13 additions & 1 deletion
@@ -13,8 +13,10 @@
 # limitations under the License.
 
 import argparse
+import sys
 from pathlib import Path
 from typing import List, Optional
+from huggingface_hub import snapshot_download
 from neural_speed.convert import convert_model
 
 def main(args_in: Optional[List[str]] = None) -> None:
@@ -25,14 +27,24 @@ def main(args_in: Optional[List[str]] = None) -> None:
         help="output format, default: f32",
         default="f32",
     )
+    parser.add_argument(
+        "--token",
+        type=str,
+        help="Access token ID for models that require it (LLaMa2, etc.)",
+    )
     parser.add_argument("--outfile", type=Path, required=True, help="path to write to")
     parser.add_argument("model", type=Path, help="directory containing model file or model id")
     args = parser.parse_args(args_in)
 
     if args.model.exists():
         dir_model = args.model.as_posix()
     else:
-        dir_model = args.model
+        try:
+            dir_model = snapshot_download(repo_id=str(args.model), resume_download=True, token=args.token)
+        except Exception as e:
+            if e.response.status_code == 401:
+                print("You are required to input an access token ID for {}, please add it in option --token or download model weights locally".format(args.model))
+            sys.exit(f"{e}")
 
     convert_model(dir_model, args.outfile, args.outtype)
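For reference, the gated-download pattern this hunk adds (it reappears in scripts/run.py below) can be sketched as a standalone helper. This is a minimal illustration, not code from the PR: the helper name is made up, and the 401 check is hardened with getattr, since a bare Exception (a network failure, a mistyped repo id) may not carry a response attribute at all.

import sys
from typing import Optional

from huggingface_hub import snapshot_download

def resolve_model(model_id: str, token: Optional[str] = None) -> str:
    """Return a local directory for model_id, downloading from the Hub if needed."""
    try:
        return snapshot_download(repo_id=model_id, token=token)
    except Exception as e:
        # Gated repos (e.g. LLaMa2) answer 401 without a valid token; other
        # failures may not expose a `response` attribute, hence getattr.
        response = getattr(e, "response", None)
        if response is not None and response.status_code == 401:
            print(f"An access token is required for {model_id}; "
                  "pass it with --token or download the weights locally.")
        sys.exit(str(e))

# Hypothetical usage; the repo id and token are placeholders:
# local_dir = resolve_model("meta-llama/Llama-2-7b-hf", token="hf_...")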
scripts/inference.py: 3 additions & 6 deletions
@@ -133,12 +133,9 @@ def main(args_in: Optional[List[str]] = None) -> None:
     if is_win():
         path = Path(args.build_dir, "./Bin/Release/run_{}.exe".format(model_name))
     else:
-        if args.one_click_run == "True":
-            import neural_speed
-            package_path = os.path.dirname(neural_speed.__file__)
-            path = Path(package_path, "./run_{}".format(model_name))
-        else:
-            path = Path(args.build_dir, "./bin/run_{}".format(model_name))
+        import neural_speed
+        package_path = os.path.dirname(neural_speed.__file__)
+        path = Path(package_path, "./run_{}".format(model_name))
 
     if not path.exists():
         print("Please build graph first or select the correct model name.")
scripts/quantize.py: 3 additions & 6 deletions
@@ -103,12 +103,9 @@ def main(args_in: Optional[List[str]] = None) -> None:
     if is_win():
         path = Path(args.build_dir, "./Bin/Release/quant_{}.exe".format(model_name))
     else:
-        if args.one_click_run == "True":
-            import neural_speed
-            package_path = os.path.dirname(neural_speed.__file__)
-            path = Path(package_path, "./quant_{}".format(model_name))
-        else:
-            path = Path(args.build_dir, "./bin/quant_{}".format(model_name))
+        import neural_speed
+        package_path = os.path.dirname(neural_speed.__file__)
+        path = Path(package_path, "./quant_{}".format(model_name))
     if not path.exists():
         print(path)
         print("Please build graph first or select the correct model name.")
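With the --one_click_run branch removed, both inference.py and quantize.py now always resolve their native binary from the installed neural_speed package on non-Windows platforms. A minimal sketch of the shared resolution logic, written here for illustration (the helper name is mine, and is_win stands in for the scripts' own platform check):

import os
import sys
from pathlib import Path

def is_win() -> bool:
    # Stand-in for the scripts' platform check.
    return sys.platform.startswith("win")

def binary_path(prefix: str, model_name: str, build_dir: Path) -> Path:
    # Windows builds still live under the build tree; elsewhere the binary
    # ships alongside the installed neural_speed package.
    if is_win():
        return Path(build_dir, "./Bin/Release/{}_{}.exe".format(prefix, model_name))
    import neural_speed
    package_path = os.path.dirname(neural_speed.__file__)
    return Path(package_path, "./{}_{}".format(prefix, model_name))

# Hypothetical usage:
# path = binary_path("run", "llama", Path("../build"))
# if not path.exists():
#     print("Please build graph first or select the correct model name.")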
scripts/run.py: 17 additions & 5 deletions
@@ -18,6 +18,7 @@
 from typing import List, Optional
 from transformers import AutoConfig
 import subprocess
+from huggingface_hub import snapshot_download
 
 model_maps = {"gpt_neox": "gptneox", "gpt_bigcode": "starcoder"}
 build_path = Path(Path(__file__).parent.absolute(), "../build/")
@@ -146,13 +147,24 @@ def main(args_in: Optional[List[str]] = None) -> None:
         action="store_true",
         help="Use ring-buffer and thus do not re-computing after reaching ctx_size (default: False)",
     )
+    parser.add_argument(
+        "--token",
+        type=str,
+        help="Access token ID for models that require it (LLaMa2, etc.)",
+    )
 
     args = parser.parse_args(args_in)
 
     if args.model.exists():
         dir_model = args.model.as_posix()
     else:
-        dir_model = args.model
+        try:
+            dir_model = snapshot_download(repo_id=str(args.model), resume_download=True, token=args.token)
+        # Handles missing token ID for gated models
+        except Exception as e:
+            if e.response.status_code == 401:
+                print("You are required to input an access token ID for {}, please add it in option --token or download model weights locally".format(args.model))
+            sys.exit(f"{e}")
 
     parent_path = Path(__file__).parent.absolute()
     config = AutoConfig.from_pretrained(dir_model)
@@ -166,8 +178,8 @@ def main(args_in: Optional[List[str]] = None) -> None:
     convert_cmd = ["python", path]
     convert_cmd.extend(["--outfile", Path(work_path, "ne_{}_f32.bin".format(model_type))])
    convert_cmd.extend(["--outtype", "f32"])
-    convert_cmd.append(args.model)
-    print("convert model ...")
+    convert_cmd.append(dir_model)
+    print("Convert model ...")
     subprocess.run(convert_cmd)
 
     # 2. quantize
@@ -186,7 +198,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         quant_cmd.extend(["--use_ggml"])
     quant_cmd.extend(["--build_dir", args.build_dir])
     quant_cmd.extend(["--one_click_run", "True"])
-    print("quantize model ...")
+    print("Quantize model ...")
     subprocess.run(quant_cmd)
 
     # 3. inference
@@ -208,7 +220,7 @@ def main(args_in: Optional[List[str]] = None) -> None:
         infer_cmd.extend(["--shift-roped-k"])
     if (model_type == "baichuan" or model_type == "qwen"):
         infer_cmd.extend(["--tokenizer", dir_model])
-    print("inferce model ...")
+    print("Inference model ...")
     subprocess.run(infer_cmd)
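With these changes, run.py accepts either a local model directory or a Hugging Face model id and drives the same convert, quantize, and inference pipeline. A hypothetical end-to-end invocation; the model id and token are placeholders, and only the positional model argument and --token are taken from the diff above:

import subprocess

# Placeholders: substitute a real Hub model id (or a local directory) and token.
cmd = [
    "python", "scripts/run.py",
    "meta-llama/Llama-2-7b-hf",  # gated model id; a local path also works
    "--token", "hf_...",         # omit for ungated models or local weights
]
subprocess.run(cmd)  # convert -> quantize -> inference, as orchestrated above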