diff --git a/ramalama/cli.py b/ramalama/cli.py
index 0e85df6a..2a525947 100644
--- a/ramalama/cli.py
+++ b/ramalama/cli.py
@@ -62,6 +62,13 @@ def init_cli():
         action="store_false",
         help="do not run RamaLama in the default container",
     )
+    parser.add_argument(
+        "--gpu",
+        dest="gpu",
+        default=False,
+        action="store_true",
+        help="offload the workload to the GPU",
+    )
     parser.add_argument(
         "--runtime",
         default="llama.cpp",
@@ -517,7 +524,7 @@ def run_container(args):
     if hasattr(args, "port"):
         conman_args += ["-p", f"{args.port}:{args.port}"]
 
-    if os.path.exists("/dev/dri"):
+    if args.gpu and (os.path.exists("/dev/dri") or sys.platform == "darwin"):
         conman_args += ["--device", "/dev/dri"]
 
     if os.path.exists("/dev/kfd"):
diff --git a/ramalama/model.py b/ramalama/model.py
index 533ce9b9..205eac9c 100644
--- a/ramalama/model.py
+++ b/ramalama/model.py
@@ -1,5 +1,6 @@
 import os
 import sys
+from pathlib import Path
 
 from ramalama.common import container_manager, exec_cmd, default_image
 
@@ -13,8 +14,6 @@ class Model:
 
     def __init__(self, model):
         self.model = model
-        if sys.platform == "darwin":
-            self.common_params += ["-ngl", "99"]
 
     def login(self, args):
         raise NotImplementedError(f"ramalama login for {self.type} not implemented")
@@ -100,6 +99,18 @@ def run(self, args):
         if not args.ARGS:
             exec_args.append("-cnv")
 
+        if args.gpu:
+            if sys.platform == "darwin":
+                # llama.cpp will default to the Metal backend on macOS, so we don't need
+                # any additional arguments.
+                pass
+            elif sys.platform == "linux" and Path("/dev/dri").exists():
+                exec_args.extend(["-ngl", "99"])
+                if "q4_0.gguf" not in symlink_path.lower():
+                    print("GPU offload was requested, but this does not appear to be a Q4_0 model; the CPU will be used instead")
+            else:
+                print("GPU offload was requested but is not available on this system")
+
         exec_cmd(exec_args, False)
 
     def serve(self, args):
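
For reviewers, a minimal standalone sketch of the GPU-offload decision this patch adds to Model.run, with the darwin/Metal, /dev/dri, and Q4_0 checks pulled into a pure helper so the branches can be exercised without a container or a GPU. The helper name gpu_offload_args and the example model path are illustrative only and are not part of the patch.

# Standalone sketch of the --gpu decision added to Model.run above.
# gpu_offload_args is a hypothetical helper, not part of this patch.
import sys
from pathlib import Path


def gpu_offload_args(gpu_requested, model_path, platform=None):
    """Return (extra llama.cpp args, optional warning) implied by --gpu."""
    platform = platform or sys.platform
    if not gpu_requested:
        return [], None
    if platform == "darwin":
        # llama.cpp uses the Metal backend by default on macOS.
        return [], None
    if platform == "linux" and Path("/dev/dri").exists():
        warning = None
        if "q4_0.gguf" not in model_path.lower():
            warning = "GPU offload was requested, but this does not appear to be a Q4_0 model"
        return ["-ngl", "99"], warning
    return [], "GPU offload was requested but is not available on this system"


# On a Linux host with /dev/dri present, a Q4_0 model gets "-ngl 99" and no warning.
if __name__ == "__main__":
    print(gpu_offload_args(True, "tinyllama.Q4_0.gguf", platform="linux"))

A small test that monkeypatches Path.exists and the platform argument could cover all three branches without real hardware.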