Skip to content

Commit

Permalink
Add a CLI option to enable GPU offload
Browse files Browse the repository at this point in the history
Add a "--gpu" option that allows users to request the workload to be
offloaded to the GPU. This works natively on macOS using Metal and
in containers using Vulkan with llama.cpp's Kompute backend.

Signed-off-by: Sergio Lopez <[email protected]>
  • Loading branch information
slp committed Oct 9, 2024
1 parent 96a0efb commit 6afae3e
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 3 deletions.
9 changes: 8 additions & 1 deletion ramalama/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,13 @@ def init_cli():
action="store_false",
help="do not run RamaLama in the default container",
)
parser.add_argument(
"--gpu",
dest="gpu",
default=False,
action="store_true",
help="offload the workload to the GPU",
)
parser.add_argument(
"--runtime",
default="llama.cpp",
Expand Down Expand Up @@ -517,7 +524,7 @@ def run_container(args):
if hasattr(args, "port"):
conman_args += ["-p", f"{args.port}:{args.port}"]

if os.path.exists("/dev/dri"):
if args.gpu and (os.path.exists("/dev/dri") or sys.platform == "darwin"):
conman_args += ["--device", "/dev/dri"]

if os.path.exists("/dev/kfd"):
Expand Down
15 changes: 13 additions & 2 deletions ramalama/model.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import sys
from pathlib import Path
from ramalama.common import container_manager, exec_cmd, default_image


Expand All @@ -13,8 +14,6 @@ class Model:

def __init__(self, model):
self.model = model
if sys.platform == "darwin":
self.common_params += ["-ngl", "99"]

def login(self, args):
raise NotImplementedError(f"ramalama login for {self.type} not implemented")
Expand Down Expand Up @@ -100,6 +99,18 @@ def run(self, args):
if not args.ARGS:
exec_args.append("-cnv")

if args.gpu:
if sys.platform == "darwin":
# llama.cpp will default to the Metal backend on macOS, so we don't need
# any additional arguments.
pass
elif sys.platform == "linux" and Path("/dev/dri").exists():
exec_args.extend(["-ngl", "99"])
if not "q4_0.gguf" in symlink_path.lower():
print("GPU offload requested but this doesn't seem to be a Q4_0 model, CPU will be used instead")
else:
print("GPU offload was requested but is not available on this system")

exec_cmd(exec_args, False)

def serve(self, args):
Expand Down

0 comments on commit 6afae3e

Please sign in to comment.