diff --git a/ramalama/cli.py b/ramalama/cli.py
index 0e85df6a..2a525947 100644
--- a/ramalama/cli.py
+++ b/ramalama/cli.py
@@ -62,6 +62,13 @@ def init_cli():
         action="store_false",
         help="do not run RamaLama in the default container",
     )
+    parser.add_argument(
+        "--gpu",
+        dest="gpu",
+        default=False,
+        action="store_true",
+        help="offload the workload to the GPU",
+    )
     parser.add_argument(
         "--runtime",
         default="llama.cpp",
@@ -517,7 +524,7 @@ def run_container(args):
     if hasattr(args, "port"):
         conman_args += ["-p", f"{args.port}:{args.port}"]
 
-    if os.path.exists("/dev/dri"):
+    if args.gpu and (os.path.exists("/dev/dri") or sys.platform == "darwin"):
         conman_args += ["--device", "/dev/dri"]
 
     if os.path.exists("/dev/kfd"):
diff --git a/ramalama/model.py b/ramalama/model.py
index 533ce9b9..205eac9c 100644
--- a/ramalama/model.py
+++ b/ramalama/model.py
@@ -1,5 +1,6 @@
 import os
 import sys
+from pathlib import Path
 
 from ramalama.common import container_manager, exec_cmd, default_image
 
@@ -13,8 +14,6 @@ class Model:
 
     def __init__(self, model):
         self.model = model
-        if sys.platform == "darwin":
-            self.common_params += ["-ngl", "99"]
 
     def login(self, args):
         raise NotImplementedError(f"ramalama login for {self.type} not implemented")
@@ -100,6 +99,18 @@ def run(self, args):
         if not args.ARGS:
             exec_args.append("-cnv")
 
+        if args.gpu:
+            if sys.platform == "darwin":
+                # llama.cpp will default to the Metal backend on macOS, so we don't need
+                # any additional arguments.
+                pass
+            elif sys.platform == "linux" and Path("/dev/dri").exists():
+                exec_args.extend(["-ngl", "99"])
+                if "q4_0.gguf" not in symlink_path.lower():
+                    print("GPU offload was requested, but this does not appear to be a Q4_0 model; the CPU will be used instead")
+            else:
+                print("GPU offload was requested but is not available on this system")
+
         exec_cmd(exec_args, False)
 
     def serve(self, args):
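
For reviewers, a minimal standalone sketch of the GPU-offload decision this patch adds to Model.run, with the darwin/Metal, /dev/dri, and Q4_0 checks pulled into a pure helper so the branches can be exercised without a container or a GPU. The helper name gpu_offload_args and the example model path are illustrative only and are not part of the patch.

# Standalone sketch of the --gpu decision added to Model.run above.
# gpu_offload_args is a hypothetical helper, not part of this patch.
import sys
from pathlib import Path


def gpu_offload_args(gpu_requested, model_path, platform=None):
    """Return (extra llama.cpp args, optional warning) implied by --gpu."""
    platform = platform or sys.platform
    if not gpu_requested:
        return [], None
    if platform == "darwin":
        # llama.cpp uses the Metal backend by default on macOS.
        return [], None
    if platform == "linux" and Path("/dev/dri").exists():
        warning = None
        if "q4_0.gguf" not in model_path.lower():
            warning = "GPU offload was requested, but this does not appear to be a Q4_0 model"
        return ["-ngl", "99"], warning
    return [], "GPU offload was requested but is not available on this system"


# On a Linux host with /dev/dri present, a Q4_0 model gets "-ngl 99" and no warning.
if __name__ == "__main__":
    print(gpu_offload_args(True, "tinyllama.Q4_0.gguf", platform="linux"))

A small test that monkeypatches Path.exists and the platform argument could cover all three branches without real hardware.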