diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 000000000000..c02f7c37fb1b
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,18 @@
+/.git
+/.github
+
+/CONTRIBUTING
+/docs
+/mac_inference
+/MAINTAINERS
+/notebooks
+/scripts
+/tests
+
+# Python
+/build
+/.tox
+*.egg-info
+*.dist-info
+__pycache__/
+*.pyc
diff --git a/.spellcheck-en-custom.txt b/.spellcheck-en-custom.txt
index 3209032e8809..5dc92bebf2bc 100644
--- a/.spellcheck-en-custom.txt
+++ b/.spellcheck-en-custom.txt
@@ -19,6 +19,8 @@ PEFT
 PlantUML
 PyPI
 Quantizing
+RDNA
+ROCm
 cli
 compositional
 cpp
@@ -26,6 +28,7 @@ dataset
 dev
 ditaa
 dr
+env
 gpu
 lora
 orchestrator
diff --git a/containers/bin/debug-llama b/containers/bin/debug-llama
new file mode 100755
index 000000000000..f98dd8ad419b
--- /dev/null
+++ b/containers/bin/debug-llama
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import sys
+import time
+
+import llama_cpp
+
+PROMPT = "Q: What is the answer to the ultimate question of life, the universe, and everything? A:"
+
+parser = argparse.ArgumentParser("test llama GPU/CPU")
+parser.add_argument("--model", default=None)
+parser.add_argument("--device", choices=["cpu", "gpu"], default="gpu")
+parser.add_argument("--quiet", action="store_false", dest="verbose")
+parser.add_argument("--repeat", type=int, default=5)
+parser.add_argument("--max-tokens", type=int, default=0)
+parser.add_argument("--prompt", default=PROMPT)
+
+
+def main(argv=None):
+    args = parser.parse_args(argv)
+    print(f"llama_cpp version: {llama_cpp.__version__}")
+    print(f"llama supports gpu offload: {llama_cpp.llama_supports_gpu_offload()}")
+    # -1: offload all layers to GPU
+    n_gpu_layers = -1 if args.device == "gpu" else 0
+    print(f"n_gpu_layers: {n_gpu_layers} ({args.device})")
+
+    if args.model is None:
+        from cli.config import read_config
+
+        cfg = read_config()
+        args.model = cfg.serve.model_path
+
+    if not os.path.isfile(args.model):
+        print(f"Model file '{args.model}' is missing.")
+        print("Run 'lab init' and 'lab download'.")
+        sys.exit(2)
+
+    start = time.monotonic()
+    llm = llama_cpp.Llama(
+        model_path=args.model,
+        n_gpu_layers=n_gpu_layers,
+        verbose=args.verbose,
+    )
+    t = time.monotonic() - start
+    print(f"Loaded model in {t:.2f}sec")
+
+    print()
+    print(
+        f"Prompt: '{args.prompt}' (repeats: {args.repeat}, max_tokens: {args.max_tokens})"
+    )
+    start = time.monotonic()
+    for _ in range(args.repeat):
+        result = llm(args.prompt, max_tokens=args.max_tokens)
+        for choice in result["choices"]:
+            print(choice["text"].strip())
+    t = time.monotonic() - start
+    print(f"Got {args.repeat} result(s) in {t:.2f}sec")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/containers/bin/debug-pytorch b/containers/bin/debug-pytorch
new file mode 100755
index 000000000000..3e23f9aa92cd
--- /dev/null
+++ b/containers/bin/debug-pytorch
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+# based on Steffen Röcker's gist
+# https://gist.github.com/sroecker/f52646023d45ab4e281ddee05d6ef2a5
+import os
+
+import torch
+
+print(f"pytorch version: {torch.__version__}")
+print(f"HSA_OVERRIDE_GFX_VERSION={os.environ.get('HSA_OVERRIDE_GFX_VERSION')}")
+print(f"HIP_VISIBLE_DEVICES={os.environ.get('HIP_VISIBLE_DEVICES')}")
+print(f"GPU available: {torch.cuda.is_available()}")
+print(f"NVidia CUDA version: {torch.version.cuda or 'n/a'}")
+print(f"AMD ROCm HIP version: {torch.version.hip or 'n/a'}")
+print()
+if torch.cuda.is_available():
+    print(f"device count: {torch.cuda.device_count()}")
+    print(f"current device: {torch.cuda.current_device()}")
+    for idx in range(torch.cuda.device_count()):
+        print(f" device {idx}: {torch.cuda.get_device_name(idx)}")
+    device = torch.device("cuda", torch.cuda.current_device())
+else:
+    device = torch.device("cpu")
+
+print(f"Testing with {device}")
+print()
+print("small matmul")
+a = torch.rand(3, 3).to(device)
+b = torch.rand(3, 3).to(device)
+res = torch.matmul(a, b)
+print(res)
+print(res.size())
+
+print()
+print("larger matmul")
+a = torch.rand(1280, 1280).to(device)
+b = torch.rand(1280, 1280).to(device)
+res = torch.matmul(a, b)
+print(res)
+print(res.size())
diff --git a/containers/rocm/Containerfile b/containers/rocm/Containerfile
new file mode 100644
index 000000000000..7cd13936db86
--- /dev/null
+++ b/containers/rocm/Containerfile
@@ -0,0 +1,98 @@
+# Christian Heimes
+# Based on Zack Zlotnik's container file
+
+# runtime container has libraries, CLI tools, and virtual env
+FROM registry.fedoraproject.org/fedora-toolbox:40 AS runtime
+# args and env (default to gfx1100, GFX level 11.0.0, first GPU only)
+ARG AMDGPU_ARCH=gfx1100
+ARG HSA_OVERRIDE_GFX_VERSION=11.0.0
+ARG HIP_VISIBLE_DEVICES=0
+ARG PYTORCH_ROCM_VERSION=5.7
+ENV AMDGPU_ARCH="${AMDGPU_ARCH}"
+ENV HIP_VISIBLE_DEVICES="${HIP_VISIBLE_DEVICES}"
+ENV HSA_OVERRIDE_GFX_VERSION="${HSA_OVERRIDE_GFX_VERSION}"
+ENV PYTORCH_ROCM_VERSION="${PYTORCH_ROCM_VERSION}"
+# runtime dependencies
+COPY containers/rocm/remove-gfx.sh /tmp/
+RUN --mount=type=cache,target=/var/cache/dnf,z \
+    dnf install -y --nodocs --setopt=install_weak_deps=False --setopt=keepcache=True \
+        rocm-runtime hipblas hiprand hipsparse lld-libs python3-pip nvtop radeontop make git gh && \
+    /tmp/remove-gfx.sh
+# virtual env, umask 0000 to allow end-user to write to venv later
+ENV VIRTUAL_ENV="/opt/rocm-venv"
+RUN umask 0000 && \
+    python3 -m venv ${VIRTUAL_ENV}
+ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
+ENV PS1="(rocm-venv) ${PS1}"
+# additional helpers to debug torch and llama
+COPY containers/bin/debug-* ${VIRTUAL_ENV}/bin
+
+
+# build env contains compilers and build dependencies
+FROM runtime AS buildenv
+RUN --mount=type=cache,target=/var/cache/dnf,z \
+    dnf install -y --nodocs --setopt=keepcache=True \
+        llvm compiler-rt clang-tools-extra lld python3-devel cmake ninja-build gcc \
+        rocblas-devel hip-devel hipblas-devel rocprim-devel rocthrust-devel hipsparse-devel hipcub-devel hiprand-devel
+
+
+FROM buildenv AS pytorch
+COPY requirements.txt ${VIRTUAL_ENV}/
+# pip constraint does not support optional dependencies.
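+# (the sed below strips extras markers such as "name[extra]" so the result is a plain constraints file)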
+RUN sed 's/\[.*\]//' ${VIRTUAL_ENV}/requirements.txt > ${VIRTUAL_ENV}/constraints.txt
+# chcon to work around pip's SELinux context shenanigans
+RUN --mount=type=cache,target=/root/.cache/pip,z \
+    umask 0000 && \
+    ${VIRTUAL_ENV}/bin/pip install torch --index-url https://download.pytorch.org/whl/rocm${PYTORCH_ROCM_VERSION} && \
+    /tmp/remove-gfx.sh && \
+    $(chcon -R -l s0 /root/.cache/pip 2>/dev/null || true)
+
+FROM pytorch AS llama
+# remove cached wheel to force rebuild
+RUN --mount=type=cache,target=/root/.cache/pip,z \
+    pip cache remove llama_cpp_python && \
+    umask 0000 && \
+    CMAKE_ARGS="-DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=/usr/bin/clang -DCMAKE_CXX_COMPILER=/usr/bin/clang++ -DAMDGPU_TARGETS=${AMDGPU_ARCH}" \
+    FORCE_CMAKE=1 \
+    ${VIRTUAL_ENV}/bin/pip install -c ${VIRTUAL_ENV}/constraints.txt llama-cpp-python && \
+    $(chcon -R -l s0 /root/.cache/pip 2>/dev/null || true)
+
+
+FROM llama AS bitsandbytes
+RUN git clone --depth 1 -b rocm https://github.com/arlo-phoenix/bitsandbytes-rocm-5.6.git /tmp/bitsandbytes
+RUN git clone --depth 1 -b rocm-6.0.2 https://github.com/ROCm/hipBLASLt /tmp/hipblaslt
+RUN mkdir -p /tmp/bitsandbytes/include/hipblaslt && \
+    echo -e '#pragma once\n#ifndef HIPBLASLT_EXPORT\n#define HIPBLASLT_EXPORT\n#endif' > /tmp/bitsandbytes/include/hipblaslt/hipblaslt-export.h && \
+    touch /tmp/bitsandbytes/include/hipblaslt/hipblaslt-version.h && \
+    cp /tmp/hipblaslt/library/include/* /tmp/bitsandbytes/include/hipblaslt/
+RUN cd /tmp/bitsandbytes && \
+    make hip ROCM_TARGET="${AMDGPU_ARCH}" && \
+    umask 0000 && \
+    pip install -c ${VIRTUAL_ENV}/constraints.txt . && \
+    $(chcon -R -l s0 /root/.cache/pip 2>/dev/null || true)
+
+
+# install from requirements.txt last. pip does not override installed
+# packages unless there is a version conflict.
+FROM bitsandbytes AS pip-install
+RUN --mount=type=cache,target=/root/.cache/pip,z \
+    umask 0000 && \
+    ${VIRTUAL_ENV}/bin/pip install -r ${VIRTUAL_ENV}/requirements.txt && \
+    $(chcon -R -l s0 /root/.cache/pip 2>/dev/null || true)
+# debug and self-test
+RUN export
+RUN pip list
+# the attribute error from bitsandbytes is harmless
+RUN python3 -Wignore -c 'import llama_cpp, torch, bitsandbytes' 2>&1 | grep -v NoneType
+RUN find ${VIRTUAL_ENV} -name __pycache__ | xargs rm -rf
+
+
+# create final image from base runtime, copy virtual env into final stage
+FROM runtime AS final
+COPY --from=pip-install /opt/rocm-venv/lib/python3.12/site-packages /opt/rocm-venv/lib/python3.12/site-packages
+COPY --from=pip-install /opt/rocm-venv/bin /opt/rocm-venv/bin
+LABEL com.github.containers.toolbox="true" \
+      name="instructlab-rocm-base-${AMDGPU_ARCH}" \
+      usage="This image is meant to be used with the toolbox(1) command" \
+      summary="PyTorch, llama.cpp, and instruct-lab dependencies for AMD ROCm GPU ${AMDGPU_ARCH}" \
+      maintainer="Christian Heimes"
diff --git a/containers/rocm/README.md b/containers/rocm/README.md
new file mode 100644
index 000000000000..b60e1c853184
--- /dev/null
+++ b/containers/rocm/README.md
@@ -0,0 +1,80 @@
+# instruct-lab toolbox container for AMD ROCm GPUs
+
+The ROCm container file is designed for AMD GPUs with the RDNA3 architecture (`gfx1100`). The container can be built for RDNA2 (`gfx1030`) and older GPUs, too. Please refer to [AMD's system requirements](https://rocm.docs.amd.com/projects/install-on-linux/en/develop/reference/system-requirements.html) for a list of officially supported cards. ROCm is known to work on more consumer GPUs.
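+
+The image also ships two small helper scripts from this repository, `debug-pytorch` and `debug-llama`, in the virtual env's `bin` directory. They report whether PyTorch and llama.cpp can actually see the GPU, which is a quick way to check a card that is not officially supported. A minimal check, assuming the `instructlab` toolbox from the Quick start section below has been created and entered:
+
+```shell
+debug-pytorch             # prints the detected ROCm/HIP version and "GPU available: True/False"
+debug-llama --device gpu  # loads the model from the lab config and runs a short prompt
+```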
+
+The container file creates a [toolbox](https://github.com/containers/toolbox) container for the [`toolbox(1)`](https://www.mankier.com/1/toolbox) command line tool. A toolbox container has seamless access to the entire system, including the user's home directory, networking, hardware, SSH agent, and more.
+
+The container has all Python dependencies installed in a virtual env. The virtual env is already activated when you enter the container. However, the lab `cli` is **not** installed.
+
+## Quick start
+
+1. git clone the `cli` and `taxonomy` projects into a common folder in your
+   home directory (e.g. `~/path/to/instruct-lab`)
+2. add your account to the `render` and `video` groups: `sudo usermod -a -G render,video $LOGNAME`
+3. install the build dependencies for this container: `sudo dnf install toolbox podman make`
+4. build the container for RDNA3: `podman build -t localhost/instructlab:rocm-gf1100 -f containers/rocm/Containerfile .`
+5. create a toolbox: `toolbox create --image localhost/instructlab:rocm-gf1100 instructlab`
+6. enter the toolbox: `toolbox enter instructlab`. The container has your
+   home directory mounted.
+7. install the lab cli with `pip install -e ~/path/to/instruct-lab/cli/`
+
+`lab generate` and `lab chat` use the GPU automatically. `lab train` needs a
+more powerful and recent GPU and therefore does not use the GPU by default. To
+train on a GPU, run `lab train --device cuda`.
+
+
+## Building for other GPU architectures
+
+Use the `amdgpu-arch` or `rocminfo` tool to get the short name of your GPU:
+
+```shell
+dnf install clang-tools-extra rocminfo
+amdgpu-arch
+rocminfo | grep gfx
+```
+
+Map the name to an LLVM GPU target and an override GFX version. PyTorch 2.2.1+rocm5.7 provides a limited set of rocBLAS kernels. Fedora 40's ROCm packages have more kernels. For now we are limited to what the PyTorch binaries provide until Fedora ships `python-torch` with ROCm support.
+
+| Name      | xnack/USM | Version  | PyTorch | Fedora |
+|-----------|-----------|----------|:-------:|:------:|
+| `gfx900`  |           | `9.0.0`  | ✅      | ✅     |
+| `gfx906`  | `xnack-`  | `9.0.6`  | ✅      | ✅     |
+| `gfx908`  | `xnack-`  | `9.0.8`  | ✅      | ✅     |
+| `gfx90a`  | `xnack-`  | `9.0.10` | ✅      | ✅     |
+| `gfx90a`  | `xnack+`  | `9.0.10` | ✅      | ✅     |
+| `gfx940`  |           |          | ❌      | ✅     |
+| `gfx941`  |           |          | ❌      | ✅     |
+| `gfx942`  |           |          | ❌      | ✅     |
+| `gfx1010` |           |          | ❌      | ✅     |
+| `gfx1012` |           |          | ❌      | ✅     |
+| `gfx1030` |           | `10.3.0` | ✅      | ✅     |
+| `gfx1100` |           | `11.0.0` | ✅      | ✅     |
+| `gfx1101` |           |          | ❌      | ✅     |
+| `gfx1102` |           |          | ❌      | ✅     |
+
+If your card is not listed or unsupported, try the closest smaller value, e.g. for `gfx1031` use target `gfx1030` and override `10.3.0`. See [ROCm/ROCR-Runtime `isa.cpp`](https://github.com/ROCm/ROCR-Runtime/blob/rocm-6.0.2/src/core/runtime/isa.cpp#L245) and the [LLVM User Guide for AMDGPU](https://llvm.org/docs/AMDGPUUsage.html#processors) for more information.
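+
+The `gfx-version.sh` helper in this directory encodes the same mapping for the common consumer cards. A small example of how it can be used, run from the repository root for the `gfx1031` card mentioned above:
+
+```shell
+./containers/rocm/gfx-version.sh gfx1031
+# AMDGPU_ARCH=gfx1030
+# HSA_OVERRIDE_GFX_VERSION=10.3.0
+```
+
+The output uses the same `KEY=VALUE` format as `gfx1030.conf`, so it can be redirected to a file and passed to `podman build --build-arg-file` as shown below.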
+
+| Marketing Name          | Name      | Arch  | Target    | GFX version | Memory | Chat | Train |
+|-------------------------|-----------|-------|-----------|-------------|--------|:----:|:-----:|
+| AMD Radeon RX 7900 XT   | `gfx1100` | RDNA3 | `gfx1100` | `11.0.0`    | 20 GiB | ✅   | ✅    |
+| AMD Radeon RX 7900 XTX  | `gfx1100` | RDNA3 | `gfx1100` | `11.0.0`    | 24 GiB | ✅   | ✅    |
+| AMD Radeon RX 6700      | `gfx1031` | RDNA2 | `gfx1030` | `10.3.0`    | 10 GiB | ✅   | ❌    |
+
+Build the container with additional build arguments:
+
+```shell
+podman build \
+    --build-arg AMDGPU_ARCH="gfx1030" \
+    --build-arg HSA_OVERRIDE_GFX_VERSION="10.3.0" \
+    -f containers/rocm/Containerfile \
+    -t localhost/instructlab:rocm-gf1030 .
+```
+
+or use pre-defined build arguments from a config file:
+
+```shell
+podman build \
+    --build-arg-file containers/rocm/gfx1030.conf \
+    -f containers/rocm/Containerfile \
+    -t localhost/instructlab:rocm-gf1030 .
+```
diff --git a/containers/rocm/gfx-version.sh b/containers/rocm/gfx-version.sh
new file mode 100755
index 000000000000..1b68c254e41e
--- /dev/null
+++ b/containers/rocm/gfx-version.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+set -e
+
+NAME=$1
+
+if test -z "$NAME"; then
+    NAME=$(amdgpu-arch)
+fi
+
+case "$NAME" in
+    gfx803 | gfx900 | gfx906 | gfx908 | gfx90a | gfx940 | gfx941 | gfx942 | gfx1010 | gfx1012)
+        echo "ERROR: No mapping for '$NAME', yet" >&2
+        exit 3
+        ;;
+    gfx1030 | gfx1031 | gfx1032)
+        echo "AMDGPU_ARCH=gfx1030"
+        echo "HSA_OVERRIDE_GFX_VERSION=10.3.0"
+        ;;
+    gfx1100 | gfx1101 | gfx1102)
+        echo "AMDGPU_ARCH=gfx1100"
+        echo "HSA_OVERRIDE_GFX_VERSION=11.0.0"
+        ;;
+    *)
+        echo "ERROR: unknown or unsupported GFX name '$NAME'" >&2
+        exit 2
+esac
diff --git a/containers/rocm/gfx1030.conf b/containers/rocm/gfx1030.conf
new file mode 100644
index 000000000000..ecabc6a20fb0
--- /dev/null
+++ b/containers/rocm/gfx1030.conf
@@ -0,0 +1,3 @@
+# build arguments for AMD Radeon RX 6000 series
+AMDGPU_ARCH=gfx1030
+HSA_OVERRIDE_GFX_VERSION=10.3.0
diff --git a/containers/rocm/remove-gfx.sh b/containers/rocm/remove-gfx.sh
new file mode 100755
index 000000000000..41d37b0490a3
--- /dev/null
+++ b/containers/rocm/remove-gfx.sh
@@ -0,0 +1,46 @@
+#!/bin/sh
+set -e
+# Remove GPU support files that are not necessary for the current GPU arch
+
+TORCH="/opt/rocm-venv/lib/python3.12/site-packages/torch"
+
+case "$AMDGPU_ARCH" in
+    gfx9*)
+        rm -rf /usr/lib*/rocm/gfx8
+        rm -rf /usr/lib*/rocm/gfx10
+        rm -rf /usr/lib*/rocm/gfx11
+        rm -f /usr/lib*/rocblas/library/*gfx8*
+        rm -f /usr/lib*/rocblas/library/*gfx10*
+        rm -f /usr/lib*/rocblas/library/*gfx11*
+        rm -f ${TORCH}/lib/rocblas/library/*gfx8*
+        rm -f ${TORCH}/lib/rocblas/library/*gfx10*
+        rm -f ${TORCH}/lib/rocblas/library/*gfx11*
+        ;;
+    gfx10*)
+        rm -rf /usr/lib*/rocm/gfx8
+        rm -rf /usr/lib*/rocm/gfx9
+        rm -rf /usr/lib*/rocm/gfx11
+        rm -f /usr/lib*/rocblas/library/*gfx8*
+        rm -f /usr/lib*/rocblas/library/*gfx9*
+        rm -f /usr/lib*/rocblas/library/*gfx11*
+        rm -f ${TORCH}/lib/rocblas/library/*gfx8*
+        rm -f ${TORCH}/lib/rocblas/library/*gfx9*
+        rm -f ${TORCH}/lib/rocblas/library/*gfx11*
+        ;;
+    gfx11*)
+        rm -rf /usr/lib*/rocm/gfx8
+        rm -rf /usr/lib*/rocm/gfx9
+        rm -rf /usr/lib*/rocm/gfx10
+        rm -f /usr/lib*/rocblas/library/*gfx8*
+        rm -f /usr/lib*/rocblas/library/*gfx9*
+        rm -f /usr/lib*/rocblas/library/*gfx10*
+        rm -f ${TORCH}/lib/rocblas/library/*gfx8*
+        rm -f ${TORCH}/lib/rocblas/library/*gfx9*
+        rm -f ${TORCH}/lib/rocblas/library/*gfx10*
+        ;;
+    *)
+        echo "ERROR: $0 unknown AMDGPU_ARCH=$AMDGPU_ARCH"
+        exit 2
+esac
+
+# find /usr/lib* /opt/ -path '*/*gfx[189][0-9][0-9a-z]*' | grep -v $AMDGPU_ARCH | xargs rm -f