forked from NVIDIA/NeMo
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add container file for AMD ROCm GPUs (NVIDIA#636)
The new container file creates a Fedora 40 toolbox container with ROCm builds of PyTorch, llama-cpp, and bitsandbytes. The container image contains all Python packages from `requirements.txt`, ROCm support files, and tools. By default the container supports RDNA3 GPUs `gfx1100` like W7900 and RX 7900. It can be built for other GPU architectures. ``` podman build -t localhost/instructlab:rocm-gf1100 -f container/rocm/Containerfile toolbox create --image localhost/instructlab:rocm-gf1100 instructlab toolbox enter instructlab pip install path/to/cli ``` Signed-off-by: Christian Heimes <[email protected]>
- Loading branch information
Showing
9 changed files
with
376 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Exclude repository metadata, docs, and dev-only trees from the build context
/.git
/.github

/CONTRIBUTING
/docs
/mac_inference
/MAINTAINERS
/notebooks
/scripts
/tests

# Python
/build
/.tox
*.egg-info
*.dist-info
__pycache__/
*.pyc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
#!/usr/bin/env python3
"""Benchmark llama-cpp-python on GPU or CPU.

Loads a GGUF model (from ``--model`` or the lab config's serve model) and
times how long it takes to load the model and to complete the prompt
``--repeat`` times, printing the generated text as it goes.
"""
import argparse
import os
import sys
import time

import llama_cpp

PROMPT = "Q: What is the answer to the ultimate question of life, the universe, and everything? A:"

# BUG FIX: ArgumentParser's first positional parameter is `prog` (the program
# name shown in usage), not `description`; pass the text as description.
parser = argparse.ArgumentParser(description="test llama GPU/CPU")
parser.add_argument("--model", default=None)
parser.add_argument("--device", choices=["cpu", "gpu"], default="gpu")
parser.add_argument("--quiet", action="store_false", dest="verbose")
parser.add_argument("--repeat", type=int, default=5)
parser.add_argument("--max-tokens", type=int, default=0)
parser.add_argument("--prompt", default=PROMPT)


def main(argv=None):
    """Parse *argv*, load the model, and time `--repeat` prompt completions.

    Exits with status 2 when the model file does not exist.
    """
    args = parser.parse_args(argv)
    print(f"llama_cpp version: {llama_cpp.__version__}")
    print(f"llama supports gpu offload: {llama_cpp.llama_supports_gpu_offload()}")
    # -1: offload all layers to GPU
    n_gpu_layers = -1 if args.device == "gpu" else 0
    print(f"n_gpu_layers: {n_gpu_layers} ({args.device})")

    if args.model is None:
        # fall back to the model configured for `lab serve`
        from cli.config import read_config

        cfg = read_config()
        args.model = cfg.serve.model_path

    if not os.path.isfile(args.model):
        print(f"Model file '{args.model}' is missing.")
        print("Run 'lab init' and 'lab download'.")
        sys.exit(2)

    start = time.monotonic()
    llm = llama_cpp.Llama(
        model_path=args.model,
        n_gpu_layers=n_gpu_layers,
        verbose=args.verbose,
    )
    t = time.monotonic() - start
    print(f"Loaded model in {t:.2f}sec")

    print()
    print(
        f"Prompt: '{args.prompt}' (repeats: {args.repeat}, max_tokens: {args.max_tokens})"
    )
    start = time.monotonic()
    for _ in range(args.repeat):
        result = llm(args.prompt, max_tokens=args.max_tokens)
        for choice in result["choices"]:
            print(choice["text"].strip())
    t = time.monotonic() - start
    print(f"Got {args.repeat} result(s) in {t:.2f}sec")


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#!/usr/bin/env python3
# based on Steffen Röcker's gist
# https://gist.github.com/sroecker/f52646023d45ab4e281ddee05d6ef2a5
"""Print PyTorch/ROCm environment diagnostics and run two matmul smoke tests."""
import os

import torch

print(f"pytorch version: {torch.__version__}")
print(f"HSA_OVERRIDE_GFX_VERSION={os.environ.get('HSA_OVERRIDE_GFX_VERSION')}")
print(f"HIP_VISIBLE_DEVICES={os.environ.get('HIP_VISIBLE_DEVICES')}")
print(f"GPU available: {torch.cuda.is_available()}")
print(f"NVidia CUDA version: {torch.version.cuda or 'n/a'}")
print(f"AMD ROCm HIP version: {torch.version.hip or 'n/a'}")
print()

# Pick the current accelerator when one is visible, otherwise fall back to CPU.
if torch.cuda.is_available():
    print(f"device count: {torch.cuda.device_count()}")
    print(f"current device: {torch.cuda.current_device()}")
    for idx in range(torch.cuda.device_count()):
        print(f" device {idx}: {torch.cuda.get_device_name(idx)}")
    device = torch.device("cuda", torch.cuda.current_device())
else:
    device = torch.device("cpu")

print(f"Testing with {device}")

# Two smoke tests: a tiny matmul and a larger one that exercises rocBLAS kernels.
for label, dim in (("small matmul", 3), ("larger matmul", 1280)):
    print()
    print(label)
    lhs = torch.rand(dim, dim).to(device)
    rhs = torch.rand(dim, dim).to(device)
    res = torch.matmul(lhs, rhs)
    print(res)
    print(res.size())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
# Christian Heimes <[email protected]>
# Based on Zack Zlotnik's container file

# runtime container has libraries, CLI tools, and virtual env
FROM registry.fedoraproject.org/fedora-toolbox:40 AS runtime
# args and env (default to gfx1100, GFX level 11.0.0, first GPU only)
ARG AMDGPU_ARCH=gfx1100
ARG HSA_OVERRIDE_GFX_VERSION=11.0.0
ARG HIP_VISIBLE_DEVICES=0
ARG PYTORCH_ROCM_VERSION=5.7
ENV AMDGPU_ARCH="${AMDGPU_ARCH}"
ENV HIP_VISIBLE_DEVICES="${HIP_VISIBLE_DEVICES}"
ENV HSA_OVERRIDE_GFX_VERSION="${HSA_OVERRIDE_GFX_VERSION}"
ENV PYTORCH_ROCM_VERSION="${PYTORCH_ROCM_VERSION}"
# runtime dependencies
# NOTE(review): the README builds with `-f container/rocm/Containerfile` but
# these COPY lines use `containers/...` -- confirm which directory name
# matches the repository layout.
COPY containers/rocm/remove-gfx.sh /tmp/
RUN --mount=type=cache,target=/var/cache/dnf,z \
    dnf install -y --nodocs --setopt=install_weak_deps=False --setopt=keepcache=True \
        rocm-runtime hipblas hiprand hipsparse lld-libs python3-pip nvtop radeontop make git gh && \
    /tmp/remove-gfx.sh
# virtual env, umask 0000 to allow end-user to write to venv later
ENV VIRTUAL_ENV="/opt/rocm-venv"
RUN umask 0000 && \
    python3 -m venv ${VIRTUAL_ENV}
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
ENV PS1="(rocm-venv) ${PS1}"
# additional helpers to debug torch and llama
COPY containers/bin/debug-* ${VIRTUAL_ENV}/bin


# build env contains compilers and build dependencies
FROM runtime AS buildenv
RUN --mount=type=cache,target=/var/cache/dnf,z \
    dnf install -y --nodocs --setopt=keepcache=True \
        llvm compiler-rt clang-tools-extra lld python3-devel cmake ninja-build gcc \
        rocblas-devel hip-devel hipblas-devel rocprim-devel rocthrust-devel hipsparse-devel hipcub-devel hiprand-devel


# install the ROCm build of PyTorch in its own stage/layer
FROM buildenv AS pytorch
COPY requirements.txt ${VIRTUAL_ENV}/
# pip constraint does not support optional dependencies.
RUN sed 's/\[.*\]//' ${VIRTUAL_ENV}/requirements.txt > ${VIRTUAL_ENV}/constraints.txt
# chcon to work around pip's SELinux context shenanigans
RUN --mount=type=cache,target=/root/.cache/pip,z \
    umask 0000 && \
    ${VIRTUAL_ENV}/bin/pip install torch --index-url https://download.pytorch.org/whl/rocm${PYTORCH_ROCM_VERSION} && \
    /tmp/remove-gfx.sh && \
    $(chcon -R -l s0 /root/.cache/pip 2>/dev/null || true)

# build llama-cpp-python from source with hipBLAS for the target GPU arch
FROM pytorch AS llama
# remove cached wheel to force rebuild
RUN --mount=type=cache,target=/root/.cache/pip,z \
    pip cache remove llama_cpp_python && \
    umask 0000 && \
    CMAKE_ARGS="-DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=/usr/bin/clang -DCMAKE_CXX_COMPILER=/usr/bin/clang++ -DAMDGPU_TARGETS=${AMDGPU_ARCH}" \
    FORCE_CMAKE=1 \
    ${VIRTUAL_ENV}/bin/pip install -c ${VIRTUAL_ENV}/constraints.txt llama-cpp-python && \
    $(chcon -R -l s0 /root/.cache/pip 2>/dev/null || true)


# build the ROCm fork of bitsandbytes; hipBLASLt headers are stubbed out
# because Fedora does not ship them
FROM llama AS bitsandbytes
RUN git clone --depth 1 -b rocm https://github.com/arlo-phoenix/bitsandbytes-rocm-5.6.git /tmp/bitsandbytes
RUN git clone --depth 1 -b rocm-6.0.2 https://github.com/ROCm/hipBLASLt /tmp/hipblaslt
RUN mkdir -p /tmp/bitsandbytes/include/hipblaslt && \
    echo -e '#pragma once\n#ifndef HIPBLASLT_EXPORT\n#define HIPBLASLT_EXPORT\n#endif' > /tmp/bitsandbytes/include/hipblaslt/hipblaslt-export.h && \
    touch /tmp/bitsandbytes/include/hipblaslt/hipblaslt-version.h && \
    cp /tmp/hipblaslt/library/include/* /tmp/bitsandbytes/include/hipblaslt/
RUN cd /tmp/bitsandbytes && \
    make hip ROCM_TARGET="${AMDGPU_ARCH}" && \
    umask 0000 && \
    pip install -c ${VIRTUAL_ENV}/constraints.txt . && \
    $(chcon -R -l s0 /root/.cache/pip 2>/dev/null || true)


# install from requirements.txt last. pip does not override installed
# packages unless there is a version conflict.
FROM bitsandbytes AS pip-install
RUN --mount=type=cache,target=/root/.cache/pip,z \
    umask 0000 && \
    ${VIRTUAL_ENV}/bin/pip install -r ${VIRTUAL_ENV}/requirements.txt && \
    $(chcon -R -l s0 /root/.cache/pip 2>/dev/null || true)
# debug and self-test
RUN export
RUN pip list
# the attribute error from bitsandbytes is harmless
RUN python3 -Wignore -c 'import llama_cpp, torch, bitsandbytes' 2>&1 | grep -v NoneType
RUN find ${VIRTUAL_ENV} -name __pycache__ | xargs rm -rf


# create final image from base runtime, copy virtual env into final stage
FROM runtime AS final
COPY --from=pip-install /opt/rocm-venv/lib/python3.12/site-packages /opt/rocm-venv/lib/python3.12/site-packages
COPY --from=pip-install /opt/rocm-venv/bin /opt/rocm-venv/bin
# BUG FIX: the name label used ${GFX_VERSION}, which is never declared as an
# ARG or ENV and therefore expanded to an empty string; use AMDGPU_ARCH
# (which already carries the "gfx" prefix, e.g. "gfx1100").
LABEL com.github.containers.toolbox="true" \
    name="instructlab-rocm-base-${AMDGPU_ARCH}" \
    usage="This image is meant to be used with the toolbox(1) command" \
    summary="PyTorch, llama.cpp, and instruct-lab dependencies for AMD ROCm GPU ${AMDGPU_ARCH}" \
    maintainer="Christian Heimes <[email protected]>"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
# instruct-lab toolbox container for AMD ROCm GPUs | ||
|
||
The ROCm container file is designed for AMD GPUs with RDNA3 architecture (`gfx1100`). The container can be built for RDNA2 (`gfx1030`) and older GPUs, too. Please refer to [AMD's system requirements](https://rocm.docs.amd.com/projects/install-on-linux/en/develop/reference/system-requirements.html) for a list of officially supported cards. ROCm is known to work on more consumer GPUs.
|
||
The container file creates a [toolbox](https://github.com/containers/toolbox) container for the [`toolbox(1)`](https://www.mankier.com/1/toolbox) command line tool. A toolbox container has seamless access to the entire system including the user's home directory, networking, hardware, SSH agent, and more.
|
||
The container has all Python dependencies installed in a virtual env. The virtual env is already activated when you enter the container. However, the lab `cli` is **not** installed.
|
||
## Quick start | ||
|
||
1. git clone the `cli` and `taxonomy` project into a common folder in your | ||
home directory (e.g. `~/path/to/instruct-lab`) | ||
2. add your account to `render` and `video` group: `sudo usermod -a -G render,video $LOGNAME` | ||
3. install build dependency for this container: `sudo dnf install toolbox podman make` | ||
4. build the container for RDNA3: `podman build -t localhost/instructlab:rocm-gf1100 -f container/rocm/Containerfile .` | ||
5. create a toolbox `toolbox create --image localhost/instructlab:rocm-gf1100 instructlab` | ||
6. enter toolbox `toolbox enter instructlab`. The container has your | ||
home directory mounted. | ||
7. install lab cli with `pip install -e ~/path/to/instruct-lab/cli/` | ||
|
||
`lab generate` and `lab chat` use the GPU automatically. `lab train` needs
a more powerful and recent GPU and therefore does not use the GPU by default. To
train on a GPU, run `lab train --device cuda`.
|
||
|
||
## Building for other GPU architectures | ||
|
||
Use the `amdgpu-arch` or `rocminfo` tool to get the short name | ||
|
||
```shell | ||
dnf install clang-tools-extra rocminfo | ||
amdgpu-arch | ||
rocminfo | grep gfx | ||
``` | ||
|
||
Map the name to a LLVM GPU target and an override GFX version. PyTorch 2.2.1+rocm5.7 provides a limited set of rocBLAS Kernels. Fedora 40's ROCm packages have more Kernels. For now we are limited to what PyTorch binaries provide until Fedora ships `python-torch` with ROCm support. | ||
|
||
| Name | xnack/USM | Version | PyTorch | Fedora | | ||
|-----------|-----------|----------|:-------:|:------:| | ||
| `gfx900` | | `9.0.0` | ✅ | ✅ | | ||
| `gfx906` | `xnack-` | `9.0.6` | ✅ | ✅ | | ||
| `gfx908` | `xnack-` | `9.0.8` | ✅ | ✅ | | ||
| `gfx90a` | `xnack-` | `9.0.10` | ✅ | ✅ | | ||
| `gfx90a` | `xnack+` | `9.0.10` | ✅ | ✅ | | ||
| `gfx940` | | | ❌ | ✅ | | ||
| `gfx941` | | | ❌ | ✅ | | ||
| `gfx942` | | | ❌ | ✅ | | ||
| `gfx1010` | | | ❌ | ✅ | | ||
| `gfx1012` | | | ❌ | ✅ | | ||
| `gfx1030` | | `10.3.0` | ✅ | ✅ | | ||
| `gfx1100` | | `11.0.0` | ✅ | ✅ | | ||
| `gfx1101` | | | ❌ | ✅ | | ||
| `gfx1102` | | | ❌ | ✅ | | ||
|
||
If your card is not listed or unsupported, try the closest smaller value, e.g. for `gfx1031` use target `gfx1030` and override `10.3.0`. See [ROCm/ROCR-Runtime `isa.cpp`](https://github.com/ROCm/ROCR-Runtime/blob/rocm-6.0.2/src/core/runtime/isa.cpp#L245) and [LLVM User Guide for AMDGPU](https://llvm.org/docs/AMDGPUUsage.html#processors) for more information. | ||
|
||
| Marketing Name | Name | Arch | Target | GFX version | Memory | Chat | Train | | ||
|------------------------|-----------|-------|-----------|-------------|--------|:----:|:-----:| | ||
| AMD Radeon RX 7900 XT | `gfx1100` | RDNA3 | `gfx1100` | `11.0.0` | 20 GiB | ✅ | ✅ | | ||
| AMD Radeon RX 7900 XTX | | RDNA3 | | | 24 GiB | ✅ | ✅ | | ||
| AMD Radeon RX 6700 | `gfx1031` | RDNA2 | `gfx1030` | `10.3.0` | 10 GiB | ✅ | ❌ | | ||
|
||
Build the container with additional build arguments: | ||
|
||
```shell | ||
podman build \
    --build-arg AMDGPU_ARCH="gfx1030" \
    --build-arg HSA_OVERRIDE_GFX_VERSION="10.3.0" \
    -f container/rocm/Containerfile \
    -t localhost/instructlab:rocm-gf1030 .
``` | ||
|
||
or use pre-defined build arguments from a config file: | ||
|
||
```shell | ||
podman build \
    --build-arg-file container/rocm/gfx1030.conf \
    -f container/rocm/Containerfile \
    -t localhost/instructlab:rocm-gf1030 .
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
#!/bin/sh
# Map a GPU ISA name (first argument, or auto-detected via amdgpu-arch) to
# the container build arguments AMDGPU_ARCH and HSA_OVERRIDE_GFX_VERSION.
set -e

# Default to the locally detected ISA when no name is given.
NAME="${1:-$(amdgpu-arch)}"

case "$NAME" in
    gfx803 | gfx900 | gfx906 | gfx908 | gfx90a | gfx940 | gfx941 | gfx942 | gfx1010 | gfx1012)
        # known ISA, but no build-argument mapping defined yet
        echo "ERROR: No mapping for '$NAME', yet" >&2
        exit 3
        ;;
    gfx1030 | gfx1031 | gfx1032)
        # RDNA2 family builds against the gfx1030 kernels
        echo "AMDGPU_ARCH=gfx1030"
        echo "HSA_OVERRIDE_GFX_VERSION=10.3.0"
        ;;
    gfx1100 | gfx1101 | gfx1102)
        # RDNA3 family builds against the gfx1100 kernels
        echo "AMDGPU_ARCH=gfx1100"
        echo "HSA_OVERRIDE_GFX_VERSION=11.0.0"
        ;;
    *)
        echo "ERROR: unknown or unsupported GFX name '$NAME'" >&2
        exit 2
        ;;
esac
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# build arguments for AMD Radeon RX 6000 series | ||
AMDGPU_ARCH=gfx1030 | ||
HSA_OVERRIDE_GFX_VERSION=10.3.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
#!/bin/sh
set -e
# Remove GPU support files that are not necessary for current GPU arch

TORCH="/opt/rocm-venv/lib/python3.12/site-packages/torch"

# Delete ROCm runtime kernels and rocBLAS tensile libraries for one GPU
# family prefix (e.g. "gfx9"). Missing paths are ignored (rm -f / -rf).
purge_family() {
    rm -rf /usr/lib*/rocm/"$1"
    rm -f /usr/lib*/rocblas/library/*"$1"*
    rm -f ${TORCH}/lib/rocblas/library/*"$1"*
}

# Keep only the kernel family matching AMDGPU_ARCH; drop the other two
# (plus legacy gfx8) to shrink the image.
case "$AMDGPU_ARCH" in
    gfx9*)
        purge_family gfx8
        purge_family gfx10
        purge_family gfx11
        ;;
    gfx10*)
        purge_family gfx8
        purge_family gfx9
        purge_family gfx11
        ;;
    gfx11*)
        purge_family gfx8
        purge_family gfx9
        purge_family gfx10
        ;;
    *)
        # BUG FIX: error message now goes to stderr (was stdout), matching
        # the error handling of the other container helper scripts
        echo "ERROR: $0 unknown AMDGPU_ARCH=$AMDGPU_ARCH" >&2
        exit 2
        ;;
esac

# find /usr/lib* /opt/ -path '*/*gfx[189][0-9][0-9a-z]*' | grep -v $AMDGPU_ARCH | xargs rm -f