Add container file for AMD ROCm GPUs (NVIDIA#636)
The new container file creates a Fedora 40 toolbox container with ROCm
builds of PyTorch, llama-cpp, and bitsandbytes. The container image
contains all Python packages from `requirements.txt`, ROCm support
files, and tools. By default the container supports RDNA3 GPUs
(`gfx1100`) like the W7900 and RX 7900. It can be built for other GPU
architectures.

```
podman build -t localhost/instructlab:rocm-gf1100 -f containers/rocm/Containerfile .
toolbox create --image localhost/instructlab:rocm-gf1100 instructlab
toolbox enter instructlab
pip install path/to/cli
```

Signed-off-by: Christian Heimes <[email protected]>
tiran authored Mar 15, 2024
1 parent 9ed2971 commit 374abaf
Showing 9 changed files with 376 additions and 0 deletions.
18 changes: 18 additions & 0 deletions .dockerignore
@@ -0,0 +1,18 @@
/.git
/.github

/CONTRIBUTING
/docs
/mac_inference
/MAINTAINERS
/notebooks
/scripts
/tests

# Python
/build
/.tox
*.egg-info
*.dist-info
__pycache__/
*.pyc
3 changes: 3 additions & 0 deletions .spellcheck-en-custom.txt
@@ -19,13 +19,16 @@ PEFT
PlantUML
PyPI
Quantizing
RDNA
ROCm
cli
compositional
cpp
dataset
dev
ditaa
dr
env
gpu
lora
orchestrator
63 changes: 63 additions & 0 deletions containers/bin/debug-llama
@@ -0,0 +1,63 @@
#!/usr/bin/env python3

import argparse
import os
import sys
import time

import llama_cpp

PROMPT = "Q: What is the answer to the ultimate question of life, the universe, and everything? A:"

parser = argparse.ArgumentParser(description="test llama GPU/CPU")
parser.add_argument("--model", default=None)
parser.add_argument("--device", choices=["cpu", "gpu"], default="gpu")
parser.add_argument("--quiet", action="store_false", dest="verbose")
parser.add_argument("--repeat", type=int, default=5)
parser.add_argument("--max-tokens", type=int, default=0)
parser.add_argument("--prompt", default=PROMPT)


def main(argv=None):
    args = parser.parse_args(argv)
    print(f"llama_cpp version: {llama_cpp.__version__}")
    print(f"llama supports gpu offload: {llama_cpp.llama_supports_gpu_offload()}")
    # -1: offload all layers to GPU
    n_gpu_layers = -1 if args.device == "gpu" else 0
    print(f"n_gpu_layers: {n_gpu_layers} ({args.device})")

    if args.model is None:
        from cli.config import read_config

        cfg = read_config()
        args.model = cfg.serve.model_path

    if not os.path.isfile(args.model):
        print(f"Model file '{args.model}' is missing.")
        print("Run 'lab init' and 'lab download'.")
        sys.exit(2)

    start = time.monotonic()
    llm = llama_cpp.Llama(
        model_path=args.model,
        n_gpu_layers=n_gpu_layers,
        verbose=args.verbose,
    )
    t = time.monotonic() - start
    print(f"Loaded model in {t:.2f}sec")

    print()
    print(
        f"Prompt: '{args.prompt}' (repeats: {args.repeat}, max_tokens: {args.max_tokens})"
    )
    start = time.monotonic()
    for _ in range(args.repeat):
        result = llm(args.prompt, max_tokens=args.max_tokens)
        for choice in result["choices"]:
            print(choice["text"].strip())
    t = time.monotonic() - start
    print(f"Got {args.repeat} result(s) in {t:.2f}sec")


if __name__ == "__main__":
    main()
39 changes: 39 additions & 0 deletions containers/bin/debug-pytorch
@@ -0,0 +1,39 @@
#!/usr/bin/env python3
# based on Steffen Röcker's gist
# https://gist.github.com/sroecker/f52646023d45ab4e281ddee05d6ef2a5
import os

import torch

print(f"pytorch version: {torch.__version__}")
print(f"HSA_OVERRIDE_GFX_VERSION={os.environ.get('HSA_OVERRIDE_GFX_VERSION')}")
print(f"HIP_VISIBLE_DEVICES={os.environ.get('HIP_VISIBLE_DEVICES')}")
print(f"GPU available: {torch.cuda.is_available()}")
print(f"NVidia CUDA version: {torch.version.cuda or 'n/a'}")
print(f"AMD ROCm HIP version: {torch.version.hip or 'n/a'}")
print()
if torch.cuda.is_available():
    print(f"device count: {torch.cuda.device_count()}")
    print(f"current device: {torch.cuda.current_device()}")
    for idx in range(torch.cuda.device_count()):
        print(f"  device {idx}: {torch.cuda.get_device_name(idx)}")
    device = torch.device("cuda", torch.cuda.current_device())
else:
    device = torch.device("cpu")

print(f"Testing with {device}")
print()
print("small matmul")
a = torch.rand(3, 3).to(device)
b = torch.rand(3, 3).to(device)
res = torch.matmul(a, b)
print(res)
print(res.size())

print()
print("larger matmul")
a = torch.rand(1280, 1280).to(device)
b = torch.rand(1280, 1280).to(device)
res = torch.matmul(a, b)
print(res)
print(res.size())
98 changes: 98 additions & 0 deletions containers/rocm/Containerfile
@@ -0,0 +1,98 @@
# Christian Heimes <[email protected]>
# Based on Zack Zlotnik's container file

# runtime container has libraries, CLI tools, and virtual env
FROM registry.fedoraproject.org/fedora-toolbox:40 AS runtime
# args and env (default to gfx1100, GFX level 11.0.0, first GPU only)
ARG AMDGPU_ARCH=gfx1100
ARG HSA_OVERRIDE_GFX_VERSION=11.0.0
ARG HIP_VISIBLE_DEVICES=0
ARG PYTORCH_ROCM_VERSION=5.7
ENV AMDGPU_ARCH="${AMDGPU_ARCH}"
ENV HIP_VISIBLE_DEVICES="${HIP_VISIBLE_DEVICES}"
ENV HSA_OVERRIDE_GFX_VERSION="${HSA_OVERRIDE_GFX_VERSION}"
ENV PYTORCH_ROCM_VERSION="${PYTORCH_ROCM_VERSION}"
# runtime dependencies
COPY containers/rocm/remove-gfx.sh /tmp/
RUN --mount=type=cache,target=/var/cache/dnf,z \
    dnf install -y --nodocs --setopt=install_weak_deps=False --setopt=keepcache=True \
        rocm-runtime hipblas hiprand hipsparse lld-libs python3-pip nvtop radeontop make git gh && \
    /tmp/remove-gfx.sh
# virtual env, umask 0000 to allow end-user to write to venv later
ENV VIRTUAL_ENV="/opt/rocm-venv"
RUN umask 0000 && \
    python3 -m venv ${VIRTUAL_ENV}
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
ENV PS1="(rocm-venv) ${PS1}"
# additional helpers to debug torch and llama
COPY containers/bin/debug-* ${VIRTUAL_ENV}/bin


# build env contains compilers and build dependencies
FROM runtime AS buildenv
RUN --mount=type=cache,target=/var/cache/dnf,z \
    dnf install -y --nodocs --setopt=keepcache=True \
        llvm compiler-rt clang-tools-extra lld python3-devel cmake ninja-build gcc \
        rocblas-devel hip-devel hipblas-devel rocprim-devel rocthrust-devel hipsparse-devel hipcub-devel hiprand-devel


FROM buildenv AS pytorch
COPY requirements.txt ${VIRTUAL_ENV}/
# pip constraints files do not support extras (optional dependencies), strip them
RUN sed 's/\[.*\]//' ${VIRTUAL_ENV}/requirements.txt > ${VIRTUAL_ENV}/constraints.txt
# chcon to work around pip's SELinux context shenanigans
RUN --mount=type=cache,target=/root/.cache/pip,z \
    umask 0000 && \
    ${VIRTUAL_ENV}/bin/pip install torch --index-url https://download.pytorch.org/whl/rocm${PYTORCH_ROCM_VERSION} && \
    /tmp/remove-gfx.sh && \
    { chcon -R -l s0 /root/.cache/pip 2>/dev/null || true; }

FROM pytorch AS llama
# remove cached wheel to force rebuild
RUN --mount=type=cache,target=/root/.cache/pip,z \
    pip cache remove llama_cpp_python && \
    umask 0000 && \
    CMAKE_ARGS="-DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=/usr/bin/clang -DCMAKE_CXX_COMPILER=/usr/bin/clang++ -DAMDGPU_TARGETS=${AMDGPU_ARCH}" \
    FORCE_CMAKE=1 \
    ${VIRTUAL_ENV}/bin/pip install -c ${VIRTUAL_ENV}/constraints.txt llama-cpp-python && \
    { chcon -R -l s0 /root/.cache/pip 2>/dev/null || true; }


FROM llama AS bitsandbytes
RUN git clone --depth 1 -b rocm https://github.com/arlo-phoenix/bitsandbytes-rocm-5.6.git /tmp/bitsandbytes
RUN git clone --depth 1 -b rocm-6.0.2 https://github.com/ROCm/hipBLASLt /tmp/hipblaslt
RUN mkdir -p /tmp/bitsandbytes/include/hipblaslt && \
    echo -e '#pragma once\n#ifndef HIPBLASLT_EXPORT\n#define HIPBLASLT_EXPORT\n#endif' > /tmp/bitsandbytes/include/hipblaslt/hipblaslt-export.h && \
    touch /tmp/bitsandbytes/include/hipblaslt/hipblaslt-version.h && \
    cp /tmp/hipblaslt/library/include/* /tmp/bitsandbytes/include/hipblaslt/
RUN cd /tmp/bitsandbytes && \
    make hip ROCM_TARGET="${AMDGPU_ARCH}" && \
    umask 0000 && \
    pip install -c ${VIRTUAL_ENV}/constraints.txt . && \
    { chcon -R -l s0 /root/.cache/pip 2>/dev/null || true; }


# install from requirements.txt last. pip does not override installed
# packages unless there is a version conflict.
FROM bitsandbytes AS pip-install
RUN --mount=type=cache,target=/root/.cache/pip,z \
    umask 0000 && \
    ${VIRTUAL_ENV}/bin/pip install -r ${VIRTUAL_ENV}/requirements.txt && \
    { chcon -R -l s0 /root/.cache/pip 2>/dev/null || true; }
# debug and self-test
RUN export
RUN pip list
# the attribute error from bitsandbytes is harmless
RUN python3 -Wignore -c 'import llama_cpp, torch, bitsandbytes' 2>&1 | grep -v NoneType
RUN find ${VIRTUAL_ENV} -name __pycache__ | xargs rm -rf


# create final image from base runtime, copy virtual env into final stage
FROM runtime AS final
COPY --from=pip-install /opt/rocm-venv/lib/python3.12/site-packages /opt/rocm-venv/lib/python3.12/site-packages
COPY --from=pip-install /opt/rocm-venv/bin /opt/rocm-venv/bin
LABEL com.github.containers.toolbox="true" \
      name="instructlab-rocm-base-${AMDGPU_ARCH}" \
      usage="This image is meant to be used with the toolbox(1) command" \
      summary="PyTorch, llama.cpp, and instruct-lab dependencies for AMD ROCm GPU ${AMDGPU_ARCH}" \
      maintainer="Christian Heimes <[email protected]>"
80 changes: 80 additions & 0 deletions containers/rocm/README.md
@@ -0,0 +1,80 @@
# instruct-lab toolbox container for AMD ROCm GPUs

The ROCm container file is designed for AMD GPUs with the RDNA3 architecture (`gfx1100`). The container can be built for RDNA2 (`gfx1030`) and older GPUs, too. Please refer to [AMD's system requirements](https://rocm.docs.amd.com/projects/install-on-linux/en/develop/reference/system-requirements.html) for a list of officially supported cards. ROCm is known to work on more consumer GPUs.

The container file creates a [toolbox](https://github.com/containers/toolbox) container for the [`toolbox(1)`](https://www.mankier.com/1/toolbox) command line tool. A toolbox container has seamless access to the entire system, including the user's home directory, networking, hardware, SSH agent, and more.

The container has all Python dependencies installed in a virtual env. The virtual env is already activated when you enter the container. However, the lab `cli` is **not** installed.
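
When you enter the toolbox, the prompt shows the `(rocm-venv)` prefix and `python3` resolves to the virtual env (paths as defined in the Containerfile):

```shell
which python3
# /opt/rocm-venv/bin/python3
```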

## Quick start

1. git clone the `cli` and `taxonomy` projects into a common folder in your
   home directory (e.g. `~/path/to/instruct-lab`)
2. add your account to the `render` and `video` groups: `sudo usermod -a -G render,video $LOGNAME`
3. install the build dependencies for this container: `sudo dnf install toolbox podman make`
4. build the container for RDNA3: `podman build -t localhost/instructlab:rocm-gf1100 -f containers/rocm/Containerfile .`
5. create a toolbox: `toolbox create --image localhost/instructlab:rocm-gf1100 instructlab`
6. enter the toolbox: `toolbox enter instructlab`. The container has your
   home directory mounted.
7. install the lab cli with `pip install -e ~/path/to/instruct-lab/cli/`

`lab generate` and `lab chat` use the GPU automatically. `lab train` needs
a more powerful and recent GPU and therefore does not use the GPU by
default. To train on a GPU, run `lab train --device cuda`.
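
The debug helpers from `containers/bin/` are installed into the virtual
env's `bin/` directory, so you can sanity-check GPU acceleration from inside
the toolbox (assuming the image was built for your GPU's architecture):

```shell
# check that PyTorch sees the ROCm/HIP device and run two test matmuls
debug-pytorch
# load the configured model and answer a short prompt on the GPU
debug-llama --device gpu --repeat 1
```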


## Building for other GPU architectures

Use the `amdgpu-arch` or `rocminfo` tool to get the short name of your GPU:

```shell
dnf install clang-tools-extra rocminfo
amdgpu-arch
rocminfo | grep gfx
```
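
On an RDNA3 card such as the RX 7900 XT, for example, `amdgpu-arch` prints the short name directly (illustrative output):

```shell
amdgpu-arch
# gfx1100
```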

Map the name to an LLVM GPU target and an override GFX version. PyTorch 2.2.1+rocm5.7 provides a limited set of rocBLAS kernels. Fedora 40's ROCm packages have more kernels. For now we are limited to what the PyTorch binaries provide until Fedora ships `python-torch` with ROCm support.

| Name      | xnack/USM | Version  | PyTorch | Fedora |
|-----------|-----------|----------|:-------:|:------:|
| `gfx900`  |           | `9.0.0`  |         |        |
| `gfx906`  | `xnack-`  | `9.0.6`  |         |        |
| `gfx908`  | `xnack-`  | `9.0.8`  |         |        |
| `gfx90a`  | `xnack-`  | `9.0.10` |         |        |
| `gfx90a`  | `xnack+`  | `9.0.10` |         |        |
| `gfx940`  |           |          |         |        |
| `gfx941`  |           |          |         |        |
| `gfx942`  |           |          |         |        |
| `gfx1010` |           |          |         |        |
| `gfx1012` |           |          |         |        |
| `gfx1030` |           | `10.3.0` |         |        |
| `gfx1100` |           | `11.0.0` |         |        |
| `gfx1101` |           |          |         |        |
| `gfx1102` |           |          |         |        |

If your card is not listed or unsupported, try the closest smaller value; e.g. for `gfx1031`, use target `gfx1030` and override `10.3.0`. See [ROCm/ROCR-Runtime `isa.cpp`](https://github.com/ROCm/ROCR-Runtime/blob/rocm-6.0.2/src/core/runtime/isa.cpp#L245) and the [LLVM User Guide for AMDGPU](https://llvm.org/docs/AMDGPUUsage.html#processors) for more information.
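
The helper script `containers/rocm/gfx-version.sh` added alongside the Containerfile performs this mapping for the RDNA2 and RDNA3 families; pass a GFX name (or nothing to auto-detect with `amdgpu-arch`) and it prints the matching build arguments:

```shell
sh containers/rocm/gfx-version.sh gfx1031
# AMDGPU_ARCH=gfx1030
# HSA_OVERRIDE_GFX_VERSION=10.3.0
```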

| Marketing Name          | Name      | Arch  | Target    | GFX version | Memory | Chat | Train |
|-------------------------|-----------|-------|-----------|-------------|--------|:----:|:-----:|
| AMD Radeon RX 7900 XT   | `gfx1100` | RDNA3 | `gfx1100` | `11.0.0`    | 20 GiB |      |       |
| AMD Radeon RX 7900 XTX  |           | RDNA3 |           |             | 24 GiB |      |       |
| AMD Radeon RX 6700      | `gfx1031` | RDNA2 | `gfx1030` | `10.3.0`    | 10 GiB |      |       |

Build the container with additional build arguments:

```shell
podman build \
    --build-arg AMDGPU_ARCH="gfx1030" \
    --build-arg HSA_OVERRIDE_GFX_VERSION="10.3.0" \
    -f containers/rocm/Containerfile \
    -t localhost/instructlab:rocm-gf1030 \
    .
```

or use pre-defined build arguments from a config file:

```shell
podman build \
    --build-arg-file containers/rocm/gfx1030.conf \
    -f containers/rocm/Containerfile \
    -t localhost/instructlab:rocm-gf1030 \
    .
```
26 changes: 26 additions & 0 deletions containers/rocm/gfx-version.sh
@@ -0,0 +1,26 @@
#!/bin/sh
set -e

NAME=$1

if test -z "$NAME"; then
    NAME=$(amdgpu-arch)
fi

case "$NAME" in
    gfx803 | gfx900 | gfx906 | gfx908 | gfx90a | gfx940 | gfx941 | gfx942 | gfx1010 | gfx1012)
        echo "ERROR: No mapping for '$NAME', yet" >&2
        exit 3
        ;;
    gfx1030 | gfx1031 | gfx1032)
        echo "AMDGPU_ARCH=gfx1030"
        echo "HSA_OVERRIDE_GFX_VERSION=10.3.0"
        ;;
    gfx1100 | gfx1101 | gfx1102)
        echo "AMDGPU_ARCH=gfx1100"
        echo "HSA_OVERRIDE_GFX_VERSION=11.0.0"
        ;;
    *)
        echo "ERROR: unknown or unsupported GFX name '$NAME'" >&2
        exit 2
        ;;
esac
3 changes: 3 additions & 0 deletions containers/rocm/gfx1030.conf
@@ -0,0 +1,3 @@
# build arguments for AMD Radeon RX 6000 series
AMDGPU_ARCH=gfx1030
HSA_OVERRIDE_GFX_VERSION=10.3.0
46 changes: 46 additions & 0 deletions containers/rocm/remove-gfx.sh
@@ -0,0 +1,46 @@
#!/bin/sh
set -e
# Remove GPU support files that are not necessary for current GPU arch

TORCH="/opt/rocm-venv/lib/python3.12/site-packages/torch"

case "$AMDGPU_ARCH" in
gfx9*)
rm -rf /usr/lib*/rocm/gfx8
rm -rf /usr/lib*/rocm/gfx10
rm -rf /usr/lib*/rocm/gfx11
rm -f /usr/lib*/rocblas/library/*gfx8*
rm -f /usr/lib*/rocblas/library/*gfx10*
rm -f /usr/lib*/rocblas/library/*gfx11*
rm -f ${TORCH}/lib/rocblas/library/*gfx8*
rm -f ${TORCH}/lib/rocblas/library/*gfx10*
rm -f ${TORCH}/lib/rocblas/library/*gfx11*
;;
gfx10*)
rm -rf /usr/lib*/rocm/gfx8
rm -rf /usr/lib*/rocm/gfx9
rm -rf /usr/lib*/rocm/gfx11
rm -f /usr/lib*/rocblas/library/*gfx8*
rm -f /usr/lib*/rocblas/library/*gfx9*
rm -f /usr/lib*/rocblas/library/*gfx11*
rm -f ${TORCH}/lib/rocblas/library/*gfx8*
rm -f ${TORCH}/lib/rocblas/library/*gfx9*
rm -f ${TORCH}/lib/rocblas/library/*gfx11*
;;
gfx11*)
rm -rf /usr/lib*/rocm/gfx8
rm -rf /usr/lib*/rocm/gfx9
rm -rf /usr/lib*/rocm/gfx10
rm -f /usr/lib*/rocblas/library/*gfx8*
rm -f /usr/lib*/rocblas/library/*gfx9*
rm -f /usr/lib*/rocblas/library/*gfx10*
rm -f ${TORCH}/lib/rocblas/library/*gfx8*
rm -f ${TORCH}/lib/rocblas/library/*gfx9*
rm -f ${TORCH}/lib/rocblas/library/*gfx10*
;;
*)
echo "ERROR: $0 unknown AMDGPU_ARCH=$AMDGPU_ARCH"
exit 2
esac

# find /usr/lib* /opt/ -path '*/*gfx[189][0-9][0-9a-z]*' | grep -v $AMDGPU_ARCH | xargs rm -f
