diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 000000000000..c02f7c37fb1b
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,18 @@
+/.git
+/.github
+
+/CONTRIBUTING
+/docs
+/mac_inference
+/MAINTAINERS
+/notebooks
+/scripts
+/tests
+
+# Python
+/build
+/.tox
+*.egg-info
+*.dist-info
+__pycache__/
+*.pyc
diff --git a/.spellcheck-en-custom.txt b/.spellcheck-en-custom.txt
index 3209032e8809..5dc92bebf2bc 100644
--- a/.spellcheck-en-custom.txt
+++ b/.spellcheck-en-custom.txt
@@ -19,6 +19,8 @@ PEFT
 PlantUML
 PyPI
 Quantizing
+RDNA
+ROCm
 cli
 compositional
 cpp
@@ -26,6 +28,7 @@ dataset
 dev
 ditaa
 dr
+env
 gpu
 lora
 orchestrator
diff --git a/containers/bin/debug-llama b/containers/bin/debug-llama
new file mode 100755
index 000000000000..f98dd8ad419b
--- /dev/null
+++ b/containers/bin/debug-llama
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import sys
+import time
+
+import llama_cpp
+
+PROMPT = "Q: What is the answer to the ultimate question of life, the universe, and everything? A:"
+
+parser = argparse.ArgumentParser("test llama GPU/CPU")
+parser.add_argument("--model", default=None)
+parser.add_argument("--device", choices=["cpu", "gpu"], default="gpu")
+parser.add_argument("--quiet", action="store_false", dest="verbose")
+parser.add_argument("--repeat", type=int, default=5)
+parser.add_argument("--max-tokens", type=int, default=0)
+parser.add_argument("--prompt", default=PROMPT)
+
+
+def main(argv=None):
+    args = parser.parse_args(argv)
+    print(f"llama_cpp version: {llama_cpp.__version__}")
+    print(f"llama supports gpu offload: {llama_cpp.llama_supports_gpu_offload()}")
+    # -1: offload all layers to GPU
+    n_gpu_layers = -1 if args.device == "gpu" else 0
+    print(f"n_gpu_layers: {n_gpu_layers} ({args.device})")
+
+    if args.model is None:
+        from cli.config import read_config
+
+        cfg = read_config()
+        args.model = cfg.serve.model_path
+
+    if not os.path.isfile(args.model):
+        print(f"Model file '{args.model}' is missing.")
+        print("Run 'lab init' and 'lab download'.")
+        sys.exit(2)
+
+    start = time.monotonic()
+    llm = llama_cpp.Llama(
+        model_path=args.model,
+        n_gpu_layers=n_gpu_layers,
+        verbose=args.verbose,
+    )
+    t = time.monotonic() - start
+    print(f"Loaded model in {t:.2f}sec")
+
+    print()
+    print(
+        f"Prompt: '{args.prompt}' (repeats: {args.repeat}, max_tokens: {args.max_tokens})"
+    )
+    start = time.monotonic()
+    for _ in range(args.repeat):
+        result = llm(args.prompt, max_tokens=args.max_tokens)
+        for choice in result["choices"]:
+            print(choice["text"].strip())
+    t = time.monotonic() - start
+    print(f"Got {args.repeat} result(s) in {t:.2f}sec")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/containers/bin/debug-pytorch b/containers/bin/debug-pytorch
new file mode 100755
index 000000000000..3e23f9aa92cd
--- /dev/null
+++ b/containers/bin/debug-pytorch
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+# based on Steffen Röcker's gist
+# https://gist.github.com/sroecker/f52646023d45ab4e281ddee05d6ef2a5
+import os
+
+import torch
+
+print(f"pytorch version: {torch.__version__}")
+print(f"HSA_OVERRIDE_GFX_VERSION={os.environ.get('HSA_OVERRIDE_GFX_VERSION')}")
+print(f"HIP_VISIBLE_DEVICES={os.environ.get('HIP_VISIBLE_DEVICES')}")
+print(f"GPU available: {torch.cuda.is_available()}")
+print(f"NVidia CUDA version: {torch.version.cuda or 'n/a'}")
+print(f"AMD ROCm HIP version: {torch.version.hip or 'n/a'}")
+print()
+if torch.cuda.is_available():
+    print(f"device count: {torch.cuda.device_count()}")
+    print(f"current device: {torch.cuda.current_device()}")
+    for idx in range(torch.cuda.device_count()):
+        print(f" device {idx}: {torch.cuda.get_device_name(idx)}")
+    device = torch.device("cuda", torch.cuda.current_device())
+else:
+    device = torch.device("cpu")
+
+print(f"Testing with {device}")
+print()
+print("small matmul")
+a = torch.rand(3, 3).to(device)
+b = torch.rand(3, 3).to(device)
+res = torch.matmul(a, b)
+print(res)
+print(res.size())
+
+print()
+print("larger matmul")
+a = torch.rand(1280, 1280).to(device)
+b = torch.rand(1280, 1280).to(device)
+res = torch.matmul(a, b)
+print(res)
+print(res.size())
diff --git a/containers/rocm/Containerfile b/containers/rocm/Containerfile
new file mode 100644
index 000000000000..7cd13936db86
--- /dev/null
+++ b/containers/rocm/Containerfile
@@ -0,0 +1,98 @@
+# Christian Heimes
+# Based on Zack Zlotnik's container file
+
+# runtime container has libraries, CLI tools, and virtual env
+FROM registry.fedoraproject.org/fedora-toolbox:40 AS runtime
+# args and env (default to gfx1100, GFX level 11.0.0, first GPU only)
+ARG AMDGPU_ARCH=gfx1100
+ARG HSA_OVERRIDE_GFX_VERSION=11.0.0
+ARG HIP_VISIBLE_DEVICES=0
+ARG PYTORCH_ROCM_VERSION=5.7
+ENV AMDGPU_ARCH="${AMDGPU_ARCH}"
+ENV HIP_VISIBLE_DEVICES="${HIP_VISIBLE_DEVICES}"
+ENV HSA_OVERRIDE_GFX_VERSION="${HSA_OVERRIDE_GFX_VERSION}"
+ENV PYTORCH_ROCM_VERSION="${PYTORCH_ROCM_VERSION}"
+# runtime dependencies
+COPY containers/rocm/remove-gfx.sh /tmp/
+RUN --mount=type=cache,target=/var/cache/dnf,z \
+    dnf install -y --nodocs --setopt=install_weak_deps=False --setopt=keepcache=True \
+        rocm-runtime hipblas hiprand hipsparse lld-libs python3-pip nvtop radeontop make git gh && \
+    /tmp/remove-gfx.sh
+# virtual env, umask 0000 to allow end-user to write to venv later
+ENV VIRTUAL_ENV="/opt/rocm-venv"
+RUN umask 0000 && \
+    python3 -m venv ${VIRTUAL_ENV}
+ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
+ENV PS1="(rocm-venv) ${PS1}"
+# additional helpers to debug torch and llama
+COPY containers/bin/debug-* ${VIRTUAL_ENV}/bin
+
+
+# build env contains compilers and build dependencies
+FROM runtime AS buildenv
+RUN --mount=type=cache,target=/var/cache/dnf,z \
+    dnf install -y --nodocs --setopt=keepcache=True \
+        llvm compiler-rt clang-tools-extra lld python3-devel cmake ninja-build gcc \
+        rocblas-devel hip-devel hipblas-devel rocprim-devel rocthrust-devel hipsparse-devel hipcub-devel hiprand-devel
+
+
+FROM buildenv AS pytorch
+COPY requirements.txt ${VIRTUAL_ENV}/
+# pip constraint does not support optional dependencies.
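+# (the sed below strips extras markers such as "name[extra]" so the result is a plain constraints file)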
+RUN sed 's/\[.*\]//' ${VIRTUAL_ENV}/requirements.txt > ${VIRTUAL_ENV}/constraints.txt
+# chcon to work around pip's SELinux context shenanigans
+RUN --mount=type=cache,target=/root/.cache/pip,z \
+    umask 0000 && \
+    ${VIRTUAL_ENV}/bin/pip install torch --index-url https://download.pytorch.org/whl/rocm${PYTORCH_ROCM_VERSION} && \
+    /tmp/remove-gfx.sh && \
+    $(chcon -R -l s0 /root/.cache/pip 2>/dev/null || true)
+
+FROM pytorch AS llama
+# remove cached wheel to force rebuild
+RUN --mount=type=cache,target=/root/.cache/pip,z \
+    pip cache remove llama_cpp_python && \
+    umask 0000 && \
+    CMAKE_ARGS="-DLLAMA_HIPBLAS=on -DCMAKE_C_COMPILER=/usr/bin/clang -DCMAKE_CXX_COMPILER=/usr/bin/clang++ -DAMDGPU_TARGETS=${AMDGPU_ARCH}" \
+    FORCE_CMAKE=1 \
+    ${VIRTUAL_ENV}/bin/pip install -c ${VIRTUAL_ENV}/constraints.txt llama-cpp-python && \
+    $(chcon -R -l s0 /root/.cache/pip 2>/dev/null || true)
+
+
+FROM llama AS bitsandbytes
+RUN git clone --depth 1 -b rocm https://github.com/arlo-phoenix/bitsandbytes-rocm-5.6.git /tmp/bitsandbytes
+RUN git clone --depth 1 -b rocm-6.0.2 https://github.com/ROCm/hipBLASLt /tmp/hipblaslt
+RUN mkdir -p /tmp/bitsandbytes/include/hipblaslt && \
+    echo -e '#pragma once\n#ifndef HIPBLASLT_EXPORT\n#define HIPBLASLT_EXPORT\n#endif' > /tmp/bitsandbytes/include/hipblaslt/hipblaslt-export.h && \
+    touch /tmp/bitsandbytes/include/hipblaslt/hipblaslt-version.h && \
+    cp /tmp/hipblaslt/library/include/* /tmp/bitsandbytes/include/hipblaslt/
+RUN cd /tmp/bitsandbytes && \
+    make hip ROCM_TARGET="${AMDGPU_ARCH}" && \
+    umask 0000 && \
+    pip install -c ${VIRTUAL_ENV}/constraints.txt . && \
+    $(chcon -R -l s0 /root/.cache/pip 2>/dev/null || true)
+
+
+# install from requirements.txt last. pip does not override installed
+# packages unless there is a version conflict.
+FROM bitsandbytes AS pip-install
+RUN --mount=type=cache,target=/root/.cache/pip,z \
+    umask 0000 && \
+    ${VIRTUAL_ENV}/bin/pip install -r ${VIRTUAL_ENV}/requirements.txt && \
+    $(chcon -R -l s0 /root/.cache/pip 2>/dev/null || true)
+# debug and self-test
+RUN export
+RUN pip list
+# the attribute error from bitsandbytes is harmless
+RUN python3 -Wignore -c 'import llama_cpp, torch, bitsandbytes' 2>&1 | grep -v NoneType
+RUN find ${VIRTUAL_ENV} -name __pycache__ | xargs rm -rf
+
+
+# create final image from base runtime, copy virtual env into final stage
+FROM runtime AS final
+COPY --from=pip-install /opt/rocm-venv/lib/python3.12/site-packages /opt/rocm-venv/lib/python3.12/site-packages
+COPY --from=pip-install /opt/rocm-venv/bin /opt/rocm-venv/bin
+LABEL com.github.containers.toolbox="true" \
+      name="instructlab-rocm-base-${AMDGPU_ARCH}" \
+      usage="This image is meant to be used with the toolbox(1) command" \
+      summary="PyTorch, llama.cpp, and instruct-lab dependencies for AMD ROCm GPU ${AMDGPU_ARCH}" \
+      maintainer="Christian Heimes"
diff --git a/containers/rocm/README.md b/containers/rocm/README.md
new file mode 100644
index 000000000000..b60e1c853184
--- /dev/null
+++ b/containers/rocm/README.md
@@ -0,0 +1,80 @@
+# instruct-lab toolbox container for AMD ROCm GPUs
+
+The ROCm container file is designed for AMD GPUs with the RDNA3 architecture (`gfx1100`). The container can be built for RDNA2 (`gfx1030`) and older GPUs, too. Please refer to [AMD's system requirements](https://rocm.docs.amd.com/projects/install-on-linux/en/develop/reference/system-requirements.html) for a list of officially supported cards. ROCm is known to work on more consumer GPUs.
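+
+The image also ships two small helper scripts from this repository, `debug-pytorch` and `debug-llama`, in the virtual env's `bin` directory. They report whether PyTorch and llama.cpp can actually see the GPU, which is a quick way to check a card that is not officially supported. A minimal check, assuming the `instructlab` toolbox from the Quick start section below has been created and entered:
+
+```shell
+debug-pytorch             # prints the detected ROCm/HIP version and "GPU available: True/False"
+debug-llama --device gpu  # loads the model from the lab config and runs a short prompt
+```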
+
+The container file creates a [toolbox](https://github.com/containers/toolbox) container for the [`toolbox(1)`](https://www.mankier.com/1/toolbox) command line tool. A toolbox container has seamless access to the entire system, including the user's home directory, networking, hardware, SSH agent, and more.
+
+The container has all Python dependencies installed in a virtual env. The virtual env is already activated when you enter the container. However, the lab `cli` is **not** installed.
+
+## Quick start
+
+1. git clone the `cli` and `taxonomy` projects into a common folder in your
+   home directory (e.g. `~/path/to/instruct-lab`)
+2. add your account to the `render` and `video` groups: `sudo usermod -a -G render,video $LOGNAME`
+3. install the build dependencies for this container: `sudo dnf install toolbox podman make`
+4. build the container for RDNA3: `podman build -t localhost/instructlab:rocm-gf1100 -f containers/rocm/Containerfile .`
+5. create a toolbox: `toolbox create --image localhost/instructlab:rocm-gf1100 instructlab`
+6. enter the toolbox: `toolbox enter instructlab`. The container has your
+   home directory mounted.
+7. install the lab cli with `pip install -e ~/path/to/instruct-lab/cli/`
+
+`lab generate` and `lab chat` use the GPU automatically. `lab train` needs a
+more powerful and recent GPU and therefore does not use the GPU by default. To
+train on a GPU, run `lab train --device cuda`.
+
+
+## Building for other GPU architectures
+
+Use the `amdgpu-arch` or `rocminfo` tool to get the short name of your GPU:
+
+```shell
+dnf install clang-tools-extra rocminfo
+amdgpu-arch
+rocminfo | grep gfx
+```
+
+Map the name to an LLVM GPU target and an override GFX version. PyTorch 2.2.1+rocm5.7 provides a limited set of rocBLAS kernels. Fedora 40's ROCm packages have more kernels. For now we are limited to what the PyTorch binaries provide until Fedora ships `python-torch` with ROCm support.
+
+| Name      | xnack/USM | Version  | PyTorch | Fedora |
+|-----------|-----------|----------|:-------:|:------:|
+| `gfx900`  |           | `9.0.0`  | ✅      | ✅     |
+| `gfx906`  | `xnack-`  | `9.0.6`  | ✅      | ✅     |
+| `gfx908`  | `xnack-`  | `9.0.8`  | ✅      | ✅     |
+| `gfx90a`  | `xnack-`  | `9.0.10` | ✅      | ✅     |
+| `gfx90a`  | `xnack+`  | `9.0.10` | ✅      | ✅     |
+| `gfx940`  |           |          | ❌      | ✅     |
+| `gfx941`  |           |          | ❌      | ✅     |
+| `gfx942`  |           |          | ❌      | ✅     |
+| `gfx1010` |           |          | ❌      | ✅     |
+| `gfx1012` |           |          | ❌      | ✅     |
+| `gfx1030` |           | `10.3.0` | ✅      | ✅     |
+| `gfx1100` |           | `11.0.0` | ✅      | ✅     |
+| `gfx1101` |           |          | ❌      | ✅     |
+| `gfx1102` |           |          | ❌      | ✅     |
+
+If your card is not listed or unsupported, try the closest smaller value, e.g. for `gfx1031` use target `gfx1030` and override `10.3.0`. See [ROCm/ROCR-Runtime `isa.cpp`](https://github.com/ROCm/ROCR-Runtime/blob/rocm-6.0.2/src/core/runtime/isa.cpp#L245) and the [LLVM User Guide for AMDGPU](https://llvm.org/docs/AMDGPUUsage.html#processors) for more information.
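+
+The `gfx-version.sh` helper in this directory encodes the same mapping for the common consumer cards. A small example of how it can be used, run from the repository root for the `gfx1031` card mentioned above:
+
+```shell
+./containers/rocm/gfx-version.sh gfx1031
+# AMDGPU_ARCH=gfx1030
+# HSA_OVERRIDE_GFX_VERSION=10.3.0
+```
+
+The output uses the same `KEY=VALUE` format as `gfx1030.conf`, so it can be redirected to a file and passed to `podman build --build-arg-file` as shown below.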
+
+| Marketing Name          | Name      | Arch  | Target    | GFX version | Memory | Chat | Train |
+|-------------------------|-----------|-------|-----------|-------------|--------|:----:|:-----:|
+| AMD Radeon RX 7900 XT   | `gfx1100` | RDNA3 | `gfx1100` | `11.0.0`    | 20 GiB | ✅   | ✅    |
+| AMD Radeon RX 7900 XTX  | `gfx1100` | RDNA3 | `gfx1100` | `11.0.0`    | 24 GiB | ✅   | ✅    |
+| AMD Radeon RX 6700      | `gfx1031` | RDNA2 | `gfx1030` | `10.3.0`    | 10 GiB | ✅   | ❌    |
+
+Build the container with additional build arguments:
+
+```shell
+podman build \
+    --build-arg AMDGPU_ARCH="gfx1030" \
+    --build-arg HSA_OVERRIDE_GFX_VERSION="10.3.0" \
+    -f containers/rocm/Containerfile \
+    -t localhost/instructlab:rocm-gf1030 .
+```
+
+or use pre-defined build arguments from a config file:
+
+```shell
+podman build \
+    --build-arg-file containers/rocm/gfx1030.conf \
+    -f containers/rocm/Containerfile \
+    -t localhost/instructlab:rocm-gf1030 .
+```
diff --git a/containers/rocm/gfx-version.sh b/containers/rocm/gfx-version.sh
new file mode 100755
index 000000000000..1b68c254e41e
--- /dev/null
+++ b/containers/rocm/gfx-version.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+set -e
+
+NAME=$1
+
+if test -z "$NAME"; then
+    NAME=$(amdgpu-arch)
+fi
+
+case "$NAME" in
+    gfx803 | gfx900 | gfx906 | gfx908 | gfx90a | gfx940 | gfx941 | gfx942 | gfx1010 | gfx1012)
+        echo "ERROR: No mapping for '$NAME', yet" >&2
+        exit 3
+        ;;
+    gfx1030 | gfx1031 | gfx1032)
+        echo "AMDGPU_ARCH=gfx1030"
+        echo "HSA_OVERRIDE_GFX_VERSION=10.3.0"
+        ;;
+    gfx1100 | gfx1101 | gfx1102)
+        echo "AMDGPU_ARCH=gfx1100"
+        echo "HSA_OVERRIDE_GFX_VERSION=11.0.0"
+        ;;
+    *)
+        echo "ERROR: unknown or unsupported GFX name '$NAME'" >&2
+        exit 2
+esac
diff --git a/containers/rocm/gfx1030.conf b/containers/rocm/gfx1030.conf
new file mode 100644
index 000000000000..ecabc6a20fb0
--- /dev/null
+++ b/containers/rocm/gfx1030.conf
@@ -0,0 +1,3 @@
+# build arguments for AMD Radeon RX 6000 series
+AMDGPU_ARCH=gfx1030
+HSA_OVERRIDE_GFX_VERSION=10.3.0
diff --git a/containers/rocm/remove-gfx.sh b/containers/rocm/remove-gfx.sh
new file mode 100755
index 000000000000..41d37b0490a3
--- /dev/null
+++ b/containers/rocm/remove-gfx.sh
@@ -0,0 +1,46 @@
+#!/bin/sh
+set -e
+# Remove GPU support files that are not necessary for the current GPU arch
+
+TORCH="/opt/rocm-venv/lib/python3.12/site-packages/torch"
+
+case "$AMDGPU_ARCH" in
+    gfx9*)
+        rm -rf /usr/lib*/rocm/gfx8
+        rm -rf /usr/lib*/rocm/gfx10
+        rm -rf /usr/lib*/rocm/gfx11
+        rm -f /usr/lib*/rocblas/library/*gfx8*
+        rm -f /usr/lib*/rocblas/library/*gfx10*
+        rm -f /usr/lib*/rocblas/library/*gfx11*
+        rm -f ${TORCH}/lib/rocblas/library/*gfx8*
+        rm -f ${TORCH}/lib/rocblas/library/*gfx10*
+        rm -f ${TORCH}/lib/rocblas/library/*gfx11*
+        ;;
+    gfx10*)
+        rm -rf /usr/lib*/rocm/gfx8
+        rm -rf /usr/lib*/rocm/gfx9
+        rm -rf /usr/lib*/rocm/gfx11
+        rm -f /usr/lib*/rocblas/library/*gfx8*
+        rm -f /usr/lib*/rocblas/library/*gfx9*
+        rm -f /usr/lib*/rocblas/library/*gfx11*
+        rm -f ${TORCH}/lib/rocblas/library/*gfx8*
+        rm -f ${TORCH}/lib/rocblas/library/*gfx9*
+        rm -f ${TORCH}/lib/rocblas/library/*gfx11*
+        ;;
+    gfx11*)
+        rm -rf /usr/lib*/rocm/gfx8
+        rm -rf /usr/lib*/rocm/gfx9
+        rm -rf /usr/lib*/rocm/gfx10
+        rm -f /usr/lib*/rocblas/library/*gfx8*
+        rm -f /usr/lib*/rocblas/library/*gfx9*
+        rm -f /usr/lib*/rocblas/library/*gfx10*
+        rm -f ${TORCH}/lib/rocblas/library/*gfx8*
+        rm -f ${TORCH}/lib/rocblas/library/*gfx9*
+        rm -f ${TORCH}/lib/rocblas/library/*gfx10*
+        ;;
+    *)
+        echo "ERROR: $0 unknown AMDGPU_ARCH=$AMDGPU_ARCH"
+        exit 2
+esac
+
+# find /usr/lib* /opt/ -path '*/*gfx[189][0-9][0-9a-z]*' | grep -v $AMDGPU_ARCH | xargs rm -f