diff --git a/training/amd-bootc/Containerfile b/training/amd-bootc/Containerfile
index 5b69e9fe..c26f52cb 100644
--- a/training/amd-bootc/Containerfile
+++ b/training/amd-bootc/Containerfile
@@ -1,43 +1,49 @@
-# Define the images to be used
 ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-amd:latest"
 ARG BASEIMAGE="quay.io/centos-bootc/centos-bootc:stream9"
+ARG DRIVER_TOOLKIT_IMAGE="quay.io/ai-lab/nvidia-builder:latest"
+
+FROM ${DRIVER_TOOLKIT_IMAGE} AS builder
+
+COPY repos.d/amdgpu.repo /etc/yum.repos.d/amdgpu.repo
+COPY repos.d/RPM-GPG-KEY-AMD-ROCM /etc/pki/rpm-gpg/RPM-GPG-KEY-AMD-ROCM
+
+USER root
+
+RUN rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-AMD-ROCM \
+    && dnf install -y amdgpu-dkms \
+    && dnf clean all
 
-# Define the base image for the second stage
 FROM ${BASEIMAGE}
 
 ARG VENDOR=''
 LABEL vendor=${VENDOR}
 LABEL org.opencontainers.image.vendor=${VENDOR}
 
-ADD rocm.repo /etc/yum.repos.d/rocm.repo
+RUN --mount=type=bind,from=builder,source=/,destination=/tmp/builder,ro \
+    export KERNEL_VERSION=$(rpm -q --qf '%{VERSION}-%{RELEASE}.%{ARCH}' kernel-core) \
+    && rm -f /lib/modules/${KERNEL_VERSION}/kernel/drivers/gpu/drm/amd/amdgpu/amdgpu.ko.xz \
+    && cp -r /tmp/builder/lib/modules/${KERNEL_VERSION}/extra /lib/modules/${KERNEL_VERSION}/extra \
+    && cp -r /tmp/builder/lib/firmware/updates/amdgpu /lib/firmware/amdgpu \
+    && depmod ${KERNEL_VERSION}
 
 ARG EXTRA_RPM_PACKAGES=''
-RUN mv /etc/selinux /etc/selinux.tmp && \
-    dnf install -y \
-    cloud-init \
-    pciutils \
-    rocm-smi \
-    tmux \
-    rsync \
-    skopeo \
-    ${EXTRA_RPM_PACKAGES} \
-    && dnf clean all \
-    && mv /etc/selinux.tmp /etc/selinux \
-    && ln -s ../cloud-init.target /usr/lib/systemd/system/default.target.wants
 
-# Setup /usr/lib/containers/storage as an additional store for images.
-# Remove once the base images have this set by default.
-RUN grep -q /usr/lib/containers/storage /etc/containers/storage.conf || \
-    sed -i -e '/additionalimage.*/a "/usr/lib/containers/storage",' \
-    /etc/containers/storage.conf && \
-    if [ -f "/run/.input/ilab" ]; then \
-    cp /run/.input/ilab /usr/bin/ilab; \
-    else \
-    curl -o /usr/bin/ilab "https://raw.githubusercontent.com/containers/ai-lab-recipes/main/training/ilab-wrapper/ilab"; \
-    fi \
-    && chmod +x /usr/bin/ilab
+COPY repos.d/rocm.repo /etc/yum.repos.d/rocm.repo
+COPY repos.d/RPM-GPG-KEY-AMD-ROCM /etc/pki/rpm-gpg/RPM-GPG-KEY-AMD-ROCM
 
-ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-amd:latest"
+RUN rpm --import /etc/pki/rpm-gpg/RPM-GPG-KEY-AMD-ROCM \
+    && mv /etc/selinux /etc/selinux.tmp \
+    && dnf install -y \
+    cloud-init \
+    pciutils \
+    rocm-smi \
+    rsync \
+    skopeo \
+    tmux \
+    ${EXTRA_RPM_PACKAGES} \
+    && dnf clean all \
+    && mv /etc/selinux.tmp /etc/selinux \
+    && ln -s ../cloud-init.target /usr/lib/systemd/system/default.target.wants
 
 ARG SSHPUBKEY
 
@@ -46,20 +52,37 @@ ARG SSHPUBKEY
 RUN if [ -n "${SSHPUBKEY}" ]; then \
     set -eu; mkdir -p /usr/ssh && \
         echo 'AuthorizedKeysFile /usr/ssh/%u.keys .ssh/authorized_keys .ssh/authorized_keys2' >> /etc/ssh/sshd_config.d/30-auth-system.conf && \
-	    echo ${SSHPUBKEY} > /usr/ssh/root.keys && chmod 0600 /usr/ssh/root.keys; \
+        echo ${SSHPUBKEY} > /usr/ssh/root.keys && chmod 0600 /usr/ssh/root.keys; \
 fi
 
-RUN sed -i 's/__REPLACE_TRAIN_DEVICE__/cuda/' /usr/bin/ilab
-RUN sed -i 's/__REPLACE_CONTAINER_DEVICE__/nvidia.com\/gpu=all/' /usr/bin/ilab
-RUN sed -i "s%__REPLACE_CONTAINER_NAME__%${INSTRUCTLAB_IMAGE}%" /usr/bin/ilab
+# Setup /usr/lib/containers/storage as an additional store for images.
+# Remove once the base images have this set by default.
+RUN grep -q /usr/lib/containers/storage /etc/containers/storage.conf || \
+    sed -i -e '/additionalimage.*/a "/usr/lib/containers/storage",' \
+    /etc/containers/storage.conf
+
+ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-amd:latest"
+ARG INSTRUCTLAB_IMAGE_PULL_SECRET="instructlab-amd-pull"
+
+COPY duplicated/ilab-wrapper/ilab /usr/bin/ilab
+RUN chmod +x /usr/bin/ilab \
+    && sed -i "s%__REPLACE_IMAGE_NAME__%${INSTRUCTLAB_IMAGE}%" /usr/bin/ilab
 
 # Added for running as an OCI Container to prevent Overlay on Overlay issues.
 VOLUME /var/lib/containers
 
-# Prepull the instructlab image
-RUN if [ -f "/run/.input/instructlab-amd/oci-layout" ]; then \
+RUN --mount=type=secret,id=${INSTRUCTLAB_IMAGE_PULL_SECRET}/.dockerconfigjson \
+    if [ -f "/run/.input/instructlab-amd/oci-layout" ]; then \
          IID=$(podman --root /usr/lib/containers/storage pull oci:/run/.input/instructlab-amd) && \
          podman --root /usr/lib/containers/storage image tag ${IID} ${INSTRUCTLAB_IMAGE}; \
+    elif [ -f "/run/secrets/${INSTRUCTLAB_IMAGE_PULL_SECRET}/.dockerconfigjson" ]; then \
+         IID=$(sudo podman --root /usr/lib/containers/storage pull --authfile /run/secrets/${INSTRUCTLAB_IMAGE_PULL_SECRET}/.dockerconfigjson ${INSTRUCTLAB_IMAGE}); \
     else \
          IID=$(sudo podman --root /usr/lib/containers/storage pull ${INSTRUCTLAB_IMAGE}); \
     fi
+
+RUN podman system reset --force 2>/dev/null
+
+# Declare IMAGE_VERSION_ID in this stage so a --build-arg value reaches the LABEL below; an undeclared name expands to an empty string.
+ARG IMAGE_VERSION_ID
+LABEL image_version_id="${IMAGE_VERSION_ID}"
diff --git a/training/amd-bootc/Makefile b/training/amd-bootc/Makefile
index 976a72a6..5154c582 100644
--- a/training/amd-bootc/Makefile
+++ b/training/amd-bootc/Makefile
@@ -11,7 +11,8 @@ bootc: prepare-files
 		$(ARCH:%=--platform linux/%) \
 		$(BUILD_ARG_FILE:%=--build-arg-file=%) \
 		$(EXTRA_RPM_PACKAGES:%=--build-arg EXTRA_RPM_PACKAGES=%) \
-		$(FROM:%=--from=%) \
+		$(DRIVER_TOOLKIT_IMAGE:%=--build-arg DRIVER_TOOLKIT_IMAGE=%) \
+		$(FROM:%=--build-arg BASEIMAGE=%) \
 		$(INSTRUCTLAB_IMAGE:%=--build-arg INSTRUCTLAB_IMAGE=%) \
 		$(SOURCE_DATE_EPOCH:%=--timestamp=%) \
 		$(VENDOR:%=--build-arg
VENDOR=%) \
diff --git a/training/amd-bootc/duplicated/ilab-wrapper/ilab b/training/amd-bootc/duplicated/ilab-wrapper/ilab
new file mode 100755
index 00000000..95a08578
--- /dev/null
+++ b/training/amd-bootc/duplicated/ilab-wrapper/ilab
@@ -0,0 +1,144 @@
+#!/bin/bash
+
+echo-err() { echo "$@" >&2; }
+
+verify_range() {
+    subuid_range="$1"
+    username="$2"
+    NUMBER_OF_MATCHING_SUBUID_RANGES=$(if [[ -z "$subuid_range" ]]; then echo 0; else wc -l <<<"$subuid_range"; fi)
+
+    if [[ "$NUMBER_OF_MATCHING_SUBUID_RANGES" == 0 ]]; then
+        echo-err "No /etc/subuid range found for user $username ($UID)"
+        exit 1
+    elif [[ "$NUMBER_OF_MATCHING_SUBUID_RANGES" != 1 ]]; then
+        # TODO: Handle multiple subuid ranges. For now, fail hard.
+        echo-err "Multiple /etc/subuid ranges found for user $username ($UID), this is currently unsupported:"
+        echo-err "$subuid_range"
+        exit 1
+    fi
+}
+
+check_insights() {
+    if [[ -f /etc/insights-client/machine-id ]]; then
+        return
+    fi
+    if [[ -f /etc/ilab/insights-opt-out ]]; then
+        return
+    fi
+    local ID
+    eval "$(grep ^ID= /etc/os-release)"
+    if [[ "$ID" != "rhel" ]]; then
+        return
+    fi
+    cat << EOF
+This host is not connected to Red Hat Insights.
+
+To connect this host to Red Hat Insights run the following command:
+sudo rhc connect --organization --activation-key
+
+To generate an Activation Key:
+https://console.redhat.com/insights/connector/activation-keys (this page will also display your Organization ID).
+
+For more information on Red Hat Insights, please visit:
+https://docs.redhat.com/en/documentation/subscription_central/1-latest/html/getting_started_with_activation_keys_on_the_hybrid_cloud_console/assembly-creating-managing-activation-keys
+EOF
+    exit 1
+}
+
+check_insights
+
+# Placeholder that the container build replaces with the real image name
+IMAGE_NAME="__REPLACE_IMAGE_NAME__"
+
+ENTRYPOINT="ilab"
+PARAMS=("$@")
+
+if [[ -n "$ILAB_HOME" ]]; then
+    HOME="$ILAB_HOME"
+fi
+
+for dir in "$HOME/.cache" "$HOME/.config" "$HOME/.local"; do
+    mkdir -p "$dir"
+done
+
+if [[ "$1" = "shell" ]]; then
+    ENTRYPOINT=bash
+    PARAMS=()
+fi
+
+# If you need to mount additional volumes into the container, you can specify them
+# using the ILAB_ADDITIONAL_MOUNTS environment variable.
+#
+# Example ILAB_ADDITIONAL_MOUNTS usage:
+#
+# ILAB_ADDITIONAL_MOUNTS="/host/path:/container/path /host/path2:/container/path2"
+#
+# If your path contains spaces, you can use quotes:
+#
+# ILAB_ADDITIONAL_MOUNTS="/host/path:/container/path '/host/path with spaces':/container/path"
+ADDITIONAL_MOUNTS=()
+if [ -n "${ILAB_ADDITIONAL_MOUNTS}" ]; then
+    # (eval is used here so that quoted mount specs containing spaces stay single array elements)
+    eval "ADDITIONAL_MOUNTS=(${ILAB_ADDITIONAL_MOUNTS})"
+fi
+ADDITIONAL_MOUNT_OPTIONS=()
+for PODMAN_MOUNT in "${ADDITIONAL_MOUNTS[@]}"; do
+    ADDITIONAL_MOUNT_OPTIONS+=("-v" "$PODMAN_MOUNT")
+done
+
+# Mount the first pull-secret auth file that exists at /run/containers/0/auth.json in the container.
+# A regular user's file is looked up under /run/user (XDG_RUNTIME_DIR), root's under /run/containers.
+for authfile in \
+    "${XDG_RUNTIME_DIR}/containers/auth.json" \
+    /run/user/${UID}/containers/auth.json \
+    /run/containers/${UID}/auth.json
+do
+    if [[ -f "$authfile" ]]; then
+        ADDITIONAL_MOUNT_OPTIONS+=("-v" "$authfile:/run/containers/0/auth.json")
+        break
+    fi
+done
+
+# We run podman under sudo in order to be able to access the root container
+# storage, which has the ilab image pre-pulled. But for security reasons we map
+# root UID 0 inside the container to the current user's UID (and all the other
+# subuids to the user's /etc/subuid range) so that we're effectively running
+# the container as the current user.
+#
+# In the future, we will run podman as the current user, once we figure out a
+# reasonable way for the current user to access the root user's container
+# storage.
+if [[ "$UID" == 0 ]]; then
+    # If we're already running as root, we don't need to map any UIDs
+    IMPERSONATE_CURRENT_USER_PODMAN_FLAGS=()
+else
+    CURRENT_USER_NAME=$(id --user --name)
+    CURRENT_USER_SUBUID_RANGE=$(awk \
+        --field-separator ':' \
+        --assign current_user="$CURRENT_USER_NAME" \
+        --assign current_uid="$UID" \
+        '$1 == current_user || $1 == current_uid {print $2 ":" $3}' \
+        /etc/subuid)
+
+    verify_range "$CURRENT_USER_SUBUID_RANGE" "$CURRENT_USER_NAME"
+
+    IMPERSONATE_CURRENT_USER_PODMAN_FLAGS=("--uidmap" "0:$UID" "--uidmap" "1:$CURRENT_USER_SUBUID_RANGE")
+fi
+
+PRESERVE_ENV="VLLM_LOGGING_LEVEL,NCCL_DEBUG,HOME,HF_TOKEN"
+PODMAN_COMMAND=("sudo" "--preserve-env=$PRESERVE_ENV" "podman" "run" "--rm" "-it"
+    "${IMPERSONATE_CURRENT_USER_PODMAN_FLAGS[@]}"
+    "--device" "/dev/kfd" "--device" "/dev/dri"
+    "--security-opt" "label=disable" "--net" "host"
+    "--shm-size" "10G"
+    "--pids-limit" "-1"
+    "-v" "$HOME:$HOME"
+    "${ADDITIONAL_MOUNT_OPTIONS[@]}"
+    "--env" "VLLM_LOGGING_LEVEL"
+    "--env" "HOME"
+    "--env" "NCCL_DEBUG"
+    "--entrypoint" "$ENTRYPOINT"
+    "--env" "HF_TOKEN"
+    "${IMAGE_NAME}")
+
+exec "${PODMAN_COMMAND[@]}" "${PARAMS[@]}"
diff --git a/training/amd-bootc/repos.d/RPM-GPG-KEY-AMD-ROCM b/training/amd-bootc/repos.d/RPM-GPG-KEY-AMD-ROCM
new file mode 100644
index 00000000..89815d0a
--- /dev/null
+++ b/training/amd-bootc/repos.d/RPM-GPG-KEY-AMD-ROCM
@@ -0,0 +1,52 @@
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+Version: GnuPG v1
+
+mQINBFefsSABEADmVqQyRi5bcUs/eG8mnKLdY+V+xuKuHLuujlXinSaMFRO640Md
+C2HNYLSd58Z8cB1rKfiN639CZp+SkDWq60cFXDCcX9djT0JmBzsTD/gwoMr16tMY
+O+Z2mje2pEYgDJdmYrephhXn29BfebW1IQKdA+4C7l675mJ/T8yVMUNXC0hqfGDA +h1MJUQy/lz1S2fGdjCKX0PiYOnCOyhNa7aTpw9PkZWgEa/s4BhplFZxvLohrCcf6 +ks0gUITHfeEhJvj2KurRfL68DgFifGnG+/fsMHgW1Xp19GsnIVaoh6cV7/iFHhrb +6YHI1fdOq/mwOfG8mJnXmDXC/o24Q7mRRwvoJcsT0j+thRirs8trV01mKY+7Hxd2 +CamWttibo062pjWN2aEUMPmEU2kmGOupsZtlpqn6SGCd2+6maOPMNEq/F0EWxhul +q6mgezVb8pvJ3bwvph2/lMSgfT9fHs6UIh4i/3rnA5/JaejFonlnS9xEuglKjklj +UoikSPBOwjvoPW2u99WCflURFSXVvuk7Ci+XkbVPIZyD6gFJjeY02Ic5MAv5tj/z +0fpgr/CfwEllms+z7qz768xRweA0kmPTTARdufVTna6EV3K3njxvCIIfnrp1cF6S +e3VrREd98gO0Rmzy74UFqkXl9Tb/+UILx1qVRmOBinwacKGqzo+k9jPUKQARAQAB +tChBTUQgTUxTRSBEZXZPcHMgPGRsLk1MU0UuRGV2T3BzQGFtZC5jb20+iQI+BBMB +AgAoAhsDBgsJCAcDAgYVCAIJCgsEFgIDAQIeAQIXgAUCYfuRkwUJE8Hh5wAKCRCT +hrSKGmk8XI1AEACSJLVGHCLJOOKz9fbUR4KWl7Gpv0RWccwxhH01jNZTSXUCEnKA +2KYmaqFvrT5szxWILobmCNYtAlbdkpUfb0mMaF3UtTu+1UMOw2ExzxHw1FyA+z6d +vLqDKXLldsOFUfojDUhD5cK6uvONPc1orCf/4ve6wnRG838bAzb4VrFR64IxfPjx +NukH+jo2nEXNpnNv44DEiq65CcObaPuwAVBFnRYD/ByPO4ZArxFXqNzHRxpoZkKv +iwzhbPG4cirioqzRR9y2SsC+a2sO4a/jH0wOL2+n4L86xShYcuCBxXvS/AwrV/aO +JxKOfAUV4VQegAOQz64L+iz7PslNSTILJGdvGcC5Ckgpo6evdWBT7KdGXhzf4S1f +wZjYyP9sfQa7LxqyrkLHZqYt4If4Jmukx7cApBYp1nPnuCQrLU6D4Arq0ZVWQuNV +hbABLeqwdVQcX+vG/Kr/ZC+Vkv3Z8oElwVGAAQ6HNXr/u8ud2bu6iNJ5mcQbM1HD +KTNt5LUrk0p588a8dk0/TyC5xeKSv51iNL+aOVaTr0pRwgaHtEVar2i0FPC1mkr4 +1hhIDddx8WLoUt/52f1juyr/4CpL1M5f1cbMVjV6i0kqIEx/hxrryc+fZZQT5R4M +vysxcsh8ttgpABG5vzz2rLOCanmQ4eDdmlugzn/u0ngoDdnC0gEfnVVutLkCDQRX +n7EgARAAlsWVKSOQicuBxBlo3U5tre5whSyAOWHuy6/heGwCkGssTahbIL8pRwOL +5nKJCPCKKJ4YYoZ+Jzer9WTsDRZU/zpQXK9C5WdfF6DN/Fai3lqhgeDDVyF0hUDr +NQigm/w66JEYTGtMcC5PnYv7S6Zrn9WN4anv9n5thNwfsqxpbbg6sAQ2aLHLsW96 +myQE9v1s0YoSZYc7rFYBwszE+tFX0kLlyBYSRVns/USQifu66RObO706d8DHp6Ro +vO6WgsTu+0RR2FEUabBx1q6iKe1cqK0FYtWd8tXCpqQBm0zGC6UwTp4Z4GMCX2Pk +3xAMmrItW5kPKCANB+P/8ZoOoZLIX5Fr9axQ496lUh0ZDhOACewJfj9Szk9GN5rq ++2QKnRepatevGBVaN0lCAEwg2q9/9xmrT6CixFrbnw2T6mWHM3jQrvduqmC0c1Cd +uMZBGDKSpjouaN0UKtC+udwWiY7w452pcjCnUjzjk7tR1IarSCnLLYeb+MDCK83M 
+CFH60SmBfdqjRiTiLas34KSKNnmbfUfrTYswf0Oed/qXAUSlYOCmWl4sV8n+Ebpy
+XfY80/fzu95RbpMEZMhUTRtvr64O5jaWM/lFnubnegGTW3Bk/fBR2VRsBx56ZHlc
+JH23f6IREjQ1x4B2UsINYfyYpmzb+R4qpMzycBVHv9ipiYQsQ8sAEQEAAYkCJQQY
+AQoADwIbDAUCYfuRtwUJE8HiEAAKCRCThrSKGmk8XMAcEACd0jYXjnu7qoEY4U9Q
+47X2SeJmWsuTavCrU5AWxjYwWd0mtDqK8EynxDPq7UFs+8+OukqrE++p0bfBbDl9
+TwnwmSSdizAZriHMSgeg9GR5KVL4mreNhFQdk/6mTFdlRhi5s7ZuvPayLSMIAWaj
+ET5gFMeO1B/ABSpaKEZwQjRcXrto/hCUJ++7qoosblhcgwX7fiqZZbMxcoCEQIQQ
+7ZasLxpVtaeDVfetp2zO5F0/e3D/sNbvBrlDofSt6D5V2cmKjLqONFVc6JrzSNeK
+k9Gn8UVzAKfRfLaQyDaoFV0MbBf3q111UQQPkvwZYp0lPT6t2/G8zoubwFhHsM31
+K5ZBbt0384hI9RJITo9/krXVXLYFeCLcoPKn/fGWgAwyYAYr6C7JcocxTNUyCd1I
+AVg4SO/JuC3NWFQK5LhknN/gJkFlLZdB2cWqu9dDIkx1cHXThaM2n/7GSxv7fzrI
+Br1jhZjUPWJ2iOd8iHgVEkIEvZql8z+huSxcNemodEN1emmUUoIyY3Fh0lJmozDt
+ZPATk3iPpksOApsDVhWXP96RjTYEozYCxgTxCnk+kX/iJIlt53BPNWm9HMTcmtDI
+v3s7OEcw0DN3U2VKcL9Q4Sg3uNfhwQsw/xBJaxAHQn5lN/8t0eLt+U653ooEEr0o
+ta5TfPumStSQ1UjP8pPny4l+JQ==
+=UOE+
+-----END PGP PUBLIC KEY BLOCK-----
diff --git a/training/amd-bootc/repos.d/amdgpu.repo b/training/amd-bootc/repos.d/amdgpu.repo
new file mode 100644
index 00000000..5297bb93
--- /dev/null
+++ b/training/amd-bootc/repos.d/amdgpu.repo
@@ -0,0 +1,7 @@
+[amdgpu]
+name=amdgpu
+baseurl=https://repo.radeon.com/amdgpu/6.1.2/el/9.4/main/x86_64/
+enabled=1
+priority=50
+gpgcheck=1
+gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
diff --git a/training/amd-bootc/repos.d/rocm.repo b/training/amd-bootc/repos.d/rocm.repo
new file mode 100644
index 00000000..bce007e9
--- /dev/null
+++ b/training/amd-bootc/repos.d/rocm.repo
@@ -0,0 +1,8 @@
+# Repo id and name mirror the ROCm release pinned in baseurl; amdgpu.repo pins the same 6.1.2 release.
+[ROCm-6.1.2]
+name=ROCm6.1.2
+baseurl=https://repo.radeon.com/rocm/el9/6.1.2/main
+enabled=1
+priority=50
+gpgcheck=1
+gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
diff --git a/training/amd-bootc/rocm.repo b/training/amd-bootc/rocm.repo
deleted file mode 100644
index 3b9f9fcc..00000000
--- a/training/amd-bootc/rocm.repo
+++ /dev/null
@@ -1,7 +0,0 @@
-[ROCm-6.0.2]
-name=ROCm6.0.2
-baseurl=https://repo.radeon.com/rocm/rhel$releasever/6.0.2/main
-enabled=1
-priority=50
-gpgcheck=1
-gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key