From 062841f7245bbc7d9c36f936e1eb447ff358ef6a Mon Sep 17 00:00:00 2001
From: Yevgeny Shnaidman
Date: Mon, 26 Aug 2024 19:48:13 +0300
Subject: [PATCH] updating AMD bootc image

1) using OOT driver and firmware instead of in-tree
2) moving to DKMS and ROCM 6.1.2 version

Signed-off-by: Yevgeny Shnaidman
---
NOTE(review): everything between the "---" line above and the first
"diff --git" line is commentary only; it is not part of the commit message
or of the change itself.

NOTE(review), state of this copy of the patch:
  * This copy had lost its line breaks; one-record-per-line structure was
    restored from the diff markers and the @@ hunk counts.
  * The line breaks inside the commit message are inferred.
  * Indentation of shell continuation lines is NOT recoverable and was
    chosen for readability; context lines may therefore not match the
    target file byte-for-byte (try `git apply --ignore-whitespace`).
  * Blank context lines are placed so that every hunk matches its @@ counts.
    In the second Containerfile hunk one blank context line is required by
    the counts but its position (assumed: after "ADD rocm.repo") is a guess
    - TODO confirm against the pre-image.
  * The "-"/"+" pairs for "RUN mv /etc/selinux ..." and
    "echo ${SSHPUBKEY} ..." differed only in whitespace; that difference is
    lost here, so both sides now read the same.
  * The From: header carries no e-mail address in this copy.

NOTE(review), points to verify in the change itself:
  * The COPY --from=builder destinations contain "/lib/modules/*/". The
    Dockerfile reference documents wildcards for COPY sources only, so the
    "*" is presumably taken literally and the modules would not land in the
    kernel directory that the later `depmod $KERNEL_VERSION` scans - confirm
    with the builder in use.
  * The builder stage compiles against the kernel version read from
    /etc/driver-toolkit-release.sh, while the final stage runs depmod for
    the kernel-core version installed in BASEIMAGE; nothing checks that the
    two versions are equal.
  * "/usr/src/amdgpu-6.7.0-1781449.el9" hard-codes the source directory
    name produced by the amdgpu-dkms package; a repo update that changes
    the package version would break that COPY.
  * The "ln -s ../cloud-init.target ..." runs in the "sources" stage, from
    which only /usr/src/amdgpu-6.7.0-1781449.el9 and
    /lib/firmware/updates/amdgpu are copied out, so the symlink does not
    reach the final image.

 training/amd-bootc/Containerfile     | 40 ++++++++++++++++++++++++++--
 training/amd-bootc/amdgpu.repo-6.1.2 |  7 +++++
 training/amd-bootc/rocm.repo         |  6 ++---
 3 files changed, 48 insertions(+), 5 deletions(-)
 create mode 100644 training/amd-bootc/amdgpu.repo-6.1.2

diff --git a/training/amd-bootc/Containerfile b/training/amd-bootc/Containerfile
index 5b69e9fe..09b91c72 100644
--- a/training/amd-bootc/Containerfile
+++ b/training/amd-bootc/Containerfile
@@ -1,6 +1,30 @@
 # Define the images to be used
 ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-amd:latest"
 ARG BASEIMAGE="quay.io/centos-bootc/centos-bootc:stream9"
+ARG DRIVER_TOOLKIT_IMAGE="quay.io/ai-lab/nvidia-builder:latest"
+
+# first stage image for installing the kernel modules and firmware sources
+FROM ${BASEIMAGE} as sources
+COPY amdgpu.repo-6.1.2 /etc/yum.repos.d/amdgpu-6.1.2.repo
+RUN mv /etc/selinux /etc/selinux.tmp\
+    && dnf install -y amdgpu-dkms \
+    && dnf clean all \
+    && mv /etc/selinux.tmp /etc/selinux \
+    && ln -s ../cloud-init.target /usr/lib/systemd/system/default.target.wants
+
+
+FROM ${DRIVER_TOOLKIT_IMAGE} as builder
+USER root
+#ARG KERNEL_VERSION
+COPY --from=sources /usr/src/amdgpu-6.7.0-1781449.el9 /amdgpu-drivers-source
+WORKDIR /amdgpu-drivers-source
+RUN KERNEL_VERSION=$(cat /etc/driver-toolkit-release.sh | cut -d'=' -f2 | cut -d'"' -f2) \
+    && ./amd/dkms/pre-build.sh ${KERNEL_VERSION} \
+    && make TTM_NAME=amdttm SCHED_NAME=amd-sched -C /usr/src/kernels/${KERNEL_VERSION} M=/amdgpu-drivers-source \
+    && ./amd/dkms/post-build.sh ${KERNEL_VERSION}
+#RUN ./amd/dkms/pre-build.sh ${KERNEL_VERSION}
+#RUN make TTM_NAME=amdttm SCHED_NAME=amd-sched -C /usr/src/kernels/${KERNEL_VERSION} M=/amdgpu-drivers-source
+#RUN ./amd/dkms/post-build.sh ${KERNEL_VERSION}
 
 # Define the base image for the second stage
 FROM ${BASEIMAGE}
@@ -9,10 +33,22 @@ ARG VENDOR=''
 LABEL vendor=${VENDOR}
 LABEL org.opencontainers.image.vendor=${VENDOR}
 
+COPY --from=builder /amdgpu-drivers-source/amd/amdgpu/amdgpu.ko /lib/modules/*/amd/amdgpu/amdgpu.ko
+COPY --from=builder /amdgpu-drivers-source/amd/amdkcl/amdkcl.ko /lib/modules/*/amd/amdkcl/amdkcl.ko
+COPY --from=builder /amdgpu-drivers-source/amd/amdxcp/amdxcp.ko /lib/modules/*/amd/amdxcp/amdxcp.ko
+COPY --from=builder /amdgpu-drivers-source/scheduler/amd-sched.ko /lib/modules/*/scheduler/amd-sched.ko
+COPY --from=builder /amdgpu-drivers-source/ttm/amdttm.ko /lib/modules/*/ttm/amdttm.ko
+COPY --from=builder /amdgpu-drivers-source/amddrm_buddy.ko /lib/modules/*/amddrm_buddy.ko
+COPY --from=builder /amdgpu-drivers-source/amddrm_ttm_helper.ko /lib/modules/*/amddrm_ttm_helper.ko
+RUN rm /lib/modules/*/kernel/drivers/gpu/drm/amd/amdgpu/amdgpu.ko.xz
+RUN export KERNEL_VERSION=$(rpm -q --qf "%{VERSION}-%{RELEASE}.%{ARCH}" kernel-core) \
+    && depmod $KERNEL_VERSION
+COPY --from=sources /lib/firmware/updates/amdgpu /lib/firmware/amdgpu
+
 ADD rocm.repo /etc/yum.repos.d/rocm.repo
 
 ARG EXTRA_RPM_PACKAGES=''
-RUN mv /etc/selinux /etc/selinux.tmp && \
+RUN mv /etc/selinux /etc/selinux.tmp && \
     dnf install -y \
     cloud-init \
     pciutils \
@@ -46,7 +82,7 @@ ARG SSHPUBKEY
 RUN if [ -n "${SSHPUBKEY}" ]; then \
     set -eu; mkdir -p /usr/ssh && \
     echo 'AuthorizedKeysFile /usr/ssh/%u.keys .ssh/authorized_keys .ssh/authorized_keys2' >> /etc/ssh/sshd_config.d/30-auth-system.conf && \
-    echo ${SSHPUBKEY} > /usr/ssh/root.keys && chmod 0600 /usr/ssh/root.keys; \
+    echo ${SSHPUBKEY} > /usr/ssh/root.keys && chmod 0600 /usr/ssh/root.keys; \
 fi
 
 RUN sed -i 's/__REPLACE_TRAIN_DEVICE__/cuda/' /usr/bin/ilab
diff --git a/training/amd-bootc/amdgpu.repo-6.1.2 b/training/amd-bootc/amdgpu.repo-6.1.2
new file mode 100644
index 00000000..574df1cb
--- /dev/null
+++ b/training/amd-bootc/amdgpu.repo-6.1.2
@@ -0,0 +1,7 @@
+[amdgpu]
+name=amdgpu
+baseurl=https://repo.radeon.com/amdgpu/6.1.2/rhel/9.4/main/x86_64/
+enabled=1
+priority=50
+gpgcheck=1
+gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
diff --git a/training/amd-bootc/rocm.repo b/training/amd-bootc/rocm.repo
index 3b9f9fcc..6936a7a9 100644
--- a/training/amd-bootc/rocm.repo
+++ b/training/amd-bootc/rocm.repo
@@ -1,6 +1,6 @@
-[ROCm-6.0.2]
-name=ROCm6.0.2
-baseurl=https://repo.radeon.com/rocm/rhel$releasever/6.0.2/main
+[ROCm-6.1.2]
+name=ROCm6.1.2
+baseurl=https://repo.radeon.com/rocm/rhel9/6.1.2/main
 enabled=1
 priority=50
 gpgcheck=1