diff --git a/training/intel-bootc/Containerfile b/training/intel-bootc/Containerfile
index 9cb0a43a..bc51411d 100644
--- a/training/intel-bootc/Containerfile
+++ b/training/intel-bootc/Containerfile
@@ -1,4 +1,4 @@
-ARG DRIVER_TOOLKIT_IMAGE="quay.io/ai-lab/nvidia-builder:latest"
+ARG DRIVER_TOOLKIT_IMAGE="quay.io/ai-lab/intel-builder:latest"
 ARG BASEIMAGE="quay.io/centos-bootc/centos-bootc:stream9"
 
 FROM ${DRIVER_TOOLKIT_IMAGE} as builder
@@ -40,6 +40,10 @@ COPY --from=builder /home/builder/usr/src/habanalabs-${DRIVER_VERSION}/drivers/i
 COPY --from=builder /home/builder/usr/src/habanalabs-${DRIVER_VERSION}/drivers/net/ethernet/intel/hbl_cn/habanalabs_cn.ko /tmp/extra/habanalabs_cn.ko
 COPY --from=builder /home/builder/usr/src/habanalabs-${DRIVER_VERSION}/drivers/net/ethernet/intel/hbl_en/habanalabs_en.ko /tmp/extra/habanalabs_en.ko
 COPY --from=builder /home/builder/lib/firmware/habanalabs /tmp/firmware/habanalabs
+COPY --from=builder /home/builder/usr/bin/hl-smi /usr/bin/hl-smi
+
+
+COPY duplicated/common/usr /usr
 
 RUN . /etc/os-release \
     && export OS_VERSION_MAJOR=$(echo ${VERSION} | cut -d'.' -f 1) \
@@ -49,16 +53,17 @@ RUN . /etc/os-release \
     && mv /tmp/firmware/habanalabs /lib/firmware \
     && depmod -a ${KERNEL_VERSION}.${TARGET_ARCH}
 
-RUN dnf install -y ${EXTRA_RPM_PACKAGES} \
+# Move /etc/selinux aside while the packages are installed and restore it afterwards.
+RUN mv /etc/selinux /etc/selinux.tmp \
+    && dnf install -y ${EXTRA_RPM_PACKAGES} \
     cloud-init \
     skopeo \
     rsync \
     && dnf clean all \
+    && mv /etc/selinux.tmp /etc/selinux \
     && ln -s ../cloud-init.target /usr/lib/systemd/system/default.target.wants
 
-ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-intel:latest"
-
 ARG SSHPUBKEY
 
 # The --build-arg "SSHPUBKEY=$(cat ~/.ssh/id_rsa.pub)" option inserts your
 # public key into the image, allowing root access via ssh.
@@ -68,10 +71,38 @@ RUN if [ -n "${SSHPUBKEY}" ]; then \
     echo ${SSHPUBKEY} > /usr/ssh/root.keys && chmod 0600 /usr/ssh/root.keys; \
 fi
 
-# Prepull the instructlab image
-RUN if [ -f "/run/.input/instructlab-intel/oci-layout" ]; then \
+# Setup /usr/lib/containers/storage as an additional store for images.
+# Remove once the base images have this set by default.
+# Also make sure not to duplicate if a base image already has it specified.
+RUN grep -q /usr/lib/containers/storage /etc/containers/storage.conf || \
+    sed -i -e '/additionalimage.*/a "/usr/lib/containers/storage",' \
+    /etc/containers/storage.conf
+
+COPY duplicated/ilab-wrapper/ilab /usr/bin/ilab
+RUN chmod +x /usr/bin/ilab
+
+ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-intel:latest"
+ARG INSTRUCTLAB_IMAGE_PULL_SECRET="instructlab-intel-pull"
+
+# Fill in the image name template value of the ilab wrapper.
+# NOTE(review): __REPLACE_CONTAINER_DEVICE__ in /usr/bin/ilab is not substituted
+# anywhere in this Containerfile; confirm the right --device value for this
+# platform and add the substitution here.
+RUN sed -i "s%__REPLACE_IMAGE_NAME__%${INSTRUCTLAB_IMAGE}%" /usr/bin/ilab
+
+# Added for running as an OCI Container to prevent Overlay on Overlay issues.
+VOLUME /var/lib/containers
+
+RUN --mount=type=secret,id=${INSTRUCTLAB_IMAGE_PULL_SECRET}/.dockerconfigjson \
+    if [ -f "/run/.input/instructlab-intel/oci-layout" ]; then \
     IID=$(podman --root /usr/lib/containers/storage pull oci:/run/.input/instructlab-intel) && \
     podman --root /usr/lib/containers/storage image tag ${IID} ${INSTRUCTLAB_IMAGE}; \
+    elif [ -f "/run/secrets/${INSTRUCTLAB_IMAGE_PULL_SECRET}/.dockerconfigjson" ]; then \
+    IID=$(sudo podman --root /usr/lib/containers/storage pull --authfile /run/secrets/${INSTRUCTLAB_IMAGE_PULL_SECRET}/.dockerconfigjson ${INSTRUCTLAB_IMAGE}); \
     else \
     IID=$(sudo podman --root /usr/lib/containers/storage pull ${INSTRUCTLAB_IMAGE}); \
     fi
+RUN podman system reset --force 2>/dev/null
+
+LABEL image_version_id="${IMAGE_VERSION_ID}"
+
diff --git a/training/intel-bootc/duplicated/common/usr/lib/systemd/system/basic.target.wants/upgrade-informer.service b/training/intel-bootc/duplicated/common/usr/lib/systemd/system/basic.target.wants/upgrade-informer.service
new file mode 120000
index 00000000..e031b643
--- /dev/null
+++
b/training/intel-bootc/duplicated/common/usr/lib/systemd/system/basic.target.wants/upgrade-informer.service
@@ -0,0 +1 @@
+../upgrade-informer.service
\ No newline at end of file
diff --git a/training/intel-bootc/duplicated/common/usr/lib/systemd/system/timers.target.wants/upgrade-informer.timer b/training/intel-bootc/duplicated/common/usr/lib/systemd/system/timers.target.wants/upgrade-informer.timer
new file mode 120000
index 00000000..2b4f7f08
--- /dev/null
+++ b/training/intel-bootc/duplicated/common/usr/lib/systemd/system/timers.target.wants/upgrade-informer.timer
@@ -0,0 +1 @@
+../upgrade-informer.timer
\ No newline at end of file
diff --git a/training/intel-bootc/duplicated/common/usr/lib/systemd/system/upgrade-informer.service b/training/intel-bootc/duplicated/common/usr/lib/systemd/system/upgrade-informer.service
new file mode 100644
index 00000000..1e479959
--- /dev/null
+++ b/training/intel-bootc/duplicated/common/usr/lib/systemd/system/upgrade-informer.service
@@ -0,0 +1,14 @@
+[Unit]
+Description=Check for available RHEL AI upgrade
+ConditionPathExists=/run/ostree-booted
+# After= only orders this unit; Wants= is what pulls network-online.target in.
+Wants=network-online.target
+After=network-online.target
+StartLimitIntervalSec=400
+StartLimitBurst=3
+
+[Service]
+Type=oneshot
+ExecStart=/usr/libexec/upgrade-informer
+Restart=on-failure
+RestartSec=90
diff --git a/training/intel-bootc/duplicated/common/usr/lib/systemd/system/upgrade-informer.timer b/training/intel-bootc/duplicated/common/usr/lib/systemd/system/upgrade-informer.timer
new file mode 100644
index 00000000..229db9fa
--- /dev/null
+++ b/training/intel-bootc/duplicated/common/usr/lib/systemd/system/upgrade-informer.timer
@@ -0,0 +1,11 @@
+[Unit]
+Description=Runs upgrade informer periodically
+ConditionPathExists=/run/ostree-booted
+
+[Timer]
+OnBootSec=1h
+OnUnitInactiveSec=1day
+RandomizedDelaySec=2h
+
+[Install]
+WantedBy=timers.target
diff --git a/training/intel-bootc/duplicated/common/usr/libexec/upgrade-informer b/training/intel-bootc/duplicated/common/usr/libexec/upgrade-informer
new file
mode 100755
index 00000000..5f8979c7
--- /dev/null
+++ b/training/intel-bootc/duplicated/common/usr/libexec/upgrade-informer
@@ -0,0 +1,48 @@
+#!/bin/bash
+#
+# Checks whether a bootc upgrade is available and maintains a notice in
+# /etc/motd.d/upgrade-message: written when an update is found, removed when
+# none is. Exits non-zero when the check itself fails, so a supervising
+# service can tell a failed check from "no upgrade".
+
+message_file="/etc/motd.d/upgrade-message"
+bootc_auth="/etc/ostree/auth.json"
+
+# Capture the whole output before trimming it: piping straight into sed would
+# report sed's exit status and hide a failed "bootc upgrade --check".
+if ! check_output=$(bootc upgrade --check); then
+    echo "bootc upgrade --check failed" >&2
+    exit 1
+fi
+# Only the first line of the output is inspected.
+output=${check_output%%$'\n'*}
+
+if [[ $output == Update\ available* ]]; then
+    if [[ ! -f $message_file ]]; then
+        echo "New version was found"
+        bootc_image=$(awk '{print $4}' <<< "$output")
+        # If auth file exists we should use it
+        auth_params=()
+        if [[ -f $bootc_auth ]]; then
+            auth_params=(--authfile "$bootc_auth")
+        fi
+
+        # Get image version
+        image_version_id=$(skopeo inspect --format json "${auth_params[@]}" "$bootc_image" | jq -r '.Labels | .["image_version_id"] // empty')
+
+        # If upgrade available, write the output to the file
+        cat > "$message_file" << EOF
+
+** Attention! **
+** A new $image_version_id version is available **
+** In order to apply it run: bootc upgrade --apply
+** Please note that the system will reboot after the upgrade **
+
+EOF
+    fi
+else
+    echo "No upgrade was found"
+    rm -f -- "$message_file"
+fi
+
+echo "Finished running upgrade informer"
diff --git a/training/intel-bootc/duplicated/ilab-wrapper/ilab b/training/intel-bootc/duplicated/ilab-wrapper/ilab
new file mode 100755
index 00000000..26ac8634
--- /dev/null
+++ b/training/intel-bootc/duplicated/ilab-wrapper/ilab
@@ -0,0 +1,163 @@
+#!/bin/bash
+#
+# Wrapper that runs the InstructLab container via "sudo podman run". Arguments
+# are passed to the "ilab" entrypoint; "ilab shell" starts bash instead.
+
+# Print the arguments on stderr.
+echo-err() { echo "$@" >&2; }
+
+# Exit with an error unless exactly one /etc/subuid range matched the user.
+#   $1 - matching range(s), one per line
+#   $2 - user name, used in the error messages
+verify_range() {
+    local subuid_range="$1"
+    local username="$2"
+    local range_count=0
+
+    if [[ -n "$subuid_range" ]]; then
+        range_count=$(wc -l <<<"$subuid_range")
+    fi
+
+    if [[ "$range_count" == 0 ]]; then
+        echo-err "No /etc/subuid range found for user $username ($UID)"
+        exit 1
+    elif [[ "$range_count" != 1 ]]; then
+        # TODO: Handle multiple subuid ranges. But for now, hard fail
+        echo-err "Multiple /etc/subuid ranges found for user $username ($UID), this is currently unsupported:"
+        echo-err "$subuid_range"
+        exit 1
+    fi
+}
+
+# On RHEL hosts, refuse to run until the host is connected to Red Hat Insights.
+# Returns when /etc/insights-client/machine-id or /etc/ilab/insights-opt-out
+# exists or the OS ID is not "rhel"; otherwise prints instructions and exits 1.
+check_insights() {
+    if [[ -f /etc/insights-client/machine-id ]]; then
+        return
+    fi
+    if [[ -f /etc/ilab/insights-opt-out ]]; then
+        return
+    fi
+    local os_id
+    # Read ID by sourcing os-release in a subshell rather than eval'ing a
+    # grepped line; nothing from the file is evaluated in this shell.
+    os_id=$(. /etc/os-release && printf '%s' "$ID")
+    if [[ "$os_id" != "rhel" ]]; then
+        return
+    fi
+    cat << EOF
+This host is not connected to Red Hat Insights.
+
+To connect this host to Red Hat Insights run the following command:
+sudo rhc connect --organization <ORG_ID> --activation-key <KEY>
+
+To generate an Activation Key:
+https://console.redhat.com/insights/connector/activation-keys (this page will also display your Organization ID).
+
+For more information on Red Hat Insights, please visit:
+https://docs.redhat.com/en/documentation/subscription_central/1-latest/html/getting_started_with_activation_keys_on_the_hybrid_cloud_console/assembly-creating-managing-activation-keys
+EOF
+    exit 1
+}
+
+check_insights
+
+# Template values replaced by container build
+CONTAINER_DEVICE="__REPLACE_CONTAINER_DEVICE__"
+IMAGE_NAME="__REPLACE_IMAGE_NAME__"
+
+ENTRYPOINT="ilab"
+PARAMS=("$@")
+
+# ILAB_HOME, when set, overrides HOME.
+if [[ -n "$ILAB_HOME" ]]; then
+    HOME="$ILAB_HOME"
+fi
+
+for dir in "$HOME/.cache" "$HOME/.config" "$HOME/.local"; do
+    mkdir -p "$dir"
+done
+
+# "ilab shell" starts bash with no arguments.
+if [[ "$1" = "shell" ]]; then
+    ENTRYPOINT=bash
+    PARAMS=()
+fi
+
+# If you need to mount additional volumes into the container, you can specify them
+# using the ILAB_ADDITIONAL_MOUNTS environment variable.
+#
+# Usage example for ILAB_ADDITIONAL_MOUNTS:
+#
+# ILAB_ADDITIONAL_MOUNTS="/host/path:/container/path /host/path2:/container/path2"
+#
+# Paths that contain spaces can be quoted:
+#
+# ILAB_ADDITIONAL_MOUNTS="/host/path:/container/path '/host/path with spaces':/container/path"
+ADDITIONAL_MOUNTS=()
+if [[ -n "${ILAB_ADDITIONAL_MOUNTS}" ]]; then
+    # (eval applies shell quoting rules, so a mount may contain spaces)
+    eval "ADDITIONAL_MOUNTS=(${ILAB_ADDITIONAL_MOUNTS})"
+fi
+ADDITIONAL_MOUNT_OPTIONS=()
+for mount_spec in "${ADDITIONAL_MOUNTS[@]}"; do
+    ADDITIONAL_MOUNT_OPTIONS+=("-v" "$mount_spec")
+done
+
+# Add the pull-secret to the additional mounts; the first existing file wins.
+# A normal user has it under /run/user (XDG_RUNTIME_DIR), root under
+# /run/containers.
+for candidate in \
+    "${XDG_RUNTIME_DIR}/containers/auth.json" \
+    "/run/user/${UID}/containers/auth.json" \
+    "/run/containers/${UID}/auth.json"
+do
+    [[ -f "$candidate" ]] || continue
+    ADDITIONAL_MOUNT_OPTIONS+=("-v" "$candidate:/run/containers/0/auth.json")
+    break
+done
+
+# The container is started through sudo so that it can use root's container
+# storage, where the ilab image has been pre-pulled. For security reasons,
+# root UID 0 inside the container is mapped to the current user's UID (and
+# all other subuids to the user's /etc/subuid range), so the container
+# effectively runs as the current user.
+#
+# In the future, podman will be run as the current user, once there is a
+# reasonable way for the current user to access the root user's container
+# storage.
+# Root needs no UID mapping; any other user is mapped onto their own IDs.
+IMPERSONATE_CURRENT_USER_PODMAN_FLAGS=()
+if [[ "$UID" != 0 ]]; then
+    CURRENT_USER_NAME=$(id -un)
+    CURRENT_USER_SUBUID_RANGE=$(awk \
+        -F ':' \
+        -v current_user="$CURRENT_USER_NAME" \
+        -v current_uid="$UID" \
+        '$1 == current_user || $1 == current_uid {print $2 ":" $3}' \
+        /etc/subuid)
+
+    verify_range "$CURRENT_USER_SUBUID_RANGE" "$CURRENT_USER_NAME"
+
+    IMPERSONATE_CURRENT_USER_PODMAN_FLAGS+=("--uidmap" "0:$UID")
+    IMPERSONATE_CURRENT_USER_PODMAN_FLAGS+=("--uidmap" "1:$CURRENT_USER_SUBUID_RANGE")
+fi
+
+PRESERVE_ENV="VLLM_LOGGING_LEVEL,NCCL_DEBUG,HOME,HF_TOKEN"
+PODMAN_COMMAND=(sudo "--preserve-env=$PRESERVE_ENV" podman run --rm -it
+    "${IMPERSONATE_CURRENT_USER_PODMAN_FLAGS[@]}"
+    --device "${CONTAINER_DEVICE}"
+    --security-opt label=disable --net host
+    --shm-size 10G
+    --pids-limit -1
+    -v "$HOME:$HOME"
+    "${ADDITIONAL_MOUNT_OPTIONS[@]}"
+    --env VLLM_LOGGING_LEVEL
+    --env HOME
+    --env NCCL_DEBUG
+    --entrypoint "$ENTRYPOINT"
+    --env HF_TOKEN
+    "${IMAGE_NAME}")
+
+exec "${PODMAN_COMMAND[@]}" "${PARAMS[@]}"