From 9cd0e706ab571eeb81e294b4e2e2c93c402d0445 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Fri, 17 May 2024 15:27:20 -0400 Subject: [PATCH] fix: reenable arm64 builds for docker (#3045) ### Summary Closes #3034 and reenables ARM64 in the docker build and publish job. This was taken out in #3039 because we've only built `libreoffice` for AMD64 and not ARM64. If Chainguard publishes an `apk` for `libreoffice`, we can support a Chainguard image for both architectures. The smoke test now differs between the two architectures, to reflect differences in the directory structure. ### Testing Build and publish ran successfully for ARM64 (job [here](https://github.com/Unstructured-IO/unstructured/actions/runs/9129712470/job/25104907497)) and AMD64 (job [here](https://github.com/Unstructured-IO/unstructured/actions/runs/9129712470/job/25104907826)). --- .github/workflows/docker-publish.yml | 28 ++++++++----------- Dockerfile => Dockerfile-amd64 | 0 Dockerfile-arm64 | 41 ++++++++++++++++++++++++++++ scripts/docker-build.sh | 2 +- scripts/docker-smoke-test.sh | 14 +++++++--- 5 files changed, 63 insertions(+), 22 deletions(-) rename Dockerfile => Dockerfile-amd64 (100%) create mode 100644 Dockerfile-arm64 diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 5cdda5724c..b0d877effc 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -24,10 +24,7 @@ jobs: build-images: strategy: matrix: - # NOTE(robinson) - temporarily disabling arm since the libreoffice packages only - # works on amd right now - docker-platform: ["linux/amd64"] - # docker-platform: ["linux/arm64", "linux/amd64"] + docker-platform: ["linux/arm64", "linux/amd64"] runs-on: ubuntu-latest-m needs: set-short-sha env: @@ -53,6 +50,7 @@ jobs: make docker-dl-packages ARCH=$(cut -d "/" -f2 <<< ${{ matrix.docker-platform }}) DOCKER_BUILDKIT=1 docker buildx build --platform=$ARCH --load \ + -f Dockerfile-$ARCH \ --build-arg 
PIP_VERSION=$PIP_VERSION \ --build-arg BUILDKIT_INLINE_CACHE=1 \ --progress plain \ @@ -72,8 +70,7 @@ jobs: DOCKER_PLATFORM="${{ matrix.docker-platform }}" DOCKER_IMAGE="$DOCKER_BUILD_REPOSITORY:$ARCH-$SHORT_SHA" \ make docker-test CI=true TEST_FILE=test_unstructured/partition/test_text.py fi - # NOTE(robinson) - disabling smoke because there's no notebook user anymore - # DOCKER_IMAGE=$DOCKER_BUILD_REPOSITORY:$ARCH-$SHORT_SHA make docker-smoke-test + DOCKER_IMAGE=$DOCKER_BUILD_REPOSITORY:$ARCH-$SHORT_SHA make docker-smoke-test - name: Push images run: | # write to the build repository to cache for the publish-images job @@ -97,25 +94,22 @@ jobs: - name: Pull AMD image run: | docker pull $DOCKER_BUILD_REPOSITORY:amd64-$SHORT_SHA - # NOTE(robinson) - put this back in when we reenable ARM - # - name: Pull ARM image - # run: | - # docker pull $DOCKER_BUILD_REPOSITORY:arm64-$SHORT_SHA + - name: Pull ARM image + run: | + docker pull $DOCKER_BUILD_REPOSITORY:arm64-$SHORT_SHA - name: Push latest build tags for AMD and ARM run: | # these are used to construct the final manifest but also cache-from in subsequent runs docker tag $DOCKER_BUILD_REPOSITORY:amd64-$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd64 docker push $DOCKER_BUILD_REPOSITORY:amd64 - # NOTE(robinson) - update this when we reenable ARM - # docker tag $DOCKER_BUILD_REPOSITORY:arm64-$SHORT_SHA $DOCKER_BUILD_REPOSITORY:arm64 - # docker push $DOCKER_BUILD_REPOSITORY:arm64 + docker tag $DOCKER_BUILD_REPOSITORY:arm64-$SHORT_SHA $DOCKER_BUILD_REPOSITORY:arm64 + docker push $DOCKER_BUILD_REPOSITORY:arm64 - name: Push multiarch manifest run: | - # NOTE(robinson) - update this when we reenable ARM - docker manifest create ${DOCKER_REPOSITORY}:latest $DOCKER_BUILD_REPOSITORY:amd64 + docker manifest create ${DOCKER_REPOSITORY}:latest $DOCKER_BUILD_REPOSITORY:amd64 $DOCKER_BUILD_REPOSITORY:arm64 docker manifest push $DOCKER_REPOSITORY:latest - docker manifest create ${DOCKER_REPOSITORY}:$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd64 
+ docker manifest create ${DOCKER_REPOSITORY}:$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd64 $DOCKER_BUILD_REPOSITORY:arm64 docker manifest push $DOCKER_REPOSITORY:$SHORT_SHA VERSION=$(grep -Po '(?<=__version__ = ")[^"]*' unstructured/__version__.py) - docker manifest create ${DOCKER_REPOSITORY}:$VERSION $DOCKER_BUILD_REPOSITORY:amd64 + docker manifest create ${DOCKER_REPOSITORY}:$VERSION $DOCKER_BUILD_REPOSITORY:amd64 $DOCKER_BUILD_REPOSITORY:arm64 docker manifest push $DOCKER_REPOSITORY:$VERSION diff --git a/Dockerfile b/Dockerfile-amd64 similarity index 100% rename from Dockerfile rename to Dockerfile-amd64 diff --git a/Dockerfile-arm64 b/Dockerfile-arm64 new file mode 100644 index 0000000000..18e9839005 --- /dev/null +++ b/Dockerfile-arm64 @@ -0,0 +1,41 @@ +# syntax=docker/dockerfile:experimental +FROM quay.io/unstructured-io/base-images:rocky9.2-9@sha256:73d8492452f086144d4b92b7931aa04719f085c74d16cae81e8826ef873729c9 as base + +# NOTE(crag): NB_USER ARG for mybinder.org compat: +# https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html +ARG NB_USER=notebook-user +ARG NB_UID=1000 +ARG PIP_VERSION + +# Set up environment +ENV HOME /home/${NB_USER} +ENV PYTHONPATH="${PYTHONPATH}:${HOME}" +ENV PATH="/home/usr/.local/bin:${PATH}" + +RUN groupadd --gid ${NB_UID} ${NB_USER} +RUN useradd --uid ${NB_UID} --gid ${NB_UID} ${NB_USER} +WORKDIR ${HOME} + +FROM base as deps +# Copy and install Unstructured +COPY requirements requirements + +RUN python3.10 -m pip install pip==${PIP_VERSION} && \ + dnf -y groupinstall "Development Tools" && \ + find requirements/ -type f -name "*.txt" -exec python3 -m pip install --no-cache -r '{}' ';' && \ + dnf -y groupremove "Development Tools" && \ + dnf clean all + +RUN python3.10 -c "import nltk; nltk.download('punkt')" && \ + python3.10 -c "import nltk; nltk.download('averaged_perceptron_tagger')" + +FROM deps as code + +USER ${NB_USER} + +COPY example-docs example-docs +COPY unstructured unstructured + +RUN python3.10 -c "from 
unstructured.partition.model_init import initialize; initialize()" + +CMD ["/bin/bash"] diff --git a/scripts/docker-build.sh b/scripts/docker-build.sh index b10eb5ddb7..2d15884e6e 100755 --- a/scripts/docker-build.sh +++ b/scripts/docker-build.sh @@ -5,7 +5,7 @@ DOCKER_REPOSITORY="${DOCKER_REPOSITORY:-quay.io/unstructured-io/unstructured}" PIP_VERSION="${PIP_VERSION:-23.1.2}" DOCKER_IMAGE="${DOCKER_IMAGE:-unstructured:dev}" -DOCKER_BUILD_CMD=(docker buildx build --load -f Dockerfile +DOCKER_BUILD_CMD=(docker buildx build --load -f Dockerfile-amd64 --build-arg PIP_VERSION="$PIP_VERSION" --build-arg BUILDKIT_INLINE_CACHE=1 --progress plain diff --git a/scripts/docker-smoke-test.sh b/scripts/docker-smoke-test.sh index 6cace034bb..e327419e89 100755 --- a/scripts/docker-smoke-test.sh +++ b/scripts/docker-smoke-test.sh @@ -38,10 +38,16 @@ trap stop_container EXIT await_container # Run the tests -docker cp test_unstructured_ingest $CONTAINER_NAME:/app -docker cp requirements/ingest $CONTAINER_NAME:/app/requirements/ingest -docker exec -u root "$CONTAINER_NAME" /bin/bash -c "chown -R nonroot:nonroot /app/test_unstructured_ingest" -docker exec "$CONTAINER_NAME" /bin/bash -c "/app/test_unstructured_ingest/src/wikipedia.sh" +if [[ "$DOCKER_IMAGE" == *"arm64"* ]]; then + docker cp test_unstructured_ingest $CONTAINER_NAME:/home/notebook-user + docker exec -u root "$CONTAINER_NAME" /bin/bash -c "chown -R 1000:1000 /home/notebook-user/test_unstructured_ingest" + docker exec "$CONTAINER_NAME" /bin/bash -c "/home/notebook-user/test_unstructured_ingest/src/wikipedia.sh" +else + docker cp test_unstructured_ingest $CONTAINER_NAME:/app + docker cp requirements/ingest $CONTAINER_NAME:/app/requirements/ingest + docker exec -u root "$CONTAINER_NAME" /bin/bash -c "chown -R nonroot:nonroot /app/test_unstructured_ingest" + docker exec "$CONTAINER_NAME" /bin/bash -c "/app/test_unstructured_ingest/src/wikipedia.sh" +fi result=$? exit $result