From 3ee3d1ac606c95f6ba118777dede747d386db7eb Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Thu, 15 Feb 2024 18:18:45 +0100 Subject: [PATCH 01/23] [CI] Quantization workflow --- .github/workflows/self-scheduled.yml | 51 +++++++++++++++++ docker/transformers-all-latest-gpu/Dockerfile | 14 +---- .../transformers-quantization-gpu/Dockerfile | 56 +++++++++++++++++++ 3 files changed, 108 insertions(+), 13 deletions(-) create mode 100644 docker/transformers-quantization-gpu/Dockerfile diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index d44e9a29ecf0da..199451da6a3725 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -297,6 +297,57 @@ jobs: name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu + run_tests_quantization_torch_gpu: + name: Quantization tests + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] + container: + image: huggingface/transformers-quantization-latest-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + needs: setup + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run quantization tests on GPU + working-directory: /transformers + run: | + pip install -r examples/pytorch/_tests_requirements.txt + python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_quantization_torch_gpu tests/quantization + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu/failures_short.txt + + - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu" + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu + path: /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu + run_extract_warnings: name: Extract warnings in CI artifacts runs-on: ubuntu-22.04 diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index e96eb9539c8bd2..7d46c225704116 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -46,22 +46,10 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/acc RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/peft@main#egg=peft -# Add bitsandbytes for mixed int8 testing -RUN python3 -m pip install --no-cache-dir bitsandbytes - -# Add auto-gptq for gtpq quantization testing -RUN python3 -m pip install --no-cache-dir auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ - # Add einops for additional model testing RUN python3 -m pip install --no-cache-dir einops -# Add aqlm for quantization testing -RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.1 - -# Add autoawq for quantization testing -RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.1.8/autoawq-0.1.8+cu118-cp38-cp38-linux_x86_64.whl - -# For bettertransformer + gptq +# For bettertransformer RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum # For video model testing diff --git a/docker/transformers-quantization-gpu/Dockerfile b/docker/transformers-quantization-gpu/Dockerfile new file mode 100644 index 00000000000000..5ab1ee4ba6c429 --- /dev/null +++ b/docker/transformers-quantization-gpu/Dockerfile @@ -0,0 +1,56 @@ +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 +LABEL maintainer="Hugging Face" + +ARG DEBIAN_FRONTEND=noninteractive + +# Use login shell to read variables from `~/.profile` (to pass dynamic created variables between RUN commands) +SHELL ["sh", "-lc"] + +# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant +# to be used as arguments for docker build (so far). + +ARG PYTORCH='2.1.1' +# (not always a valid torch version) +ARG INTEL_TORCH_EXT='2.1.100' +# Example: `cu102`, `cu113`, etc. +ARG CUDA='cu118' + +RUN apt update +RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg +RUN python3 -m pip install --no-cache-dir --upgrade pip + +ARG REF=main +RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF + +# TODO: Handle these in a python utility script +RUN [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile +RUN echo torch=$VERSION +# `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build. +# Currently, let's just use their latest releases (when `torch` is installed with a release version) +# TODO: We might need to specify proper versions that work with a specific torch version (especially for past CI). +RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA + +RUN python3 -m pip install --no-cache-dir -e ./transformers[dev] + +RUN python3 -m pip uninstall -y flax jax + +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate + +# Add bitsandbytes for mixed int8 testing +RUN python3 -m pip install --no-cache-dir bitsandbytes + +# Add auto-gptq for gtpq quantization testing +RUN python3 -m pip install --no-cache-dir auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ + +# Add aqlm for quantization testing +RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.1 + +# Add autoawq for quantization testing +RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.1.8/autoawq-0.1.8+cu118-cp38-cp38-linux_x86_64.whl + +# For bettertransformer + gptq +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum + +# When installing in editable mode, `transformers` is not recognized as a package. +# this line must be added in order for python to be aware of transformers. +RUN cd transformers && python3 setup.py develop \ No newline at end of file From 3df06c1d1312a39d8a502fdbf018360c69c0ba5d Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Thu, 15 Feb 2024 19:26:39 +0100 Subject: [PATCH 02/23] build dockerfile --- .github/workflows/build-docker-images.yml | 38 +++++++++++++++++++ .../Dockerfile | 0 docs/source/en/hf_quantizer.md | 2 +- 3 files changed, 39 insertions(+), 1 deletion(-) rename docker/{transformers-quantization-gpu => transformers-quantization-latest-gpu}/Dockerfile (100%) diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index be070a95d3a94f..a22dd10bea474d 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -307,3 +307,41 @@ jobs: # REF=main # push: true # tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci + + latest-quantization-torch-docker: + name: "Latest Pytorch + Quantization [dev]" + # Push CI doesn't need this image + if: inputs.image_postfix != '-push-ci' + runs-on: ubuntu-22.04 + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-quantization-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-quantization-latest-gpu${{ inputs.image_postfix }} \ No newline at end of file diff --git a/docker/transformers-quantization-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile similarity index 100% rename from docker/transformers-quantization-gpu/Dockerfile rename to docker/transformers-quantization-latest-gpu/Dockerfile diff --git a/docs/source/en/hf_quantizer.md b/docs/source/en/hf_quantizer.md index 154cfb54b9ebc8..8261a6bc4585e1 100644 --- a/docs/source/en/hf_quantizer.md +++ b/docs/source/en/hf_quantizer.md @@ -66,4 +66,4 @@ For some quantization methods, they may require "pre-quantizing" the models thro 7. Document everything! Make sure your quantization method is documented in the [`docs/source/en/quantization.md`](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/docs/source/en/quantization.md) file. -8. Add tests! You should add tests by first adding the package in our nightly Dockerfile inside `docker/transformers-all-latest-gpu` and then adding a new test file in `tests/quantization/xxx`. Feel free to check out how it is implemented for other quantization methods. +8. Add tests! You should add tests by first adding the package in our nightly Dockerfile inside `docker/transformers-quantization-latest-gpu` and then adding a new test file in `tests/quantization/xxx`. Feel free to check out how it is implemented for other quantization methods. From 69a3ac57e71f7fc3f5cc911a26a7cc6ed9c4fc07 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Thu, 15 Feb 2024 19:35:37 +0100 Subject: [PATCH 03/23] fix dockerfile --- .../Dockerfile | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index 5ab1ee4ba6c429..7af84440cc6902 100644 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -10,8 +10,8 @@ SHELL ["sh", "-lc"] # to be used as arguments for docker build (so far). ARG PYTORCH='2.1.1' -# (not always a valid torch version) -ARG INTEL_TORCH_EXT='2.1.100' +ARG TORCH_VISION='' +ARG TORCH_AUDIO='' # Example: `cu102`, `cu113`, etc. ARG CUDA='cu118' @@ -22,13 +22,9 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip ARG REF=main RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF -# TODO: Handle these in a python utility script -RUN [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile -RUN echo torch=$VERSION -# `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build. -# Currently, let's just use their latest releases (when `torch` is installed with a release version) -# TODO: We might need to specify proper versions that work with a specific torch version (especially for past CI). -RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA +RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA +RUN [ ${#TORCH_VISION} -gt 0 ] && VERSION='torchvision=='TORCH_VISION'.*' || VERSION='torchvision'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA +RUN [ ${#TORCH_AUDIO} -gt 0 ] && VERSION='torchaudio=='TORCH_AUDIO'.*' || VERSION='torchaudio'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA RUN python3 -m pip install --no-cache-dir -e ./transformers[dev] @@ -42,15 +38,15 @@ RUN python3 -m pip install --no-cache-dir bitsandbytes # Add auto-gptq for gtpq quantization testing RUN python3 -m pip install --no-cache-dir auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ +# Add optimum for gptq quantization testing +RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum + # Add aqlm for quantization testing RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.1 # Add autoawq for quantization testing RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.1.8/autoawq-0.1.8+cu118-cp38-cp38-linux_x86_64.whl -# For bettertransformer + gptq -RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum - # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. RUN cd transformers && python3 setup.py develop \ No newline at end of file From f36265f1e125c2c05c85e5d95e8fd6fe9af63176 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Thu, 15 Feb 2024 19:46:17 +0100 Subject: [PATCH 04/23] update self-cheduled.yml --- .github/workflows/self-scheduled.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 199451da6a3725..3b63b7a688b001 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -358,7 +358,8 @@ jobs: run_examples_gpu, run_pipelines_tf_gpu, run_pipelines_torch_gpu, - run_all_tests_torch_cuda_extensions_gpu + run_all_tests_torch_cuda_extensions_gpu, + run_tests_quantization_torch_gpu, ] steps: - name: Checkout transformers @@ -406,6 +407,7 @@ jobs: run_pipelines_tf_gpu, run_pipelines_torch_gpu, run_all_tests_torch_cuda_extensions_gpu, + run_tests_quantization_torch_gpu, run_extract_warnings ] steps: From 745435530222156291259862f365530a62012510 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Fri, 16 Feb 2024 22:16:18 +0100 Subject: [PATCH 05/23] test build dockerfile on push --- .github/workflows/build-docker-images.yml | 427 +++++++++++----------- 1 file changed, 214 insertions(+), 213 deletions(-) diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index a22dd10bea474d..c52e1f49c1da86 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -3,7 +3,8 @@ name: Build docker images (scheduled) on: push: branches: - - build_ci_docker_image* + # - build_ci_docker_image* + - add-quantization-workflow repository_dispatch: workflow_call: inputs: @@ -18,195 +19,195 @@ concurrency: cancel-in-progress: false jobs: - latest-docker: - name: "Latest PyTorch + TensorFlow [dev]" - runs-on: ubuntu-22.04 - steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-all-latest-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }} - # Push CI images still need to be re-built daily - - - name: Build and push (for Push CI) in a daily basis - # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - if: inputs.image_postfix != '-push-ci' - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-all-latest-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-all-latest-gpu-push-ci + # latest-docker: + # name: "Latest PyTorch + TensorFlow [dev]" + # runs-on: ubuntu-22.04 + # steps: + # - name: Cleanup disk + # run: | + # sudo ls -l /usr/local/lib/ + # sudo ls -l /usr/share/ + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # sudo rm -rf /usr/local/lib/android + # sudo rm -rf /usr/share/dotnet + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - + # name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-all-latest-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }} + # # Push CI images still need to be re-built daily + # - + # name: Build and push (for Push CI) in a daily basis + # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + # if: inputs.image_postfix != '-push-ci' + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-all-latest-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-all-latest-gpu-push-ci - latest-torch-deepspeed-docker: - name: "Latest PyTorch + DeepSpeed" - runs-on: ubuntu-22.04 - steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-pytorch-deepspeed-latest-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }} + # latest-torch-deepspeed-docker: + # name: "Latest PyTorch + DeepSpeed" + # runs-on: ubuntu-22.04 + # steps: + # - name: Cleanup disk + # run: | + # sudo ls -l /usr/local/lib/ + # sudo ls -l /usr/share/ + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # sudo rm -rf /usr/local/lib/android + # sudo rm -rf /usr/share/dotnet + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - + # name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-deepspeed-latest-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }} - # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`) - latest-torch-deepspeed-docker-for-push-ci-daily-build: - name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" - runs-on: ubuntu-22.04 - steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - # Push CI images still need to be re-built daily - - - name: Build and push (for Push CI) in a daily basis - # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - if: inputs.image_postfix != '-push-ci' - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-pytorch-deepspeed-latest-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci + # # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`) + # latest-torch-deepspeed-docker-for-push-ci-daily-build: + # name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" + # runs-on: ubuntu-22.04 + # steps: + # - name: Cleanup disk + # run: | + # sudo ls -l /usr/local/lib/ + # sudo ls -l /usr/share/ + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # sudo rm -rf /usr/local/lib/android + # sudo rm -rf /usr/share/dotnet + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # # Push CI images still need to be re-built daily + # - + # name: Build and push (for Push CI) in a daily basis + # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + # if: inputs.image_postfix != '-push-ci' + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-deepspeed-latest-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci - doc-builder: - name: "Doc builder" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-22.04 - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-doc-builder - push: true - tags: huggingface/transformers-doc-builder + # doc-builder: + # name: "Doc builder" + # # Push CI doesn't need this image + # if: inputs.image_postfix != '-push-ci' + # runs-on: ubuntu-22.04 + # steps: + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - + # name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-doc-builder + # push: true + # tags: huggingface/transformers-doc-builder - latest-pytorch: - name: "Latest PyTorch [dev]" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-22.04 - steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-pytorch-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-gpu + # latest-pytorch: + # name: "Latest PyTorch [dev]" + # # Push CI doesn't need this image + # if: inputs.image_postfix != '-push-ci' + # runs-on: ubuntu-22.04 + # steps: + # - name: Cleanup disk + # run: | + # sudo ls -l /usr/local/lib/ + # sudo ls -l /usr/share/ + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # sudo rm -rf /usr/local/lib/android + # sudo rm -rf /usr/share/dotnet + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - + # name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-gpu # Need to be fixed with the help from Guillaume. # latest-pytorch-amd: @@ -244,33 +245,33 @@ jobs: # push: true # tags: huggingface/transformers-pytorch-amd-gpu-push-ci - latest-tensorflow: - name: "Latest TensorFlow [dev]" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-22.04 - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-tensorflow-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-tensorflow-gpu + # latest-tensorflow: + # name: "Latest TensorFlow [dev]" + # # Push CI doesn't need this image + # if: inputs.image_postfix != '-push-ci' + # runs-on: ubuntu-22.04 + # steps: + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - + # name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-tensorflow-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-tensorflow-gpu # latest-pytorch-deepspeed-amd: # name: "PyTorch + DeepSpeed (AMD) [dev]" From c745704dd2b0e1872fdc2d7bc05cd05f5497286d Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Fri, 16 Feb 2024 22:45:52 +0100 Subject: [PATCH 06/23] fix torch install --- docker/transformers-quantization-latest-gpu/Dockerfile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index 7af84440cc6902..c8c3c3405a836f 100644 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -10,8 +10,6 @@ SHELL ["sh", "-lc"] # to be used as arguments for docker build (so far). ARG PYTORCH='2.1.1' -ARG TORCH_VISION='' -ARG TORCH_AUDIO='' # Example: `cu102`, `cu113`, etc. ARG CUDA='cu118' @@ -22,9 +20,11 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip ARG REF=main RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF -RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA -RUN [ ${#TORCH_VISION} -gt 0 ] && VERSION='torchvision=='TORCH_VISION'.*' || VERSION='torchvision'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA -RUN [ ${#TORCH_AUDIO} -gt 0 ] && VERSION='torchaudio=='TORCH_AUDIO'.*' || VERSION='torchaudio'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA +RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile +RUN echo torch=$VERSION +# `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build. +# Currently, let's just use their latest releases (when `torch` is installed with a release version) +RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA RUN python3 -m pip install --no-cache-dir -e ./transformers[dev] From 8c34b969a88460b4e665b78883dc8560f802e1cb Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Fri, 16 Feb 2024 23:58:40 +0100 Subject: [PATCH 07/23] udapte to python 3.10 --- docker/transformers-quantization-latest-gpu/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index c8c3c3405a836f..21de9a1cce0588 100644 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -14,7 +14,7 @@ ARG PYTORCH='2.1.1' ARG CUDA='cu118' RUN apt update -RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg +RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3.10 python3-pip ffmpeg RUN python3 -m pip install --no-cache-dir --upgrade pip ARG REF=main From 7fc1a730d797c859bc1bdb3bc453fdacbc0960df Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Tue, 20 Feb 2024 20:50:00 +0100 Subject: [PATCH 08/23] update aqlm version --- docker/transformers-quantization-latest-gpu/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index 21de9a1cce0588..e7df09bf415b48 100644 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -14,7 +14,7 @@ ARG PYTORCH='2.1.1' ARG CUDA='cu118' RUN apt update -RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3.10 python3-pip ffmpeg +RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python python3-pip ffmpeg RUN python3 -m pip install --no-cache-dir --upgrade pip ARG REF=main @@ -42,7 +42,7 @@ RUN python3 -m pip install --no-cache-dir auto-gptq --extra-index-url https://hu RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum # Add aqlm for quantization testing -RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.1 +RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2 # Add autoawq for quantization testing RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.1.8/autoawq-0.1.8+cu118-cp38-cp38-linux_x86_64.whl From 471cb7b85382f72e6dbd7bf03670dbfec3b6a8fb Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 21 Feb 2024 15:43:44 +0100 Subject: [PATCH 09/23] uncomment build dockerfile --- .github/workflows/build-docker-images.yml | 495 +++++++++++----------- 1 file changed, 247 insertions(+), 248 deletions(-) diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index c52e1f49c1da86..079b6e10f1f973 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -3,8 +3,7 @@ name: Build docker images (scheduled) on: push: branches: - # - build_ci_docker_image* - - add-quantization-workflow + - build_ci_docker_image* repository_dispatch: workflow_call: inputs: @@ -19,195 +18,195 @@ concurrency: cancel-in-progress: false jobs: - # latest-docker: - # name: "Latest PyTorch + TensorFlow [dev]" - # runs-on: ubuntu-22.04 - # steps: - # - name: Cleanup disk - # run: | - # sudo ls -l /usr/local/lib/ - # sudo ls -l /usr/share/ - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # sudo rm -rf /usr/local/lib/android - # sudo rm -rf /usr/share/dotnet - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # - - # name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - - # name: Check out code - # uses: actions/checkout@v3 - # - - # name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # - - # name: Build and push - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-all-latest-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }} - # # Push CI images still need to be re-built daily - # - - # name: Build and push (for Push CI) in a daily basis - # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - # if: inputs.image_postfix != '-push-ci' - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-all-latest-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-all-latest-gpu-push-ci + latest-docker: + name: "Latest PyTorch + TensorFlow [dev]" + runs-on: ubuntu-22.04 + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-all-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }} + # Push CI images still need to be re-built daily + - + name: Build and push (for Push CI) in a daily basis + # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + if: inputs.image_postfix != '-push-ci' + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-all-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-all-latest-gpu-push-ci - # latest-torch-deepspeed-docker: - # name: "Latest PyTorch + DeepSpeed" - # runs-on: ubuntu-22.04 - # steps: - # - name: Cleanup disk - # run: | - # sudo ls -l /usr/local/lib/ - # sudo ls -l /usr/share/ - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # sudo rm -rf /usr/local/lib/android - # sudo rm -rf /usr/share/dotnet - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # - - # name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - - # name: Check out code - # uses: actions/checkout@v3 - # - - # name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # - - # name: Build and push - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-pytorch-deepspeed-latest-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }} + latest-torch-deepspeed-docker: + name: "Latest PyTorch + DeepSpeed" + runs-on: ubuntu-22.04 + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-deepspeed-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }} - # # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`) - # latest-torch-deepspeed-docker-for-push-ci-daily-build: - # name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" - # runs-on: ubuntu-22.04 - # steps: - # - name: Cleanup disk - # run: | - # sudo ls -l /usr/local/lib/ - # sudo ls -l /usr/share/ - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # sudo rm -rf /usr/local/lib/android - # sudo rm -rf /usr/share/dotnet - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # - - # name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - - # name: Check out code - # uses: actions/checkout@v3 - # - - # name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # # Push CI images still need to be re-built daily - # - - # name: Build and push (for Push CI) in a daily basis - # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - # if: inputs.image_postfix != '-push-ci' - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-pytorch-deepspeed-latest-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci + # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`) + latest-torch-deepspeed-docker-for-push-ci-daily-build: + name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" + runs-on: ubuntu-22.04 + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + # Push CI images still need to be re-built daily + - + name: Build and push (for Push CI) in a daily basis + # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + if: inputs.image_postfix != '-push-ci' + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-deepspeed-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci - # doc-builder: - # name: "Doc builder" - # # Push CI doesn't need this image - # if: inputs.image_postfix != '-push-ci' - # runs-on: ubuntu-22.04 - # steps: - # - - # name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - - # name: Check out code - # uses: actions/checkout@v3 - # - - # name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # - - # name: Build and push - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-doc-builder - # push: true - # tags: huggingface/transformers-doc-builder + doc-builder: + name: "Doc builder" + # Push CI doesn't need this image + if: inputs.image_postfix != '-push-ci' + runs-on: ubuntu-22.04 + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-doc-builder + push: true + tags: huggingface/transformers-doc-builder - # latest-pytorch: - # name: "Latest PyTorch [dev]" - # # Push CI doesn't need this image - # if: inputs.image_postfix != '-push-ci' - # runs-on: ubuntu-22.04 - # steps: - # - name: Cleanup disk - # run: | - # sudo ls -l /usr/local/lib/ - # sudo ls -l /usr/share/ - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # sudo rm -rf /usr/local/lib/android - # sudo rm -rf /usr/share/dotnet - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # - - # name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - - # name: Check out code - # uses: actions/checkout@v3 - # - - # name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # - - # name: Build and push - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-pytorch-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-pytorch-gpu + latest-pytorch: + name: "Latest PyTorch [dev]" + # Push CI doesn't need this image + if: inputs.image_postfix != '-push-ci' + runs-on: ubuntu-22.04 + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-gpu # Need to be fixed with the help from Guillaume. # latest-pytorch-amd: @@ -245,69 +244,69 @@ jobs: # push: true # tags: huggingface/transformers-pytorch-amd-gpu-push-ci - # latest-tensorflow: - # name: "Latest TensorFlow [dev]" - # # Push CI doesn't need this image - # if: inputs.image_postfix != '-push-ci' - # runs-on: ubuntu-22.04 - # steps: - # - - # name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - - # name: Check out code - # uses: actions/checkout@v3 - # - - # name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # - - # name: Build and push - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-tensorflow-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-tensorflow-gpu + latest-tensorflow: + name: "Latest TensorFlow [dev]" + # Push CI doesn't need this image + if: inputs.image_postfix != '-push-ci' + runs-on: ubuntu-22.04 + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-tensorflow-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-tensorflow-gpu - # latest-pytorch-deepspeed-amd: - # name: "PyTorch + DeepSpeed (AMD) [dev]" + latest-pytorch-deepspeed-amd: + name: "PyTorch + DeepSpeed (AMD) [dev]" - # runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] - # steps: - # - name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - name: Check out code - # uses: actions/checkout@v3 - # - name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # - name: Build and push - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-pytorch-deepspeed-amd-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }} - # # Push CI images still need to be re-built daily - # - - # name: Build and push (for Push CI) in a daily basis - # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - # if: inputs.image_postfix != '-push-ci' - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-pytorch-deepspeed-amd-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci + runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] + steps: + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Check out code + uses: actions/checkout@v3 + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-deepspeed-amd-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }} + # Push CI images still need to be re-built daily + - + name: Build and push (for Push CI) in a daily basis + # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + if: inputs.image_postfix != '-push-ci' + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-deepspeed-amd-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci latest-quantization-torch-docker: name: "Latest Pytorch + Quantization [dev]" From 2f45fdace277a152738d7afdc4a59908f2329ea2 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 21 Feb 2024 15:46:27 +0100 Subject: [PATCH 10/23] tests if the scheduler works --- .github/workflows/self-scheduled.yml | 475 ++++++++++++++------------- 1 file changed, 238 insertions(+), 237 deletions(-) diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 3b63b7a688b001..626ca8f0698987 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -12,7 +12,8 @@ on: - cron: "17 2 * * *" push: branches: - - run_scheduled_ci* + # - run_scheduled_ci* + - add-quantization-workflow env: HF_HOME: /mnt/cache @@ -70,232 +71,232 @@ jobs: run: | nvidia-smi - run_tests_gpu: - name: " " - needs: setup - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu, multi-gpu] - slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }} - uses: ./.github/workflows/model_jobs.yml - with: - folder_slices: ${{ needs.setup.outputs.folder_slices }} - machine_type: ${{ matrix.machine_type }} - slice_id: ${{ matrix.slice_id }} - secrets: inherit - - run_examples_gpu: - name: Examples directory - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] - container: - image: huggingface/transformers-all-latest-gpu - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: NVIDIA-SMI - run: | - nvidia-smi - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run examples tests on GPU - working-directory: /transformers - run: | - pip install -r examples/pytorch/_tests_requirements.txt - python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt - - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu" - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_examples_gpu - path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu - - run_pipelines_torch_gpu: - name: PyTorch pipelines - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] - container: - image: huggingface/transformers-pytorch-gpu - options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: NVIDIA-SMI - run: | - nvidia-smi - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all pipeline tests on GPU - working-directory: /transformers - run: | - python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt - - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu" - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu - path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu - - run_pipelines_tf_gpu: - name: TensorFlow pipelines - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] - container: - image: huggingface/transformers-tensorflow-gpu - options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Update clone - working-directory: /transformers - run: | - git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: NVIDIA-SMI - run: | - nvidia-smi - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run all pipeline tests on GPU - working-directory: /transformers - run: | - python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests/pipelines - - - name: Failure short reports - if: ${{ always() }} - run: | - cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt - - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu" - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu - path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu - - run_all_tests_torch_cuda_extensions_gpu: - name: Torch CUDA extension tests - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu, multi-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] - needs: setup - container: - image: huggingface/transformers-pytorch-deepspeed-latest-gpu - options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - steps: - - name: Update clone - working-directory: /workspace/transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall transformers in edit mode (remove the one installed during docker image build) - working-directory: /workspace/transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: Remove cached torch extensions - run: rm -rf /github/home/.cache/torch_extensions/ - - # To avoid unknown test failures - - name: Pre build DeepSpeed *again* - working-directory: /workspace - run: | - python3 -m pip uninstall -y deepspeed - DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - - - name: NVIDIA-SMI - run: | - nvidia-smi - - - name: Environment - working-directory: /workspace/transformers - run: | - python utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /workspace/transformers - run: pip freeze - - - name: Run all tests on GPU - working-directory: /workspace/transformers - run: | - python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt - - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports" - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports - path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu +# run_tests_gpu: +# name: " " +# needs: setup +# strategy: +# fail-fast: false +# matrix: +# machine_type: [single-gpu, multi-gpu] +# slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }} +# uses: ./.github/workflows/model_jobs.yml +# with: +# folder_slices: ${{ needs.setup.outputs.folder_slices }} +# machine_type: ${{ matrix.machine_type }} +# slice_id: ${{ matrix.slice_id }} +# secrets: inherit + +# run_examples_gpu: +# name: Examples directory +# strategy: +# fail-fast: false +# matrix: +# machine_type: [single-gpu] +# runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] +# container: +# image: huggingface/transformers-all-latest-gpu +# options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ +# needs: setup +# steps: +# - name: Update clone +# working-directory: /transformers +# run: git fetch && git checkout ${{ github.sha }} + +# - name: Reinstall transformers in edit mode (remove the one installed during docker image build) +# working-directory: /transformers +# run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + +# - name: NVIDIA-SMI +# run: | +# nvidia-smi + +# - name: Environment +# working-directory: /transformers +# run: | +# python3 utils/print_env.py + +# - name: Show installed libraries and their versions +# working-directory: /transformers +# run: pip freeze + +# - name: Run examples tests on GPU +# working-directory: /transformers +# run: | +# pip install -r examples/pytorch/_tests_requirements.txt +# python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch + +# - name: Failure short reports +# if: ${{ failure() }} +# continue-on-error: true +# run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt + +# - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu" +# if: ${{ always() }} +# uses: actions/upload-artifact@v3 +# with: +# name: ${{ matrix.machine_type }}_run_examples_gpu +# path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu + +# run_pipelines_torch_gpu: +# name: PyTorch pipelines +# strategy: +# fail-fast: false +# matrix: +# machine_type: [single-gpu, multi-gpu] +# runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] +# container: +# image: huggingface/transformers-pytorch-gpu +# options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ +# needs: setup +# steps: +# - name: Update clone +# working-directory: /transformers +# run: git fetch && git checkout ${{ github.sha }} + +# - name: Reinstall transformers in edit mode (remove the one installed during docker image build) +# working-directory: /transformers +# run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + +# - name: NVIDIA-SMI +# run: | +# nvidia-smi + +# - name: Environment +# working-directory: /transformers +# run: | +# python3 utils/print_env.py + +# - name: Show installed libraries and their versions +# working-directory: /transformers +# run: pip freeze + +# - name: Run all pipeline tests on GPU +# working-directory: /transformers +# run: | +# python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines + +# - name: Failure short reports +# if: ${{ failure() }} +# continue-on-error: true +# run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt + +# - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu" +# if: ${{ always() }} +# uses: actions/upload-artifact@v3 +# with: +# name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu +# path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu + +# run_pipelines_tf_gpu: +# name: TensorFlow pipelines +# strategy: +# fail-fast: false +# matrix: +# machine_type: [single-gpu, multi-gpu] +# runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] +# container: +# image: huggingface/transformers-tensorflow-gpu +# options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ +# needs: setup +# steps: +# - name: Update clone +# working-directory: /transformers +# run: | +# git fetch && git checkout ${{ github.sha }} + +# - name: Reinstall transformers in edit mode (remove the one installed during docker image build) +# working-directory: /transformers +# run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + +# - name: NVIDIA-SMI +# run: | +# nvidia-smi + +# - name: Environment +# working-directory: /transformers +# run: | +# python3 utils/print_env.py + +# - name: Show installed libraries and their versions +# working-directory: /transformers +# run: pip freeze + +# - name: Run all pipeline tests on GPU +# working-directory: /transformers +# run: | +# python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests/pipelines + +# - name: Failure short reports +# if: ${{ always() }} +# run: | +# cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt + +# - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu" +# if: ${{ always() }} +# uses: actions/upload-artifact@v3 +# with: +# name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu +# path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu + +# run_all_tests_torch_cuda_extensions_gpu: +# name: Torch CUDA extension tests +# strategy: +# fail-fast: false +# matrix: +# machine_type: [single-gpu, multi-gpu] +# runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] +# needs: setup +# container: +# image: huggingface/transformers-pytorch-deepspeed-latest-gpu +# options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ +# steps: +# - name: Update clone +# working-directory: /workspace/transformers +# run: git fetch && git checkout ${{ github.sha }} + +# - name: Reinstall transformers in edit mode (remove the one installed during docker image build) +# working-directory: /workspace/transformers +# run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + +# - name: Remove cached torch extensions +# run: rm -rf /github/home/.cache/torch_extensions/ + +# # To avoid unknown test failures +# - name: Pre build DeepSpeed *again* +# working-directory: /workspace +# run: | +# python3 -m pip uninstall -y deepspeed +# DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + +# - name: NVIDIA-SMI +# run: | +# nvidia-smi + +# - name: Environment +# working-directory: /workspace/transformers +# run: | +# python utils/print_env.py + +# - name: Show installed libraries and their versions +# working-directory: /workspace/transformers +# run: pip freeze + +# - name: Run all tests on GPU +# working-directory: /workspace/transformers +# run: | +# python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended + +# - name: Failure short reports +# if: ${{ failure() }} +# continue-on-error: true +# run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt + +# - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports" +# if: ${{ always() }} +# uses: actions/upload-artifact@v3 +# with: +# name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports +# path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu run_tests_quantization_torch_gpu: name: Quantization tests @@ -354,11 +355,11 @@ jobs: if: always() needs: [ setup, - run_tests_gpu, - run_examples_gpu, - run_pipelines_tf_gpu, - run_pipelines_torch_gpu, - run_all_tests_torch_cuda_extensions_gpu, + # run_tests_gpu, + # run_examples_gpu, + # run_pipelines_tf_gpu, + # run_pipelines_torch_gpu, + # run_all_tests_torch_cuda_extensions_gpu, run_tests_quantization_torch_gpu, ] steps: @@ -402,11 +403,11 @@ jobs: if: always() needs: [ setup, - run_tests_gpu, - run_examples_gpu, - run_pipelines_tf_gpu, - run_pipelines_torch_gpu, - run_all_tests_torch_cuda_extensions_gpu, + # run_tests_gpu, + # run_examples_gpu, + # run_pipelines_tf_gpu, + # run_pipelines_torch_gpu, + # run_all_tests_torch_cuda_extensions_gpu, run_tests_quantization_torch_gpu, run_extract_warnings ] From 67cd706c474408ecd031772e20512bc4525e7e60 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 21 Feb 2024 15:48:51 +0100 Subject: [PATCH 11/23] fix docker --- .github/workflows/build-docker-images.yml | 68 +++++++++++------------ 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index 079b6e10f1f973..a22dd10bea474d 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -272,41 +272,41 @@ jobs: push: true tags: huggingface/transformers-tensorflow-gpu - latest-pytorch-deepspeed-amd: - name: "PyTorch + DeepSpeed (AMD) [dev]" + # latest-pytorch-deepspeed-amd: + # name: "PyTorch + DeepSpeed (AMD) [dev]" - runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] - steps: - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - name: Check out code - uses: actions/checkout@v3 - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-pytorch-deepspeed-amd-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }} - # Push CI images still need to be re-built daily - - - name: Build and push (for Push CI) in a daily basis - # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - if: inputs.image_postfix != '-push-ci' - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-pytorch-deepspeed-amd-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci + # runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] + # steps: + # - name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - name: Check out code + # uses: actions/checkout@v3 + # - name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-deepspeed-amd-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }} + # # Push CI images still need to be re-built daily + # - + # name: Build and push (for Push CI) in a daily basis + # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + # if: inputs.image_postfix != '-push-ci' + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-deepspeed-amd-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci latest-quantization-torch-docker: name: "Latest Pytorch + Quantization [dev]" From 99d0456dca02738967987423e2758709aeb08ac2 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 21 Feb 2024 15:49:16 +0100 Subject: [PATCH 12/23] do not trigger on psuh again --- .github/workflows/self-scheduled.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 626ca8f0698987..d9fc7a8617a456 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -12,8 +12,8 @@ on: - cron: "17 2 * * *" push: branches: - # - run_scheduled_ci* - - add-quantization-workflow + - run_scheduled_ci* + # - add-quantization-workflow env: HF_HOME: /mnt/cache From aca17cf3cfe1225ad4c9f385e619fcaf9a273400 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 21 Feb 2024 17:03:39 +0100 Subject: [PATCH 13/23] add additional runs --- utils/notification_service.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/notification_service.py b/utils/notification_service.py index 39a0fb840cf5ad..ac97f988c97298 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -1043,6 +1043,7 @@ def prepare_reports(title, header, reports, to_truncate=True): "PyTorch pipelines": "run_tests_torch_pipeline_gpu", "TensorFlow pipelines": "run_tests_tf_pipeline_gpu", "Torch CUDA extension tests": "run_tests_torch_cuda_extensions_gpu_test_reports", + "Quantization tests": "run_tests_quantization_torch_gpu" } if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI"): From a796a5eea8cd7a979821f5b8989fecfaa3a20653 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 21 Feb 2024 17:04:04 +0100 Subject: [PATCH 14/23] test again --- .github/workflows/self-scheduled.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index d9fc7a8617a456..626ca8f0698987 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -12,8 +12,8 @@ on: - cron: "17 2 * * *" push: branches: - - run_scheduled_ci* - # - add-quantization-workflow + # - run_scheduled_ci* + - add-quantization-workflow env: HF_HOME: /mnt/cache From e60712d09f5a8a6feb6e22f168f5371fe0b5f24b Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 21 Feb 2024 17:36:03 +0100 Subject: [PATCH 15/23] all good --- .github/workflows/self-scheduled.yml | 475 +++++++++++++-------------- 1 file changed, 237 insertions(+), 238 deletions(-) diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 626ca8f0698987..3b63b7a688b001 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -12,8 +12,7 @@ on: - cron: "17 2 * * *" push: branches: - # - run_scheduled_ci* - - add-quantization-workflow + - run_scheduled_ci* env: HF_HOME: /mnt/cache @@ -71,232 +70,232 @@ jobs: run: | nvidia-smi -# run_tests_gpu: -# name: " " -# needs: setup -# strategy: -# fail-fast: false -# matrix: -# machine_type: [single-gpu, multi-gpu] -# slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }} -# uses: ./.github/workflows/model_jobs.yml -# with: -# folder_slices: ${{ needs.setup.outputs.folder_slices }} -# machine_type: ${{ matrix.machine_type }} -# slice_id: ${{ matrix.slice_id }} -# secrets: inherit - -# run_examples_gpu: -# name: Examples directory -# strategy: -# fail-fast: false -# matrix: -# machine_type: [single-gpu] -# runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] -# container: -# image: huggingface/transformers-all-latest-gpu -# options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ -# needs: setup -# steps: -# - name: Update clone -# working-directory: /transformers -# run: git fetch && git checkout ${{ github.sha }} - -# - name: Reinstall transformers in edit mode (remove the one installed during docker image build) -# working-directory: /transformers -# run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - -# - name: NVIDIA-SMI -# run: | -# nvidia-smi - -# - name: Environment -# working-directory: /transformers -# run: | -# python3 utils/print_env.py - -# - name: Show installed libraries and their versions -# working-directory: /transformers -# run: pip freeze - -# - name: Run examples tests on GPU -# working-directory: /transformers -# run: | -# pip install -r examples/pytorch/_tests_requirements.txt -# python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch - -# - name: Failure short reports -# if: ${{ failure() }} -# continue-on-error: true -# run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt - -# - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu" -# if: ${{ always() }} -# uses: actions/upload-artifact@v3 -# with: -# name: ${{ matrix.machine_type }}_run_examples_gpu -# path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu - -# run_pipelines_torch_gpu: -# name: PyTorch pipelines -# strategy: -# fail-fast: false -# matrix: -# machine_type: [single-gpu, multi-gpu] -# runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] -# container: -# image: huggingface/transformers-pytorch-gpu -# options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ -# needs: setup -# steps: -# - name: Update clone -# working-directory: /transformers -# run: git fetch && git checkout ${{ github.sha }} - -# - name: Reinstall transformers in edit mode (remove the one installed during docker image build) -# working-directory: /transformers -# run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - -# - name: NVIDIA-SMI -# run: | -# nvidia-smi - -# - name: Environment -# working-directory: /transformers -# run: | -# python3 utils/print_env.py - -# - name: Show installed libraries and their versions -# working-directory: /transformers -# run: pip freeze - -# - name: Run all pipeline tests on GPU -# working-directory: /transformers -# run: | -# python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines - -# - name: Failure short reports -# if: ${{ failure() }} -# continue-on-error: true -# run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt - -# - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu" -# if: ${{ always() }} -# uses: actions/upload-artifact@v3 -# with: -# name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu -# path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu - -# run_pipelines_tf_gpu: -# name: TensorFlow pipelines -# strategy: -# fail-fast: false -# matrix: -# machine_type: [single-gpu, multi-gpu] -# runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] -# container: -# image: huggingface/transformers-tensorflow-gpu -# options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ -# needs: setup -# steps: -# - name: Update clone -# working-directory: /transformers -# run: | -# git fetch && git checkout ${{ github.sha }} - -# - name: Reinstall transformers in edit mode (remove the one installed during docker image build) -# working-directory: /transformers -# run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - -# - name: NVIDIA-SMI -# run: | -# nvidia-smi - -# - name: Environment -# working-directory: /transformers -# run: | -# python3 utils/print_env.py - -# - name: Show installed libraries and their versions -# working-directory: /transformers -# run: pip freeze - -# - name: Run all pipeline tests on GPU -# working-directory: /transformers -# run: | -# python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests/pipelines - -# - name: Failure short reports -# if: ${{ always() }} -# run: | -# cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt - -# - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu" -# if: ${{ always() }} -# uses: actions/upload-artifact@v3 -# with: -# name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu -# path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu - -# run_all_tests_torch_cuda_extensions_gpu: -# name: Torch CUDA extension tests -# strategy: -# fail-fast: false -# matrix: -# machine_type: [single-gpu, multi-gpu] -# runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] -# needs: setup -# container: -# image: huggingface/transformers-pytorch-deepspeed-latest-gpu -# options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ -# steps: -# - name: Update clone -# working-directory: /workspace/transformers -# run: git fetch && git checkout ${{ github.sha }} - -# - name: Reinstall transformers in edit mode (remove the one installed during docker image build) -# working-directory: /workspace/transformers -# run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - -# - name: Remove cached torch extensions -# run: rm -rf /github/home/.cache/torch_extensions/ - -# # To avoid unknown test failures -# - name: Pre build DeepSpeed *again* -# working-directory: /workspace -# run: | -# python3 -m pip uninstall -y deepspeed -# DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check - -# - name: NVIDIA-SMI -# run: | -# nvidia-smi - -# - name: Environment -# working-directory: /workspace/transformers -# run: | -# python utils/print_env.py - -# - name: Show installed libraries and their versions -# working-directory: /workspace/transformers -# run: pip freeze - -# - name: Run all tests on GPU -# working-directory: /workspace/transformers -# run: | -# python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended - -# - name: Failure short reports -# if: ${{ failure() }} -# continue-on-error: true -# run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt - -# - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports" -# if: ${{ always() }} -# uses: actions/upload-artifact@v3 -# with: -# name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports -# path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu + run_tests_gpu: + name: " " + needs: setup + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu, multi-gpu] + slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }} + uses: ./.github/workflows/model_jobs.yml + with: + folder_slices: ${{ needs.setup.outputs.folder_slices }} + machine_type: ${{ matrix.machine_type }} + slice_id: ${{ matrix.slice_id }} + secrets: inherit + + run_examples_gpu: + name: Examples directory + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] + container: + image: huggingface/transformers-all-latest-gpu + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + needs: setup + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run examples tests on GPU + working-directory: /transformers + run: | + pip install -r examples/pytorch/_tests_requirements.txt + python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt + + - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu" + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_examples_gpu + path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu + + run_pipelines_torch_gpu: + name: PyTorch pipelines + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] + container: + image: huggingface/transformers-pytorch-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + needs: setup + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all pipeline tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt + + - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu" + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu + path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu + + run_pipelines_tf_gpu: + name: TensorFlow pipelines + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] + container: + image: huggingface/transformers-tensorflow-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + needs: setup + steps: + - name: Update clone + working-directory: /transformers + run: | + git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all pipeline tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_tf_pipeline_gpu tests/pipelines + + - name: Failure short reports + if: ${{ always() }} + run: | + cat /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu/failures_short.txt + + - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu" + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu + path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu + + run_all_tests_torch_cuda_extensions_gpu: + name: Torch CUDA extension tests + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] + needs: setup + container: + image: huggingface/transformers-pytorch-deepspeed-latest-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Update clone + working-directory: /workspace/transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /workspace/transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: Remove cached torch extensions + run: rm -rf /github/home/.cache/torch_extensions/ + + # To avoid unknown test failures + - name: Pre build DeepSpeed *again* + working-directory: /workspace + run: | + python3 -m pip uninstall -y deepspeed + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /workspace/transformers + run: | + python utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /workspace/transformers + run: pip freeze + + - name: Run all tests on GPU + working-directory: /workspace/transformers + run: | + python -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu/failures_short.txt + + - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports + path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu run_tests_quantization_torch_gpu: name: Quantization tests @@ -355,11 +354,11 @@ jobs: if: always() needs: [ setup, - # run_tests_gpu, - # run_examples_gpu, - # run_pipelines_tf_gpu, - # run_pipelines_torch_gpu, - # run_all_tests_torch_cuda_extensions_gpu, + run_tests_gpu, + run_examples_gpu, + run_pipelines_tf_gpu, + run_pipelines_torch_gpu, + run_all_tests_torch_cuda_extensions_gpu, run_tests_quantization_torch_gpu, ] steps: @@ -403,11 +402,11 @@ jobs: if: always() needs: [ setup, - # run_tests_gpu, - # run_examples_gpu, - # run_pipelines_tf_gpu, - # run_pipelines_torch_gpu, - # run_all_tests_torch_cuda_extensions_gpu, + run_tests_gpu, + run_examples_gpu, + run_pipelines_tf_gpu, + run_pipelines_torch_gpu, + run_all_tests_torch_cuda_extensions_gpu, run_tests_quantization_torch_gpu, run_extract_warnings ] From 3e82d7ba5cbb5b0ad027e7181e37b75cb810aa4c Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 21 Feb 2024 17:41:10 +0100 Subject: [PATCH 16/23] style --- utils/notification_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/notification_service.py b/utils/notification_service.py index ac97f988c97298..d29e6994a232b2 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -1043,7 +1043,7 @@ def prepare_reports(title, header, reports, to_truncate=True): "PyTorch pipelines": "run_tests_torch_pipeline_gpu", "TensorFlow pipelines": "run_tests_tf_pipeline_gpu", "Torch CUDA extension tests": "run_tests_torch_cuda_extensions_gpu_test_reports", - "Quantization tests": "run_tests_quantization_torch_gpu" + "Quantization tests": "run_tests_quantization_torch_gpu", } if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI"): From 34e6048d4eb0f27422a781f90191f6153661834c Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Thu, 22 Feb 2024 10:14:55 -0500 Subject: [PATCH 17/23] Update .github/workflows/self-scheduled.yml Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> --- .github/workflows/self-scheduled.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 3b63b7a688b001..b0e1717993a37f 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -333,7 +333,6 @@ jobs: - name: Run quantization tests on GPU working-directory: /transformers run: | - pip install -r examples/pytorch/_tests_requirements.txt python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_quantization_torch_gpu tests/quantization - name: Failure short reports From c5c567089f41bf928211bdd5f2e119331671f5b8 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Fri, 23 Feb 2024 17:17:28 +0100 Subject: [PATCH 18/23] test build dockerfile with torch 2.2.0 --- .github/workflows/build-docker-images.yml | 419 +++++++++--------- .../Dockerfile | 6 +- 2 files changed, 207 insertions(+), 218 deletions(-) diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index 8cae54cffa5d87..00781635ca6c3b 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -3,7 +3,8 @@ name: Build docker images (scheduled) on: push: branches: - - build_ci_docker_image* + # - build_ci_docker_image* + - add-quantization-workflow repository_dispatch: workflow_call: inputs: @@ -18,185 +19,185 @@ concurrency: cancel-in-progress: false jobs: - latest-docker: - name: "Latest PyTorch + TensorFlow [dev]" - runs-on: [intel-cpu, 8-cpu, ci] - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-all-latest-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }} - # Push CI images still need to be re-built daily - - - name: Build and push (for Push CI) in a daily basis - # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - if: inputs.image_postfix != '-push-ci' - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-all-latest-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-all-latest-gpu-push-ci + # latest-docker: + # name: "Latest PyTorch + TensorFlow [dev]" + # runs-on: [intel-cpu, 8-cpu, ci] + # steps: + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - + # name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-all-latest-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }} + # # Push CI images still need to be re-built daily + # - + # name: Build and push (for Push CI) in a daily basis + # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + # if: inputs.image_postfix != '-push-ci' + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-all-latest-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-all-latest-gpu-push-ci - latest-torch-deepspeed-docker: - name: "Latest PyTorch + DeepSpeed" - runs-on: ubuntu-22.04 - steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-pytorch-deepspeed-latest-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }} + # latest-torch-deepspeed-docker: + # name: "Latest PyTorch + DeepSpeed" + # runs-on: ubuntu-22.04 + # steps: + # - name: Cleanup disk + # run: | + # sudo ls -l /usr/local/lib/ + # sudo ls -l /usr/share/ + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # sudo rm -rf /usr/local/lib/android + # sudo rm -rf /usr/share/dotnet + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - + # name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-deepspeed-latest-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }} - # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`) - latest-torch-deepspeed-docker-for-push-ci-daily-build: - name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" - runs-on: ubuntu-22.04 - steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - # Push CI images still need to be re-built daily - - - name: Build and push (for Push CI) in a daily basis - # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - if: inputs.image_postfix != '-push-ci' - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-pytorch-deepspeed-latest-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci + # # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`) + # latest-torch-deepspeed-docker-for-push-ci-daily-build: + # name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" + # runs-on: ubuntu-22.04 + # steps: + # - name: Cleanup disk + # run: | + # sudo ls -l /usr/local/lib/ + # sudo ls -l /usr/share/ + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # sudo rm -rf /usr/local/lib/android + # sudo rm -rf /usr/share/dotnet + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # # Push CI images still need to be re-built daily + # - + # name: Build and push (for Push CI) in a daily basis + # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + # if: inputs.image_postfix != '-push-ci' + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-deepspeed-latest-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci - doc-builder: - name: "Doc builder" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-22.04 - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-doc-builder - push: true - tags: huggingface/transformers-doc-builder + # doc-builder: + # name: "Doc builder" + # # Push CI doesn't need this image + # if: inputs.image_postfix != '-push-ci' + # runs-on: ubuntu-22.04 + # steps: + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - + # name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-doc-builder + # push: true + # tags: huggingface/transformers-doc-builder - latest-pytorch: - name: "Latest PyTorch [dev]" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-22.04 - steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-pytorch-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-gpu + # latest-pytorch: + # name: "Latest PyTorch [dev]" + # # Push CI doesn't need this image + # if: inputs.image_postfix != '-push-ci' + # runs-on: ubuntu-22.04 + # steps: + # - name: Cleanup disk + # run: | + # sudo ls -l /usr/local/lib/ + # sudo ls -l /usr/share/ + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # sudo rm -rf /usr/local/lib/android + # sudo rm -rf /usr/share/dotnet + # sudo du -sh /usr/local/lib/ + # sudo du -sh /usr/share/ + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - + # name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-gpu # Need to be fixed with the help from Guillaume. # latest-pytorch-amd: @@ -234,33 +235,33 @@ jobs: # push: true # tags: huggingface/transformers-pytorch-amd-gpu-push-ci - latest-tensorflow: - name: "Latest TensorFlow [dev]" - # Push CI doesn't need this image - if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-22.04 - steps: - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - - name: Check out code - uses: actions/checkout@v3 - - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-tensorflow-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-tensorflow-gpu + # latest-tensorflow: + # name: "Latest TensorFlow [dev]" + # # Push CI doesn't need this image + # if: inputs.image_postfix != '-push-ci' + # runs-on: ubuntu-22.04 + # steps: + # - + # name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - + # name: Check out code + # uses: actions/checkout@v3 + # - + # name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - + # name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-tensorflow-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-tensorflow-gpu # latest-pytorch-deepspeed-amd: # name: "PyTorch + DeepSpeed (AMD) [dev]" @@ -302,18 +303,8 @@ jobs: name: "Latest Pytorch + Quantization [dev]" # Push CI doesn't need this image if: inputs.image_postfix != '-push-ci' - runs-on: ubuntu-22.04 + runs-on: [intel-cpu, 8-cpu, ci] steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index e7df09bf415b48..12c97f5a46152c 100644 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -9,7 +9,7 @@ SHELL ["sh", "-lc"] # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant # to be used as arguments for docker build (so far). -ARG PYTORCH='2.1.1' +ARG PYTORCH='2.2.0' # Example: `cu102`, `cu113`, etc. ARG CUDA='cu118' @@ -26,9 +26,7 @@ RUN echo torch=$VERSION # Currently, let's just use their latest releases (when `torch` is installed with a release version) RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA -RUN python3 -m pip install --no-cache-dir -e ./transformers[dev] - -RUN python3 -m pip uninstall -y flax jax +RUN python3 -m pip install --no-cache-dir -e ./transformers[torch-dev] RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate From 4c757b88f9617fa3b5a9fa437fdacc0b1dfcbea2 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Mon, 26 Feb 2024 15:59:22 +0100 Subject: [PATCH 19/23] fix extra --- docker/transformers-quantization-latest-gpu/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index 12c97f5a46152c..66bdcc42bae9fd 100644 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -26,7 +26,7 @@ RUN echo torch=$VERSION # Currently, let's just use their latest releases (when `torch` is installed with a release version) RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA -RUN python3 -m pip install --no-cache-dir -e ./transformers[torch-dev] +RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch] RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate From ce94146f5437ffc0c64833d6e14a932fc49f9869 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Mon, 26 Feb 2024 16:48:04 +0100 Subject: [PATCH 20/23] clean --- .github/workflows/build-docker-images.yml | 475 +++++++++++----------- 1 file changed, 237 insertions(+), 238 deletions(-) diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index 00781635ca6c3b..d9796490e5e75b 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -3,8 +3,7 @@ name: Build docker images (scheduled) on: push: branches: - # - build_ci_docker_image* - - add-quantization-workflow + - build_ci_docker_image* repository_dispatch: workflow_call: inputs: @@ -19,185 +18,185 @@ concurrency: cancel-in-progress: false jobs: - # latest-docker: - # name: "Latest PyTorch + TensorFlow [dev]" - # runs-on: [intel-cpu, 8-cpu, ci] - # steps: - # - - # name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - - # name: Check out code - # uses: actions/checkout@v3 - # - - # name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # - - # name: Build and push - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-all-latest-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }} - # # Push CI images still need to be re-built daily - # - - # name: Build and push (for Push CI) in a daily basis - # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - # if: inputs.image_postfix != '-push-ci' - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-all-latest-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-all-latest-gpu-push-ci + latest-docker: + name: "Latest PyTorch + TensorFlow [dev]" + runs-on: [intel-cpu, 8-cpu, ci] + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-all-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }} + # Push CI images still need to be re-built daily + - + name: Build and push (for Push CI) in a daily basis + # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + if: inputs.image_postfix != '-push-ci' + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-all-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-all-latest-gpu-push-ci - # latest-torch-deepspeed-docker: - # name: "Latest PyTorch + DeepSpeed" - # runs-on: ubuntu-22.04 - # steps: - # - name: Cleanup disk - # run: | - # sudo ls -l /usr/local/lib/ - # sudo ls -l /usr/share/ - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # sudo rm -rf /usr/local/lib/android - # sudo rm -rf /usr/share/dotnet - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # - - # name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - - # name: Check out code - # uses: actions/checkout@v3 - # - - # name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # - - # name: Build and push - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-pytorch-deepspeed-latest-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }} + latest-torch-deepspeed-docker: + name: "Latest PyTorch + DeepSpeed" + runs-on: ubuntu-22.04 + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-deepspeed-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }} - # # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`) - # latest-torch-deepspeed-docker-for-push-ci-daily-build: - # name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" - # runs-on: ubuntu-22.04 - # steps: - # - name: Cleanup disk - # run: | - # sudo ls -l /usr/local/lib/ - # sudo ls -l /usr/share/ - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # sudo rm -rf /usr/local/lib/android - # sudo rm -rf /usr/share/dotnet - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # - - # name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - - # name: Check out code - # uses: actions/checkout@v3 - # - - # name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # # Push CI images still need to be re-built daily - # - - # name: Build and push (for Push CI) in a daily basis - # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - # if: inputs.image_postfix != '-push-ci' - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-pytorch-deepspeed-latest-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci + # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`) + latest-torch-deepspeed-docker-for-push-ci-daily-build: + name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" + runs-on: ubuntu-22.04 + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + # Push CI images still need to be re-built daily + - + name: Build and push (for Push CI) in a daily basis + # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + if: inputs.image_postfix != '-push-ci' + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-deepspeed-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci - # doc-builder: - # name: "Doc builder" - # # Push CI doesn't need this image - # if: inputs.image_postfix != '-push-ci' - # runs-on: ubuntu-22.04 - # steps: - # - - # name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - - # name: Check out code - # uses: actions/checkout@v3 - # - - # name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # - - # name: Build and push - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-doc-builder - # push: true - # tags: huggingface/transformers-doc-builder + doc-builder: + name: "Doc builder" + # Push CI doesn't need this image + if: inputs.image_postfix != '-push-ci' + runs-on: ubuntu-22.04 + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-doc-builder + push: true + tags: huggingface/transformers-doc-builder - # latest-pytorch: - # name: "Latest PyTorch [dev]" - # # Push CI doesn't need this image - # if: inputs.image_postfix != '-push-ci' - # runs-on: ubuntu-22.04 - # steps: - # - name: Cleanup disk - # run: | - # sudo ls -l /usr/local/lib/ - # sudo ls -l /usr/share/ - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # sudo rm -rf /usr/local/lib/android - # sudo rm -rf /usr/share/dotnet - # sudo du -sh /usr/local/lib/ - # sudo du -sh /usr/share/ - # - - # name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - - # name: Check out code - # uses: actions/checkout@v3 - # - - # name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # - - # name: Build and push - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-pytorch-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-pytorch-gpu + latest-pytorch: + name: "Latest PyTorch [dev]" + # Push CI doesn't need this image + if: inputs.image_postfix != '-push-ci' + runs-on: ubuntu-22.04 + steps: + - name: Cleanup disk + run: | + sudo ls -l /usr/local/lib/ + sudo ls -l /usr/share/ + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + sudo rm -rf /usr/local/lib/android + sudo rm -rf /usr/share/dotnet + sudo du -sh /usr/local/lib/ + sudo du -sh /usr/share/ + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-gpu # Need to be fixed with the help from Guillaume. # latest-pytorch-amd: @@ -235,69 +234,69 @@ jobs: # push: true # tags: huggingface/transformers-pytorch-amd-gpu-push-ci - # latest-tensorflow: - # name: "Latest TensorFlow [dev]" - # # Push CI doesn't need this image - # if: inputs.image_postfix != '-push-ci' - # runs-on: ubuntu-22.04 - # steps: - # - - # name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - - # name: Check out code - # uses: actions/checkout@v3 - # - - # name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # - - # name: Build and push - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-tensorflow-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-tensorflow-gpu + latest-tensorflow: + name: "Latest TensorFlow [dev]" + # Push CI doesn't need this image + if: inputs.image_postfix != '-push-ci' + runs-on: ubuntu-22.04 + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v3 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-tensorflow-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-tensorflow-gpu - # latest-pytorch-deepspeed-amd: - # name: "PyTorch + DeepSpeed (AMD) [dev]" + latest-pytorch-deepspeed-amd: + name: "PyTorch + DeepSpeed (AMD) [dev]" - # runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] - # steps: - # - name: Set up Docker Buildx - # uses: docker/setup-buildx-action@v3 - # - name: Check out code - # uses: actions/checkout@v3 - # - name: Login to DockerHub - # uses: docker/login-action@v3 - # with: - # username: ${{ secrets.DOCKERHUB_USERNAME }} - # password: ${{ secrets.DOCKERHUB_PASSWORD }} - # - name: Build and push - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-pytorch-deepspeed-amd-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }} - # # Push CI images still need to be re-built daily - # - - # name: Build and push (for Push CI) in a daily basis - # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - # if: inputs.image_postfix != '-push-ci' - # uses: docker/build-push-action@v5 - # with: - # context: ./docker/transformers-pytorch-deepspeed-amd-gpu - # build-args: | - # REF=main - # push: true - # tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci + runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] + steps: + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Check out code + uses: actions/checkout@v3 + - name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-deepspeed-amd-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }} + # Push CI images still need to be re-built daily + - + name: Build and push (for Push CI) in a daily basis + # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + if: inputs.image_postfix != '-push-ci' + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-deepspeed-amd-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci latest-quantization-torch-docker: name: "Latest Pytorch + Quantization [dev]" From 4cb52b8822da9d1786a821a33e867e4fcc00d8fd Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 28 Feb 2024 15:26:37 +0100 Subject: [PATCH 21/23] revert changes --- src/transformers/modeling_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index b3102a37d37f31..4871110f5b6ffb 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -3552,6 +3552,7 @@ def from_pretrained( "device_map": device_map, "offload_dir": offload_folder, "offload_index": offload_index, + "force_hooks": True } if "skip_keys" in inspect.signature(dispatch_model).parameters: device_map_kwargs["skip_keys"] = model._skip_keys_device_placement From 750693276b98d42f3db7ee358286841e50ac9134 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 28 Feb 2024 15:42:45 +0100 Subject: [PATCH 22/23] Revert "revert changes" This reverts commit 4cb52b8822da9d1786a821a33e867e4fcc00d8fd. --- src/transformers/modeling_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 4871110f5b6ffb..b3102a37d37f31 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -3552,7 +3552,6 @@ def from_pretrained( "device_map": device_map, "offload_dir": offload_folder, "offload_index": offload_index, - "force_hooks": True } if "skip_keys" in inspect.signature(dispatch_model).parameters: device_map_kwargs["skip_keys"] = model._skip_keys_device_placement From 9209b46138335e1834a558223bb4d8de3dc1f337 Mon Sep 17 00:00:00 2001 From: Marc Sun Date: Wed, 28 Feb 2024 15:43:08 +0100 Subject: [PATCH 23/23] revert correct change --- .github/workflows/build-docker-images.yml | 68 +++++++++++------------ 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index d9796490e5e75b..6144f8036f96c9 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -262,41 +262,41 @@ jobs: push: true tags: huggingface/transformers-tensorflow-gpu - latest-pytorch-deepspeed-amd: - name: "PyTorch + DeepSpeed (AMD) [dev]" + # latest-pytorch-deepspeed-amd: + # name: "PyTorch + DeepSpeed (AMD) [dev]" - runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] - steps: - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - name: Check out code - uses: actions/checkout@v3 - - name: Login to DockerHub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_PASSWORD }} - - name: Build and push - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-pytorch-deepspeed-amd-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }} - # Push CI images still need to be re-built daily - - - name: Build and push (for Push CI) in a daily basis - # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. - # The later case is useful for manual image building for debugging purpose. Use another tag in this case! - if: inputs.image_postfix != '-push-ci' - uses: docker/build-push-action@v5 - with: - context: ./docker/transformers-pytorch-deepspeed-amd-gpu - build-args: | - REF=main - push: true - tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci + # runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210] + # steps: + # - name: Set up Docker Buildx + # uses: docker/setup-buildx-action@v3 + # - name: Check out code + # uses: actions/checkout@v3 + # - name: Login to DockerHub + # uses: docker/login-action@v3 + # with: + # username: ${{ secrets.DOCKERHUB_USERNAME }} + # password: ${{ secrets.DOCKERHUB_PASSWORD }} + # - name: Build and push + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-deepspeed-amd-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }} + # # Push CI images still need to be re-built daily + # - + # name: Build and push (for Push CI) in a daily basis + # # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + # if: inputs.image_postfix != '-push-ci' + # uses: docker/build-push-action@v5 + # with: + # context: ./docker/transformers-pytorch-deepspeed-amd-gpu + # build-args: | + # REF=main + # push: true + # tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci latest-quantization-torch-docker: name: "Latest Pytorch + Quantization [dev]"