[CI] Quantization workflow #29046

Merged (25 commits, Feb 28, 2024)
28 changes: 28 additions & 0 deletions .github/workflows/build-docker-images.yml
@@ -297,3 +297,31 @@ jobs:
# REF=main
# push: true
# tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci

  latest-quantization-torch-docker:
    name: "Latest Pytorch + Quantization [dev]"
    # Push CI doesn't need this image
    if: inputs.image_postfix != '-push-ci'
Collaborator:
Just want to know if we intend to run quantization tests on a daily basis, or if we prefer to run them on each commit merged into main?

Member Author:
I think it is better to run them on a daily basis. There are many slow tests too, and I think the quantization tests are not easily broken by changes in transformers. The bigger issue is the third-party libraries that might introduce breaking changes. cc @younesbelkada

Contributor:
Yes, a daily basis sounds great!

Collaborator:
OK

    runs-on: [intel-cpu, 8-cpu, ci]
    steps:
      -
        name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      -
        name: Check out code
        uses: actions/checkout@v3
      -
        name: Login to DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_PASSWORD }}
      -
        name: Build and push
        uses: docker/build-push-action@v5
        with:
          context: ./docker/transformers-quantization-latest-gpu
          build-args: |
            REF=main
          push: true
          tags: huggingface/transformers-quantization-latest-gpu${{ inputs.image_postfix }}
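The `inputs.image_postfix` reference above assumes this workflow is invoked with an `image_postfix` input (empty for scheduled builds, `-push-ci` for push CI). A minimal sketch of such a declaration, shown here only for orientation and not part of this diff:

# Sketch only: how the `image_postfix` input consumed above might be declared
# at the top of build-docker-images.yml (assumed, not taken from this PR).
on:
  workflow_call:
    inputs:
      image_postfix:
        required: true
        type: string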
54 changes: 53 additions & 1 deletion .github/workflows/self-scheduled.yml
@@ -297,6 +297,56 @@ jobs:
          name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports
          path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu

  run_tests_quantization_torch_gpu:
    name: Quantization tests
Comment on lines +300 to +301
Collaborator:
I am not sure if we intend to keep this in this workflow file. Isn't the goal to move this out to a separate workflow file?

cc @younesbelkada

Contributor:
I think we can keep it there for now, and I'll move it outside in my PR.

    strategy:
      fail-fast: false
      matrix:
        machine_type: [single-gpu, multi-gpu]
    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
    container:
      image: huggingface/transformers-quantization-latest-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    needs: setup
    steps:
      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
SunMarc marked this conversation as resolved.
Comment on lines +316 to +318
Collaborator:
@ydshieh I do not understand why we don't:

  1. Install from main during the docker build with pip install git+https://github.com/huggingface/transformers@main (and get all the necessary dependencies)
  2. pip uninstall -y transformers in the docker
  3. actions/checkout@v3
  4. pip install -e .

(As discussed offline with @younesbelkada, this should come in a separate PR.)

Collaborator (ydshieh), Feb 28, 2024:
There is no deep reason for what we are doing currently: we just install transformers during the docker image build and reuse it.

What you suggest (installing main during the docker image build) has the consequence that we can't easily experiment in a branch when we change setup.py.

By using actions/checkout@v3, the path to transformers will be different from the current approach, and therefore we would have to change all the working-directory paths and artifact paths, otherwise the report will be empty.
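For concreteness, a minimal sketch of the alternative proposed above, written as workflow steps (step names and ordering are assumptions for illustration, not part of this PR):

      # Sketch of the reviewer's proposal, not the PR's actual steps.
      # Assumes transformers was pip-installed from main at image-build time.
      - name: Remove the transformers installed during docker image build
        run: python3 -m pip uninstall -y transformers

      - name: Check out the commit under test
        uses: actions/checkout@v3

      - name: Install transformers in edit mode
        run: python3 -m pip install -e .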


      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py

      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze

      - name: Run quantization tests on GPU
        working-directory: /transformers
        run: |
          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_quantization_torch_gpu tests/quantization

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu/failures_short.txt

      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu"
        if: ${{ always() }}
        uses: actions/upload-artifact@v3
        with:
          name: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu
          path: /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu
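Following up on the thread above about eventually moving these tests out of this file: a minimal sketch of what a separate, daily-triggered workflow could look like (the file name and cron time are assumptions, not part of this PR):

# Hypothetical .github/workflows/self-scheduled-quantization.yml (sketch only).
name: Self-scheduled CI (quantization)

on:
  schedule:
    # Placeholder cadence: once a day, per the discussion above.
    - cron: "0 2 * * *"

jobs:
  run_tests_quantization_torch_gpu:
    name: Quantization tests
    # ...same strategy, runs-on, container, and steps as the job above...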

  run_extract_warnings:
    name: Extract warnings in CI artifacts
    runs-on: ubuntu-22.04

@@ -307,7 +357,8 @@ jobs:
      run_examples_gpu,
      run_pipelines_tf_gpu,
      run_pipelines_torch_gpu,
      run_all_tests_torch_cuda_extensions_gpu
      run_all_tests_torch_cuda_extensions_gpu,
      run_tests_quantization_torch_gpu,
    ]
    steps:
      - name: Checkout transformers
@@ -355,6 +406,7 @@ jobs:
      run_pipelines_tf_gpu,
      run_pipelines_torch_gpu,
      run_all_tests_torch_cuda_extensions_gpu,
      run_tests_quantization_torch_gpu,
      run_extract_warnings
    ]
    steps:
50 changes: 50 additions & 0 deletions docker/transformers-quantization-latest-gpu/Dockerfile
@@ -0,0 +1,50 @@
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
LABEL maintainer="Hugging Face"

ARG DEBIAN_FRONTEND=noninteractive

# Use login shell to read variables from `~/.profile` (to pass dynamic created variables between RUN commands)
SHELL ["sh", "-lc"]

# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
# to be used as arguments for docker build (so far).

ARG PYTORCH='2.2.0'
# Example: `cu102`, `cu113`, etc.
ARG CUDA='cu118'

RUN apt update
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python python3-pip ffmpeg
RUN python3 -m pip install --no-cache-dir --upgrade pip

ARG REF=main
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF

RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile
RUN echo torch=$VERSION
# `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build.
# Currently, let's just use their latest releases (when `torch` is installed with a release version)
RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
Comment on lines +23 to +27
Collaborator:
Make sure that, at the end, we do get torch 2.2. See my comment on the change in #29208. Otherwise, move this after the ./transformers[dev] step.

Member Author:
Done! I didn't have to move it after the ./transformers[dev] step.
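One possible way to enforce that check at image-build time (a sketch, not part of this PR; the expected version string mirrors the PYTORCH arg above):

# Sketch only: fail the docker build if pip resolved a different torch version.
RUN python3 -c "import torch; v = torch.__version__; assert v.startswith('2.2'), f'unexpected torch {v}'"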


RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]

RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate

# Add bitsandbytes for mixed int8 testing
RUN python3 -m pip install --no-cache-dir bitsandbytes

# Add auto-gptq for gptq quantization testing
RUN python3 -m pip install --no-cache-dir auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/

# Add optimum for gptq quantization testing
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum

# Add aqlm for quantization testing
RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2

# Add autoawq for quantization testing
RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.1.8/autoawq-0.1.8+cu118-cp38-cp38-linux_x86_64.whl

# When installing in editable mode, `transformers` is not recognized as a package.
# This line must be added in order for Python to be aware of transformers.
RUN cd transformers && python3 setup.py develop
2 changes: 1 addition & 1 deletion docs/source/en/hf_quantizer.md
@@ -66,4 +66,4 @@

7. Document everything! Make sure your quantization method is documented in the [`docs/source/en/quantization.md`](https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/docs/source/en/quantization.md) file.

8. Add tests! You should add tests by first adding the package in our nightly Dockerfile inside `docker/transformers-all-latest-gpu` and then adding a new test file in `tests/quantization/xxx`. Feel free to check out how it is implemented for other quantization methods.
8. Add tests! You should add tests by first adding the package in our nightly Dockerfile inside `docker/transformers-quantization-latest-gpu` and then adding a new test file in `tests/quantization/xxx`. Feel free to check out how it is implemented for other quantization methods.
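For step 8, the Dockerfile addition could look like the following (a hypothetical package name, mirroring the pip install lines in the Dockerfile above):

# Hypothetical: register a new quantization backend's package for nightly tests.
RUN python3 -m pip install --no-cache-dir my-new-quant-package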
1 change: 1 addition & 0 deletions utils/notification_service.py
@@ -1043,6 +1043,7 @@ def prepare_reports(title, header, reports, to_truncate=True):
"PyTorch pipelines": "run_tests_torch_pipeline_gpu",
"TensorFlow pipelines": "run_tests_tf_pipeline_gpu",
"Torch CUDA extension tests": "run_tests_torch_cuda_extensions_gpu_test_reports",
"Quantization tests": "run_tests_quantization_torch_gpu",
Collaborator:
This should be revised if we decide to move the quantization tests outside this workflow file.
}

if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI"):