diff --git a/.circleci/config.yml b/.circleci/config.yml index ddb8869a6ae5aa..cdd97f4fcecaff 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -31,6 +31,7 @@ jobs: steps: - checkout - run: uv pip install -U -e . + - run: echo 'export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)"' >> "$BASH_ENV" && source "$BASH_ENV" - run: mkdir -p test_preparation - run: python utils/tests_fetcher.py | tee tests_fetched_summary.txt - store_artifacts: @@ -80,7 +81,7 @@ jobs: path: ~/transformers/test_preparation/filtered_test_list.txt - store_artifacts: path: test_preparation/examples_test_list.txt - - run: python .circleci/create_circleci_config.py --fetcher_folder test_preparation + - run: export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)" && echo $GIT_COMMIT_MESSAGE && python .circleci/create_circleci_config.py --fetcher_folder test_preparation - run: | if [ ! -s test_preparation/generated_config.yml ]; then echo "No tests to run, exiting early!" @@ -97,7 +98,7 @@ jobs: fetch_all_tests: working_directory: ~/transformers docker: - - image: huggingface/transformers-consistency + - image: huggingface/transformers-quality parallelism: 1 steps: - checkout diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 8b6a773ee435c0..3f2c6df394e8eb 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -72,6 +72,12 @@ def __post_init__(self): if self.docker_image is None: # Let's avoid changing the default list and make a copy. self.docker_image = copy.deepcopy(DEFAULT_DOCKER_IMAGE) + else: + # BIG HACK WILL REMOVE ONCE FETCHER IS UPDATED + print(os.environ.get("GIT_COMMIT_MESSAGE")) + if "[build-ci-image]" in os.environ.get("GIT_COMMIT_MESSAGE", "") or os.environ.get("GIT_COMMIT_MESSAGE", "") == "dev-ci": + self.docker_image[0]["image"] = f"{self.docker_image[0]['image']}:dev" + print(f"Using {self.docker_image} docker image") if self.install_steps is None: self.install_steps = [] if self.pytest_options is None: diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index ff471096907ab8..51d713b2e1033d 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -17,50 +17,50 @@ body: description: | Your issue will be replied to more quickly if you can figure out the right person to tag with @ If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**. - + All issues are read by one of the core maintainers, so if you don't know who to tag, just leave this blank and a core maintainer will ping the right person. - + Please tag fewer than 3 people. - + Models: - text models: @ArthurZucker and @younesbelkada - vision models: @amyeroberts - speech models: @sanchit-gandhi - graph models: @clefourrier - + Library: - + - flax: @sanchit-gandhi - - generate: @gante + - generate: @zucchini-nlp (visual-language models) or @gante (all others) - pipelines: @Narsil - tensorflow: @gante and @Rocketknight1 - tokenizers: @ArthurZucker - - trainer: @muellerzr and @pacman100 + - trainer: @muellerzr @SunMarc Integrations: - - deepspeed: HF Trainer/Accelerate: @pacman100 + - deepspeed: HF Trainer/Accelerate: @muellerzr - ray/raytune: @richardliaw, @amogkam - Big Model Inference: @SunMarc - quantization (bitsandbytes, autogpt): @SunMarc and @younesbelkada - + Documentation: @stevhliu - + Model hub: - for issues with a model, report at https://discuss.huggingface.co/ and tag the model's creator. 
- + HF projects: - + - accelerate: [different repo](https://github.com/huggingface/accelerate) - datasets: [different repo](https://github.com/huggingface/datasets) - diffusers: [different repo](https://github.com/huggingface/diffusers) - rust tokenizers: [different repo](https://github.com/huggingface/tokenizers) - + Maintained examples (not research project or legacy): - + - Flax: @sanchit-gandhi - PyTorch: See Models above and tag the person corresponding to the modality of the example. - TensorFlow: @Rocketknight1 @@ -101,11 +101,11 @@ body: placeholder: | Steps to reproduce the behavior: - + 1. 2. 3. - + - type: textarea id: expected-behavior diff --git a/.github/ISSUE_TEMPLATE/feature-request.yml b/.github/ISSUE_TEMPLATE/feature-request.yml index 318dc1f9b288c2..ff0d452a807f6e 100644 --- a/.github/ISSUE_TEMPLATE/feature-request.yml +++ b/.github/ISSUE_TEMPLATE/feature-request.yml @@ -1,6 +1,6 @@ name: "\U0001F680 Feature request" description: Submit a proposal/request for a new transformers feature -labels: [ "feature" ] +labels: [ "Feature request" ] body: - type: textarea id: feature-request @@ -19,7 +19,7 @@ body: label: Motivation description: | Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too. - + - type: textarea id: contribution diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index c0f70fe8159f09..650e13d8dcabb6 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -47,15 +47,15 @@ Models: Library: - flax: @sanchit-gandhi -- generate: @gante +- generate: @zucchini-nlp (visual-language models) or @gante (all others) - pipelines: @Narsil - tensorflow: @gante and @Rocketknight1 - tokenizers: @ArthurZucker -- trainer: @muellerzr and @pacman100 +- trainer: @muellerzr and @SunMarc Integrations: -- deepspeed: HF Trainer/Accelerate: @pacman100 +- deepspeed: HF Trainer/Accelerate: @muellerzr - ray/raytune: @richardliaw, @amogkam - Big Model Inference: @SunMarc - quantization (bitsandbytes, autogpt): @SunMarc and @younesbelkada diff --git a/.github/actions/post-slack/action.yml b/.github/actions/post-slack/action.yml deleted file mode 100644 index 74075a4fedc427..00000000000000 --- a/.github/actions/post-slack/action.yml +++ /dev/null @@ -1,79 +0,0 @@ -name: Send message to slack - -description: 'Send results to slack' -author: 'Hugging Face' -inputs: - slack_channel: - required: true - type: string - title: - required: true - type: string - status: - required: true - type: string - slack_token: - required: true - type: string - -runs: - using: "composite" - steps: - - name: Create content to post - id: create-message - run: | - if [ "${{ inputs.status }}" == "success" ]; then - echo STATUS_MESSAGE='🟢 Tests are passing!' >> $GITHUB_ENV - else - echo STATUS_MESSAGE='🔴 Tests failed! Please check the GitHub action link below' >> $GITHUB_ENV - fi - shell: bash - - - name: Post Canceled results Slack channel - id: post-slack - uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001 - with: - # Slack channel id, channel name, or user id to post message. 
- # See also: https://api.slack.com/methods/chat.postMessage#channels - channel-id: ${{ inputs.slack_channel }} - # For posting a rich message using Block Kit - payload: | - { - "text": "${{ inputs.title }}", - "blocks": [ - { - "type": "header", - "text": { - "type": "plain_text", - "text": "${{ inputs.title }}" - } - }, - { - "type": "section", - "text": { - "type": "mrkdwn", - "text": "${{ env.STATUS_MESSAGE }}" - } - }, - { - "type": "section", - "text": {"type": "mrkdwn", "text": "*Click the button for more details about the commit*"}, - "accessory": { - "type": "button", - "text": {"type": "plain_text", "text": "Check Commit results"}, - "url": "${{ github.event.pull_request.html_url || github.event.head_commit.url }}" - } - }, - { - "type": "section", - "text": {"type": "mrkdwn", "text": "*Click here for more details about the action ran*"}, - "accessory": { - "type": "button", - "text": {"type": "plain_text", "text": "Check Action results"}, - "url": "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - } - } - ] - } - env: - SLACK_BOT_TOKEN: ${{ inputs.slack_token }} \ No newline at end of file diff --git a/.github/workflows/build-ci-docker-images.yml b/.github/workflows/build-ci-docker-images.yml index f7b75a3a30e763..6f29df82769d82 100644 --- a/.github/workflows/build-ci-docker-images.yml +++ b/.github/workflows/build-ci-docker-images.yml @@ -3,7 +3,7 @@ name: Build pr ci-docker on: push: branches: - - change-ci # for now let's only build on this branch + - push-ci-image # for now let's only build on this branch repository_dispatch: workflow_call: inputs: @@ -22,7 +22,7 @@ jobs: build: runs-on: ubuntu-22.04 - if: ${{ contains(github.event.head_commit.message, '[push-ci-image]') && '!cancelled()' }} + if: ${{ contains(github.event.head_commit.message, '[build-ci-image]') || contains(github.event.head_commit.message, '[push-ci-image]') && '!cancelled()' || github.event_name == 'schedule' }} strategy: matrix: @@ -30,6 +30,16 @@ jobs: continue-on-error: true steps: + - + name: Set tag + run: | + if ${{contains(github.event.head_commit.message, '[build-ci-image]')}}; then + echo "TAG=huggingface/transformers-${{ matrix.file }}:dev" >> "$GITHUB_ENV" + echo "setting it to DEV!" 
+ else + echo "TAG=huggingface/transformers-${{ matrix.file }}" >> "$GITHUB_ENV" + + fi - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -50,5 +60,5 @@ jobs: build-args: | REF=${{ github.sha }} file: "./docker/${{ matrix.file }}.dockerfile" - push: true - tags: huggingface/transformers-${{ matrix.file }} \ No newline at end of file + push: ${{ contains(github.event.head_commit.message, 'ci-image]') || github.event_name == 'schedule' }} + tags: ${{ env.TAG }} \ No newline at end of file diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index 7c9e86d091b5ad..df772db773e262 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -57,20 +57,19 @@ jobs: push: true tags: huggingface/transformers-all-latest-gpu-push-ci + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + latest-torch-deepspeed-docker: name: "Latest PyTorch + DeepSpeed" runs-on: [intel-cpu, 8-cpu, ci] steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -93,21 +92,20 @@ jobs: push: true tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }} + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER}} + title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`) latest-torch-deepspeed-docker-for-push-ci-daily-build: name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" runs-on: [intel-cpu, 8-cpu, ci] steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -134,6 +132,15 @@ jobs: push: true tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + doc-builder: name: "Doc builder" # Push CI doesn't need this image @@ -160,22 +167,21 @@ jobs: push: true tags: huggingface/transformers-doc-builder + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the huggingface/transformers-doc-builder docker build + status: ${{ job.status }} + slack_token: ${{ 
secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + latest-pytorch: name: "Latest PyTorch [dev]" # Push CI doesn't need this image if: inputs.image_postfix != '-push-ci' runs-on: [intel-cpu, 8-cpu, ci] steps: - - name: Cleanup disk - run: | - sudo ls -l /usr/local/lib/ - sudo ls -l /usr/share/ - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - sudo rm -rf /usr/local/lib/android - sudo rm -rf /usr/share/dotnet - sudo du -sh /usr/local/lib/ - sudo du -sh /usr/share/ - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -198,6 +204,15 @@ jobs: push: true tags: huggingface/transformers-pytorch-gpu + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the huggingface/transformers-pytorch-gpudocker build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + latest-pytorch-amd: name: "Latest PyTorch (AMD) [dev]" runs-on: [intel-cpu, 8-cpu, ci] @@ -237,6 +252,15 @@ jobs: push: true tags: huggingface/transformers-pytorch-amd-gpu-push-ci + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + latest-tensorflow: name: "Latest TensorFlow [dev]" # Push CI doesn't need this image @@ -265,6 +289,15 @@ jobs: push: true tags: huggingface/transformers-tensorflow-gpu + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + latest-pytorch-deepspeed-amd: name: "PyTorch + DeepSpeed (AMD) [dev]" runs-on: [intel-cpu, 8-cpu, ci] @@ -304,6 +337,15 @@ jobs: push: true tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + latest-quantization-torch-docker: name: "Latest Pytorch + Quantization [dev]" # Push CI doesn't need this image @@ -330,4 +372,13 @@ jobs: build-args: | REF=main push: true - tags: huggingface/transformers-quantization-latest-gpu${{ inputs.image_postfix }} \ No newline at end of file + tags: huggingface/transformers-quantization-latest-gpu${{ inputs.image_postfix }} + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the transformers-quantization-latest-gpu build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} diff --git a/.github/workflows/build-nightly-ci-docker-images.yml b/.github/workflows/build-nightly-ci-docker-images.yml index d7c18775a86e41..691369c765aec7 100644 --- a/.github/workflows/build-nightly-ci-docker-images.yml +++ b/.github/workflows/build-nightly-ci-docker-images.yml @@ -13,7 +13,7 @@ concurrency: jobs: latest-with-torch-nightly-docker: name: "Nightly PyTorch + 
Stable TensorFlow" - runs-on: ubuntu-22.04 + runs-on: [intel-cpu, 8-cpu, ci] steps: - name: Cleanup disk run: | @@ -50,7 +50,7 @@ jobs: nightly-torch-deepspeed-docker: name: "Nightly PyTorch + DeepSpeed" - runs-on: ubuntu-22.04 + runs-on: [intel-cpu, 8-cpu, ci] steps: - name: Cleanup disk run: | diff --git a/.github/workflows/build-past-ci-docker-images.yml b/.github/workflows/build-past-ci-docker-images.yml index 5ef7c7e7de9e94..6ee60b8a6b60f2 100644 --- a/.github/workflows/build-past-ci-docker-images.yml +++ b/.github/workflows/build-past-ci-docker-images.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: version: ["1.13", "1.12", "1.11"] - runs-on: ubuntu-22.04 + runs-on: [intel-cpu, 8-cpu, ci] steps: - name: Set up Docker Buildx @@ -60,7 +60,7 @@ jobs: fail-fast: false matrix: version: ["2.11", "2.10", "2.9", "2.8", "2.7", "2.6", "2.5"] - runs-on: ubuntu-22.04 + runs-on: [intel-cpu, 8-cpu, ci] steps: - name: Set up Docker Buildx diff --git a/.github/workflows/model_jobs.yml b/.github/workflows/model_jobs.yml index f88af8e39af27d..840df8b6979a53 100644 --- a/.github/workflows/model_jobs.yml +++ b/.github/workflows/model_jobs.yml @@ -80,7 +80,7 @@ jobs: - name: Run all tests on GPU working-directory: /transformers - run: python3 -m pytest -rs -v --make-reports=${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} + run: python3 -m pytest -rsfE -v --make-reports=${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} - name: Failure short reports if: ${{ failure() }} diff --git a/.github/workflows/push-important-models.yml b/.github/workflows/push-important-models.yml index cf86a8fc8d5354..82e4d6476a794f 100644 --- a/.github/workflows/push-important-models.yml +++ b/.github/workflows/push-important-models.yml @@ -5,7 +5,6 @@ on: branches: [ main ] env: - IS_GITHUB_CI: "1" OUTPUT_SLACK_CHANNEL_ID: "C06L2SGMEEA" HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} HF_HOME: /mnt/cache @@ -86,7 +85,7 @@ jobs: - name: Run FA2 tests id: run_fa2_tests run: - pytest -rs -m "flash_attn_test" --make-reports=${{ matrix.model-name }}_fa2_tests/ tests/${{ matrix.model-name }}/test_modeling_* + pytest -rsfE -m "flash_attn_test" --make-reports=${{ matrix.model-name }}_fa2_tests/ tests/${{ matrix.model-name }}/test_modeling_* - name: "Test suite reports artifacts: ${{ matrix.model-name }}_fa2_tests" if: ${{ always() }} @@ -97,7 +96,7 @@ jobs: - name: Post to Slack if: always() - uses: ./.github/actions/post-slack + uses: huggingface/hf-workflows/.github/actions/post-slack@main with: slack_channel: ${{ env.OUTPUT_SLACK_CHANNEL_ID }} title: 🤗 Results of the FA2 tests - ${{ matrix.model-name }} @@ -108,7 +107,7 @@ jobs: id: run_integration_tests if: always() run: - pytest -rs -k "IntegrationTest" --make-reports=tests_integration_${{ matrix.model-name }} tests/${{ matrix.model-name }}/test_modeling_* + pytest -rsfE -k "IntegrationTest" --make-reports=tests_integration_${{ matrix.model-name }} tests/${{ matrix.model-name }}/test_modeling_* - name: "Test suite reports artifacts: tests_integration_${{ matrix.model-name }}" if: ${{ always() }} @@ -119,7 +118,7 @@ jobs: - name: Post to Slack if: always() - uses: ./.github/actions/post-slack + uses: huggingface/hf-workflows/.github/actions/post-slack@main with: slack_channel: ${{ env.OUTPUT_SLACK_CHANNEL_ID }} title: 🤗 Results of the Integration tests - ${{ matrix.model-name }} diff --git a/.github/workflows/self-pr-slow-ci.yml 
b/.github/workflows/self-pr-slow-ci.yml index 10a2156f210fbc..2729c436b10637 100644 --- a/.github/workflows/self-pr-slow-ci.yml +++ b/.github/workflows/self-pr-slow-ci.yml @@ -110,7 +110,7 @@ jobs: - name: Run all tests on GPU working-directory: /transformers - run: python3 -m pytest -v -rs --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} + run: python3 -m pytest -v -rsfE --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} - name: Failure short reports if: ${{ failure() }} diff --git a/.github/workflows/self-push-amd-mi300-caller.yml b/.github/workflows/self-push-amd-mi300-caller.yml new file mode 100644 index 00000000000000..a8ee4e540ecf3f --- /dev/null +++ b/.github/workflows/self-push-amd-mi300-caller.yml @@ -0,0 +1,25 @@ +name: Self-hosted runner (AMD mi300 CI caller) + +on: + workflow_run: + workflows: ["Self-hosted runner (push-caller)"] + branches: ["main"] + types: [completed] + push: + branches: + - run_amd_push_ci_caller* + paths: + - "src/**" + - "tests/**" + - ".github/**" + - "templates/**" + - "utils/**" + +jobs: + run_amd_ci: + name: AMD mi300 + if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && (startsWith(github.ref_name, 'run_amd_push_ci_caller') || startsWith(github.ref_name, 'mi300-ci')))) + uses: ./.github/workflows/self-push-amd.yml + with: + gpu_flavor: mi300 + secrets: inherit diff --git a/.github/workflows/self-push-amd.yml b/.github/workflows/self-push-amd.yml index 8705f398b2b510..8d68002e329418 100644 --- a/.github/workflows/self-push-amd.yml +++ b/.github/workflows/self-push-amd.yml @@ -36,7 +36,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -57,7 +57,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -155,7 +155,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }} machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -230,7 +230,7 @@ jobs: - name: Run all non-slow selected tests on GPU working-directory: /transformers run: | - python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ 
matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }} + python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }} -m "not not_device_test" - name: Failure short reports if: ${{ failure() }} diff --git a/.github/workflows/self-scheduled-amd-mi210-caller.yml b/.github/workflows/self-scheduled-amd-mi210-caller.yml index cdb968901058b6..6abba6894aaffa 100644 --- a/.github/workflows/self-scheduled-amd-mi210-caller.yml +++ b/.github/workflows/self-scheduled-amd-mi210-caller.yml @@ -16,4 +16,5 @@ jobs: uses: ./.github/workflows/self-scheduled-amd.yml with: gpu_flavor: mi210 + slack_report_channel: "#transformers-ci-daily-amd" secrets: inherit diff --git a/.github/workflows/self-scheduled-amd-mi250-caller.yml b/.github/workflows/self-scheduled-amd-mi250-caller.yml index dc7d12f173935e..36365d4a67f1e2 100644 --- a/.github/workflows/self-scheduled-amd-mi250-caller.yml +++ b/.github/workflows/self-scheduled-amd-mi250-caller.yml @@ -16,4 +16,5 @@ jobs: uses: ./.github/workflows/self-scheduled-amd.yml with: gpu_flavor: mi250 + slack_report_channel: "#transformers-ci-daily-amd" secrets: inherit diff --git a/.github/workflows/self-scheduled-amd-mi300-caller.yml b/.github/workflows/self-scheduled-amd-mi300-caller.yml new file mode 100644 index 00000000000000..a9e7b934c34b77 --- /dev/null +++ b/.github/workflows/self-scheduled-amd-mi300-caller.yml @@ -0,0 +1,21 @@ +name: Self-hosted runner (AMD mi300 scheduled CI caller) + +on: + workflow_run: + workflows: ["Self-hosted runner (AMD scheduled CI caller)"] + branches: ["main"] + types: [completed] + push: + branches: + - run_amd_scheduled_ci_caller* + +jobs: + run_amd_ci: + name: AMD mi300 + needs: build-docker-containers + if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && (startsWith(github.ref_name, 'run_amd_push_ci_caller') || startsWith(github.ref_name, 'mi300-ci')))) + uses: ./.github/workflows/self-scheduled-amd.yml + with: + gpu_flavor: mi300 + slack_report_channel: "#transformers-ci-daily-amd" + secrets: inherit diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml index d2ab90d1331848..e9f280f51ab43d 100644 --- a/.github/workflows/self-scheduled-amd.yml +++ b/.github/workflows/self-scheduled-amd.yml @@ -34,7 +34,7 @@ jobs: fetch-depth: 2 - name: Check Runner Status - run: python utils/check_self_hosted_runner.py --target_runners hf-amd-mi210-ci-1gpu-1,hf-amd-mi250-ci-1gpu-1 --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} + run: python utils/check_self_hosted_runner.py --target_runners hf-amd-mi210-ci-1gpu-1,hf-amd-mi250-ci-1gpu-1,hf-amd-mi300-ci-1gpu-1 --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} check_runners: name: Check Runners @@ -42,7 +42,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -63,7 +63,7 @@ jobs: strategy: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ 
matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -116,7 +116,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [single-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -162,7 +162,7 @@ jobs: - name: Run all tests on GPU working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} + run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test" - name: Failure short reports if: ${{ failure() }} @@ -184,7 +184,7 @@ jobs: matrix: folders: ${{ fromJson(needs.setup.outputs.matrix) }} machine_type: [multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -230,7 +230,7 @@ jobs: - name: Run all tests on GPU working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} + run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test" - name: Failure short reports if: ${{ failure() }} @@ -250,7 +250,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -287,7 +287,7 @@ jobs: working-directory: /transformers run: | pip install -r examples/pytorch/_tests_requirements.txt - python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_examples_gpu_test_reports examples/pytorch + python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_examples_gpu_test_reports examples/pytorch -m "not not_device_test" - name: Failure short reports if: ${{ failure() }} @@ -307,7 +307,7 @@ jobs: fail-fast: false matrix: machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] container: image: huggingface/transformers-pytorch-amd-gpu options: 
--device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -343,7 +343,7 @@ jobs: - name: Run all pipeline tests on GPU working-directory: /transformers run: | - python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines + python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test" - name: Failure short reports if: ${{ failure() }} @@ -364,7 +364,7 @@ jobs: matrix: machine_type: [single-gpu, multi-gpu] - runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] needs: setup container: image: huggingface/transformers-pytorch-deepspeed-amd-gpu @@ -400,7 +400,7 @@ jobs: - name: Run all tests on GPU working-directory: /transformers - run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended + run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended -m "not not_device_test" - name: Failure short reports if: ${{ failure() }} diff --git a/.github/workflows/slack-report.yml b/.github/workflows/slack-report.yml index 77cfdc8c140241..9339e6a7b455fa 100644 --- a/.github/workflows/slack-report.yml +++ b/.github/workflows/slack-report.yml @@ -19,6 +19,8 @@ on: required: true type: string +env: + TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }} jobs: send_results: @@ -54,18 +56,17 @@ jobs: # empty string, and the called script still get one argument (which is the emtpy string). run: | sudo apt-get install -y curl + pip install huggingface_hub pip install slack_sdk pip show slack_sdk python utils/notification_service.py "${{ inputs.folder_slices }}" # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. - name: Failure table artifacts - # Only the model testing job is concerned for this step - if: ${{ inputs.job == 'run_models_gpu' }} uses: actions/upload-artifact@v4 with: - name: ci_results - path: ci_results + name: ci_results_${{ inputs.job }} + path: ci_results_${{ inputs.job }} - uses: actions/checkout@v4 - uses: actions/download-artifact@v4 @@ -77,11 +78,21 @@ jobs: SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }} CI_EVENT: scheduled CI_SHA: ${{ github.sha }} + CI_TEST_JOB: ${{ inputs.job }} SETUP_STATUS: ${{ inputs.setup_status }} # We pass `needs.setup.outputs.quantization_matrix` as the argument. A processing in `notification_service_quantization.py` to change # `quantization/bnb` to `quantization_bnb` is required, as the artifact names use `_` instead of `/`. run: | sudo apt-get install -y curl + pip install huggingface_hub pip install slack_sdk pip show slack_sdk python utils/notification_service_quantization.py "${{ inputs.quantization_matrix }}" + + # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. 
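+      # Gated on the quantization job so its failure tables are uploaded under the
+      # same per-job artifact name (ci_results_${{ inputs.job }}) as the step above.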
+ - name: Failure table artifacts + if: ${{ inputs.job == 'run_quantization_torch_gpu' }} + uses: actions/upload-artifact@v4 + with: + name: ci_results_${{ inputs.job }} + path: ci_results_${{ inputs.job }} \ No newline at end of file diff --git a/.github/workflows/ssh-runner.yml b/.github/workflows/ssh-runner.yml index 5ea83f2005fdcb..7b47c0f437fa85 100644 --- a/.github/workflows/ssh-runner.yml +++ b/.github/workflows/ssh-runner.yml @@ -9,9 +9,11 @@ on: docker_image: description: 'Name of the Docker image' required: true + num_gpus: + description: 'Type of the number of gpus to use (`single` or `multi`)' + required: true env: - IS_GITHUB_CI: "1" HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} HF_HOME: /mnt/cache TRANSFORMERS_IS_CI: yes @@ -20,12 +22,13 @@ env: RUN_SLOW: yes # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. # This token is created under the bot `hf-transformers-bot`. SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} TF_FORCE_GPU_ALLOW_GROWTH: true + CUDA_VISIBLE_DEVICES: 0,1 RUN_PT_TF_CROSS_TESTS: 1 jobs: ssh_runner: name: "SSH" - runs-on: [single-gpu, nvidia-gpu, "${{ github.event.inputs.runner_type }}", ci] + runs-on: ["${{ github.event.inputs.num_gpus }}-gpu", nvidia-gpu, "${{ github.event.inputs.runner_type }}", ci] container: image: ${{ github.event.inputs.docker_image }} options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -52,7 +55,7 @@ jobs: nvidia-smi - name: Tailscale # In order to be able to SSH when a test fails - uses: huggingface/tailscale-action@v1 + uses: huggingface/tailscale-action@main with: authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }} slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }} diff --git a/Makefile b/Makefile index ebc66d922cdd1b..f9b2a8c9a7c620 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,11 @@ -.PHONY: deps_table_update modified_only_fixup extra_style_checks quality style fixup fix-copies test test-examples +.PHONY: deps_table_update modified_only_fixup extra_style_checks quality style fixup fix-copies test test-examples benchmark # make sure to test the local checkout in scripts and not the pre-installed one (don't use quotes!) export PYTHONPATH = src check_dirs := examples tests src utils -exclude_folders := examples/research_projects +exclude_folders := "" modified_only_fixup: $(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs))) @@ -96,6 +96,11 @@ test: test-examples: python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/ +# Run benchmark + +benchmark: + python3 benchmark/benchmark.py --config-dir benchmark/config --config-name generation --commit=diff backend.model=google/gemma-2b backend.cache_implementation=null,static backend.torch_compile=false,true --multirun + # Run tests for SageMaker DLC release test-sagemaker: # install sagemaker dependencies in advance with pip install .[sagemaker] diff --git a/README_fr.md b/README_fr.md index d58bb0bbca385d..0fffb6d936076d 100644 --- a/README_fr.md +++ b/README_fr.md @@ -288,7 +288,6 @@ Suivez les pages d'installation de Flax, PyTorch ou TensorFlow pour voir comment Nombre actuel de points de contrôle : ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen) - 🤗 Transformers fournit actuellement les architectures suivantes: consultez [ici](https://huggingface.co/docs/transformers/model_summary) pour un résumé global de chacune d'entre elles. 
Pour vérifier si chaque modèle a une implémentation en Flax, PyTorch ou TensorFlow, ou s'il a un tokenizer associé pris en charge par la bibliothèque 🤗 Tokenizers, consultez [ce tableau](https://huggingface.co/docs/transformers/index#supported-frameworks). diff --git a/README_te.md b/README_te.md index 19cbe320624186..f23476efda5f2f 100644 --- a/README_te.md +++ b/README_te.md @@ -293,7 +293,6 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా 🤗 ట్రాన్స్‌ఫార్మర్లు ప్రస్తుతం కింది ఆర్కిటెక్చర్‌లను అందజేస్తున్నాయి: వాటిలో ప్రతి ఒక్కటి ఉన్నత స్థాయి సారాంశం కోసం [ఇక్కడ](https://huggingface.co/docs/transformers/model_summary) చూడండి. - ఈ అమలులు అనేక డేటాసెట్‌లలో పరీక్షించబడ్డాయి (ఉదాహరణ స్క్రిప్ట్‌లను చూడండి) మరియు అసలైన అమలుల పనితీరుతో సరిపోలాలి. మీరు [డాక్యుమెంటేషన్](https://github.com/huggingface/transformers/tree/main/examples) యొక్క ఉదాహరణల విభాగంలో పనితీరుపై మరిన్ని వివరాలను కనుగొనవచ్చు. ## ఇంకా నేర్చుకో diff --git a/SECURITY.md b/SECURITY.md index f5a3acc5a91b93..fcb8b9b6f18f28 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -14,7 +14,7 @@ Models uploaded on the Hugging Face Hub come in different formats. We heavily re models in the [`safetensors`](https://github.com/huggingface/safetensors) format (which is the default prioritized by the transformers library), as developed specifically to prevent arbitrary code execution on your system. -To avoid loading models from unsafe formats(e.g. [pickle](https://docs.python.org/3/library/pickle.html), you should use the `use_safetenstors` parameter. If doing so, in the event that no .safetensors file is present, transformers will error when loading the model. +To avoid loading models from unsafe formats(e.g. [pickle](https://docs.python.org/3/library/pickle.html), you should use the `use_safetensors` parameter. If doing so, in the event that no .safetensors file is present, transformers will error when loading the model. ### Remote code diff --git a/tests/models/deta/__init__.py b/benchmark/__init__.py similarity index 100% rename from tests/models/deta/__init__.py rename to benchmark/__init__.py diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py new file mode 100644 index 00000000000000..9e38c1f70a14ae --- /dev/null +++ b/benchmark/benchmark.py @@ -0,0 +1,310 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Run benchmark using the `optimum-benchmark` library with some customization in `transformers`. 
+ +Assume we are under `transformers` root directory: (make sure the commits are valid commits) +```bash +python benchmark/benchmark.py --config-dir benchmark/config --config-name generation --commit=9b9c7f03da625b13643e99205c691fe046461724 --metrics=decode.latency.mean,per_token.latency.mean,per_token.throughput.value backend.model=google/gemma-2b benchmark.input_shapes.sequence_length=5,7 benchmark.input_shapes.batch_size=1,2 --multirun +``` +""" + +import argparse +import glob +import json +import os.path +import re +import tempfile +from contextlib import contextmanager +from pathlib import Path + +from git import Repo + +from optimum_benchmark import Benchmark +from optimum_benchmark_wrapper import main + + +PATH_TO_REPO = Path(__file__).parent.parent.resolve() + + +@contextmanager +def checkout_commit(repo: Repo, commit_id: str): + """ + Context manager that checks out a given commit when entered, but gets back to the reference it was at on exit. + Args: + repo (`git.Repo`): A git repository (for instance the Transformers repo). + commit_id (`str`): The commit reference to checkout inside the context manager. + """ + current_head = repo.head.commit if repo.head.is_detached else repo.head.ref + + try: + repo.git.checkout(commit_id) + yield + + finally: + repo.git.checkout(current_head) + + +def summarize(run_dir, metrics, expand_metrics=False): + """Produce a summary for each optimum-benchmark launched job's output directory found in `run_dir`. + + Each summary's format is as follows (for `expand_metrics=False`): + ``` + { + "model": "google/gemma-2b", + "commit": "3cd6ed22e4d49219f300f5055e71e3929aba20d7", + "config": "benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5", + "metrics": { + "decode.latency.mean": 1.624666809082031, + "per_token.latency.mean": 0.012843788806628804, + "per_token.throughput.value": 77.85864553330948 + } + } + ``` + """ + reports = glob.glob(os.path.join(run_dir, "**/benchmark_report.json"), recursive=True) + report_dirs = [str(Path(report).parent) for report in reports] + + summaries = [] + for report_dir in report_dirs: + commit = re.search(r"/commit=([^/]+)", report_dir).groups()[0] + + if not os.path.isfile(os.path.join(report_dir, "benchmark.json")): + continue + benchmark = Benchmark.from_json(os.path.join(report_dir, "benchmark.json")) + report = benchmark.report + + model = benchmark.config.backend["model"] + + # Ths looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`. + # (we rely on the usage of hydra's `${hydra.job.override_dirname}`.) + benchmark_name = re.sub(f"backend.model={model},*", "", report_dir) + benchmark_name = str(Path(benchmark_name).parts[-1]) + if benchmark_name.startswith("commit="): + benchmark_name = benchmark.config.name + + metrics_values = {} + # post-processing of report: show a few selected/important metric + for metric in metrics: + keys = metric.split(".") + value = report + current = metrics_values + for key in keys: + # Avoid KeyError when a user's specified metric has typo. + # TODO: Give warnings. 
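+                # e.g. for "decode.latency.mean", this resolves report["decode"]["latency"]["mean"]
+                # one key at a time; a missing key is skipped (continue) instead of raising.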
+ if key not in value: + continue + value = value[key] + + if expand_metrics: + if isinstance(value, dict): + if key not in current: + current[key] = {} + current = current[key] + else: + current[key] = value + + if not expand_metrics: + metrics_values[metric] = value + + # show some config information + print(f"model: {model}") + print(f"commit: {commit}") + print(f"config: {benchmark_name}") + if len(metrics_values) > 0: + print("metrics:") + if expand_metrics: + print(metrics_values) + else: + for metric, value in metrics_values.items(): + print(f" - {metric}: {value}") + print("-" * 80) + + summary = { + "model": model, + "commit": commit, + "config": benchmark_name, + "metrics": metrics_values, + } + summaries.append(summary) + + with open(os.path.join(report_dir, "summary.json"), "w") as fp: + json.dump(summary, fp, indent=4) + + # TODO: upload to Hub + return summaries + + +def combine_summaries(summaries): + """Combine a list of summary obtained from the function `summarize`. + + The combined summary's format is as follows: + ``` + "google/gemma-2b": { + "benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5": { + "3cd6ed22e4d49219f300f5055e71e3929aba20d7": { + "metrics": {"decode.latency.mean": 1.624666809082031} + }, + "c97ee28b117c0abe8e08891f402065e4df6d72aa": { + "metrics": {"decode.latency.mean": 1.6278163452148438} + } + }, + "benchmark.input_shapes.batch_size=2,benchmark.input_shapes.sequence_length=5": { + "3cd6ed22e4d49219f300f5055e71e3929aba20d7": { + "metrics": {"decode.latency.mean": 1.6947791748046876} + }, + "c97ee28b117c0abe8e08891f402065e4df6d72aa": { + "metrics": { + "decode.latency.mean": 1.6980519409179688} + } + } + } + ``` + """ + combined = {} + for summary in summaries: + model = summary["model"] + config = summary["config"] + commit = summary["commit"] + + if model not in combined: + combined[model] = {} + + if config not in combined[model]: + combined[model][config] = {} + + if commit not in combined[model][config]: + combined[model][config][commit] = {"metrics": summary["metrics"]} + + with open(os.path.join(exp_run_dir, "summary.json"), "w") as fp: + json.dump(combined, fp, indent=4) + + # TODO: upload to Hub + print(json.dumps(combined, indent=4)) + + return combined + + +if __name__ == "__main__": + + def list_str(values): + return values.split(",") + + parser = argparse.ArgumentParser() + + parser.add_argument("--config-dir", type=str, required=True, help="The path to the config directory.") + parser.add_argument("--config-name", type=str, required=True, help="The config name.") + + # arguments specific to this wrapper for our own customization + parser.add_argument("--ensure_empty", type=bool, default=True, help="If to create a temporary directory.") + parser.add_argument( + "--commit", + type=list_str, + default="", + help="Comma-separated list of branch names and/or commit sha values on which the benchmark will run. 
If `diff` is specified, it will run on both the current head and the `main` branch.", + ) + parser.add_argument("--metrics", type=str, help="The metrics to be included in the summary.") + args, optimum_benchmark_args = parser.parse_known_args() + + repo = Repo(PATH_TO_REPO) + + metrics = [ + "prefill.latency.mean", + "prefill.throughput.value", + "decode.latency.mean", + "decode.throughput.value", + "per_token.latency.mean", + "per_token.throughput.value", + ] + if args.metrics is not None: + metrics = args.metrics.split(",") + + # Get `backend.model` in a hacky way: We want to control the experiment flow manually. + models = [""] + for idx, arg in enumerate(optimum_benchmark_args): + if arg.startswith("backend.model="): + models = arg[len("backend.model=") :] + models = models.split(",") + break + optimum_benchmark_args = [arg for arg in optimum_benchmark_args if not arg.startswith("backend.model=")] + + # Get the commit(s) + current_head = str(repo.head.commit) if repo.head.is_detached else str(repo.head.ref) + commits = [x for x in args.commit if x != ""] + if len(commits) == 0: + commits = [current_head] + elif len(commits) == 1 and commits[0] == "diff": + # compare to `main` + commits = ["main", current_head] + + # Get the specified run directory + run_dir_arg_idx, run_dir = -1, None + sweep_dir_arg_idx, sweep_dir = -1, None + for idx, arg in enumerate(optimum_benchmark_args): + if arg.startswith("hydra.run.dir="): + run_dir = arg[len("hydra.run.dir=") :] + run_dir_arg_idx = idx + elif arg.startswith("hydra.sweep.dir="): + sweep_dir = arg[len("hydra.sweep.dir=") :] + sweep_dir_arg_idx = idx + exp_run_dir, arg_dix, arg_name = ( + (sweep_dir, sweep_dir_arg_idx, "hydra.sweep.dir") + if "--multirun" in optimum_benchmark_args + else (run_dir, run_dir_arg_idx, "hydra.run.dir") + ) + + # TODO: not hardcoded + if exp_run_dir is None and args.ensure_empty: + exp_run_dir = "_benchmark" + + if args.ensure_empty: + os.makedirs(exp_run_dir, exist_ok=True) + exp_run_dir = tempfile.mkdtemp(dir=exp_run_dir) + + run_summaries = [] + for commit in commits: + with checkout_commit(repo, commit): + commit = str(repo.head.commit) + + commit_run_dir = exp_run_dir + if exp_run_dir is not None: + commit_run_dir = os.path.join(exp_run_dir, rf"commit\={commit}") + + print(f"Run benchmark on commit: {commit}") + + for model in models: + model_arg = [f"backend.model={model}"] if model != "" else [] + dir_args = [] + if commit_run_dir is not None: + if arg_dix > -1: + optimum_benchmark_args[arg_dix] = f"{arg_name}={commit_run_dir}" + else: + dir_args = [ + f"hydra.sweep.dir={commit_run_dir}", + f"hydra.run.dir={commit_run_dir}/" + "${hydra.job.override_dirname}", + ] + main(args.config_dir, args.config_name, model_arg + dir_args + optimum_benchmark_args) + + if commit_run_dir is not None: + # Need to remove the `\` character + summaries = summarize(commit_run_dir.replace("\\", ""), metrics) + run_summaries.extend(summaries) + + # aggregate the information across the commits + if exp_run_dir is not None: + with open(os.path.join(exp_run_dir, "summaries.json"), "w") as fp: + json.dump(run_summaries, fp, indent=4) + + combined_summary = combine_summaries(run_summaries) diff --git a/benchmark/config/generation.yaml b/benchmark/config/generation.yaml new file mode 100644 index 00000000000000..44a3f9ea490154 --- /dev/null +++ b/benchmark/config/generation.yaml @@ -0,0 +1,57 @@ +defaults: + - benchmark # inheriting benchmark schema + - scenario: inference + - launcher: process + - backend: pytorch + - _self_ # for hydra 
1.1 compatibility + +name: pytorch_generate + +launcher: + start_method: spawn + device_isolation: true + device_isolation_action: warn + +backend: + device: cuda + device_ids: 0 + no_weights: true + model: meta-llama/Llama-2-7b-hf + cache_implementation: static + torch_compile: true + torch_dtype: float16 + torch_compile_config: + backend: inductor + mode: reduce-overhead + fullgraph: true + +scenario: + input_shapes: + batch_size: 1 + sequence_length: 7 + generate_kwargs: + max_new_tokens: 128 + min_new_tokens: 128 + do_sample: false + memory: true + latency: true + iterations: 2 + duration: 0 + + +# hydra/cli specific settings +hydra: + run: + # where to store run results + dir: runs/${name} + job: + # change working directory to the run directory + chdir: true + env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before + OVERRIDE_BENCHMARKS: 1 + LOG_LEVEL: WARN + sweep: + dir: multirun + subdir: ${hydra.job.override_dirname} \ No newline at end of file diff --git a/benchmark/optimum_benchmark_wrapper.py b/benchmark/optimum_benchmark_wrapper.py new file mode 100644 index 00000000000000..c43e9a73e3160d --- /dev/null +++ b/benchmark/optimum_benchmark_wrapper.py @@ -0,0 +1,16 @@ +import argparse +import subprocess + + +def main(config_dir, config_name, args): + subprocess.run(["optimum-benchmark", "--config-dir", f"{config_dir}", "--config-name", f"{config_name}"] + ["hydra/job_logging=disabled", "hydra/hydra_logging=disabled"] + args) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--config-dir", type=str, required=True, help="The path to the config directory.") + parser.add_argument("--config-name", type=str, required=True, help="The config name.") + args, unknown = parser.parse_known_args() + + main(args.config_dir, args.config_name, unknown) diff --git a/docker/consistency.dockerfile b/docker/consistency.dockerfile index fa94259e2dedae..c9200799ae1ae4 100644 --- a/docker/consistency.dockerfile +++ b/docker/consistency.dockerfile @@ -1,12 +1,13 @@ FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 USER root +ARG REF=main RUN apt-get update && apt-get install -y time git pkg-config make git-lfs -ENV VIRTUAL_ENV=/usr/local +ENV UV_PYTHON=/usr/local/bin/python RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools GitPython RUN uv pip install --no-cache-dir --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu RUN uv pip install --no-cache-dir tensorflow-cpu tf-keras -RUN uv pip install --no-cache-dir "transformers[flax,quality,vision,testing]" +RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,quality,vision,testing]" RUN git lfs install RUN pip uninstall -y transformers diff --git a/docker/custom-tokenizers.dockerfile b/docker/custom-tokenizers.dockerfile index 19860841da629e..5d95e689654ad6 100644 --- a/docker/custom-tokenizers.dockerfile +++ b/docker/custom-tokenizers.dockerfile @@ -2,7 +2,7 @@ FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 USER root RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake wget xz-utils build-essential g++5 libprotobuf-dev protobuf-compiler -ENV VIRTUAL_ENV=/usr/local +ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools RUN wget https://github.com/ku-nlp/jumanpp/releases/download/v2.0.0-rc3/jumanpp-2.0.0-rc3.tar.xz diff 
--git a/docker/examples-tf.dockerfile b/docker/examples-tf.dockerfile index 898f199504da54..9281630d3af2c9 100644 --- a/docker/examples-tf.dockerfile +++ b/docker/examples-tf.dockerfile @@ -3,7 +3,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 USER root RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git RUN apt-get install -y g++ cmake -ENV VIRTUAL_ENV=/usr/local +ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv RUN uv pip install --no-cache-dir -U pip setuptools albumentations seqeval RUN pip install --upgrade --no-cache-dir "transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]" diff --git a/docker/examples-torch.dockerfile b/docker/examples-torch.dockerfile index fa8d63865da12b..da9afcb801da11 100644 --- a/docker/examples-torch.dockerfile +++ b/docker/examples-torch.dockerfile @@ -2,7 +2,7 @@ FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 USER root RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git -ENV VIRTUAL_ENV=/usr/local +ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu diff --git a/docker/exotic-models.dockerfile b/docker/exotic-models.dockerfile index ea2db367402563..2371ffb91c97ce 100644 --- a/docker/exotic-models.dockerfile +++ b/docker/exotic-models.dockerfile @@ -3,7 +3,7 @@ ENV PYTHONDONTWRITEBYTECODE=1 ARG REF=main USER root RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1-mesa-glx libgl1 g++ tesseract-ocr -ENV VIRTUAL_ENV=/usr/local +ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu RUN uv pip install --no-cache-dir --no-deps timm accelerate diff --git a/docker/jax-light.dockerfile b/docker/jax-light.dockerfile index 838333062839e8..315b526a7144d3 100644 --- a/docker/jax-light.dockerfile +++ b/docker/jax-light.dockerfile @@ -1,9 +1,10 @@ FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 +ARG REF=main USER root RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git g++ cmake -ENV VIRTUAL_ENV=/usr/local +ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools -RUN pip install --no-cache-dir "scipy<1.13" "transformers[flax,testing,sentencepiece,flax-speech,vision]" +RUN pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,testing,sentencepiece,flax-speech,vision]" RUN pip uninstall -y transformers RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean \ No newline at end of file diff --git a/docker/pipeline-tf.dockerfile b/docker/pipeline-tf.dockerfile index 81af0ea9c15b91..393738ff87ff17 100644 --- a/docker/pipeline-tf.dockerfile +++ b/docker/pipeline-tf.dockerfile @@ -1,9 +1,10 @@ FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 +ARG REF=main USER root RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake g++ -ENV VIRTUAL_ENV=/usr/local +ENV 
UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools -RUN pip install --no-cache-dir "transformers[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]" +RUN pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]" RUN uv pip install --no-cache-dir "protobuf==3.20.3" tensorflow_probability RUN apt-get clean && rm -rf /var/lib/apt/lists/* \ No newline at end of file diff --git a/docker/pipeline-torch.dockerfile b/docker/pipeline-torch.dockerfile index 554d9783144232..992891a54a417c 100644 --- a/docker/pipeline-torch.dockerfile +++ b/docker/pipeline-torch.dockerfile @@ -1,10 +1,11 @@ FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 +ARG REF=main USER root RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git -ENV VIRTUAL_ENV=/usr/local +ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu -RUN uv pip install --no-cache-dir librosa "transformers[sklearn,sentencepiece,vision,testing]" +RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" RUN pip uninstall -y transformers \ No newline at end of file diff --git a/docker/quality.dockerfile b/docker/quality.dockerfile index 471af0526b4f53..7a4145517a7666 100644 --- a/docker/quality.dockerfile +++ b/docker/quality.dockerfile @@ -1,8 +1,9 @@ FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 +ARG REF=main USER root RUN apt-get update && apt-get install -y time git -ENV VIRTUAL_ENV=/usr/local +ENV UV_PYTHON=/usr/local/bin/python RUN pip install uv && uv venv -RUN uv pip install --no-cache-dir -U pip setuptools GitPython transformers "ruff==0.1.5" urllib3 +RUN uv pip install --no-cache-dir -U pip setuptools GitPython "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ruff]" urllib3 RUN apt-get install -y jq curl && apt-get clean && rm -rf /var/lib/apt/lists/* \ No newline at end of file diff --git a/docker/tf-light.dockerfile b/docker/tf-light.dockerfile index 23dcd40db2094b..7168ddae1227cf 100644 --- a/docker/tf-light.dockerfile +++ b/docker/tf-light.dockerfile @@ -1,11 +1,12 @@ FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 +ARG REF=main USER root RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ pkg-config openssh-client git RUN apt-get install -y cmake -ENV VIRTUAL_ENV=/usr/local +ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools -RUN pip install --upgrade --no-cache-dir "transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]" +RUN pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]" RUN uv pip install --no-cache-dir "protobuf==3.20.3" RUN pip uninstall -y transformers RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean \ No newline at end 
of file diff --git a/docker/torch-jax-light.dockerfile b/docker/torch-jax-light.dockerfile index ef28563ec102ba..7cfa141732fefd 100644 --- a/docker/torch-jax-light.dockerfile +++ b/docker/torch-jax-light.dockerfile @@ -1,12 +1,13 @@ FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 +ARG REF=main USER root RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git -ENV VIRTUAL_ENV=/usr/local +ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools RUN uv pip install --no-deps accelerate RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu -RUN pip install --no-cache-dir "scipy<1.13" "transformers[flax, audio, sklearn,sentencepiece,vision,testing]" +RUN pip install --no-cache-dir "scipy<1.13" "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,audio,sklearn,sentencepiece,vision,testing]" # RUN pip install --no-cache-dir "scipy<1.13" "transformers[flax,testing,sentencepiece,flax-speech,vision]" diff --git a/docker/torch-light.dockerfile b/docker/torch-light.dockerfile index 2172d66ca8a769..524a68fd55407f 100644 --- a/docker/torch-light.dockerfile +++ b/docker/torch-light.dockerfile @@ -1,10 +1,11 @@ FROM python:3.10-slim ENV PYTHONDONTWRITEBYTECODE=1 +ARG REF=main USER root RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs -ENV VIRTUAL_ENV=/usr/local +ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu -RUN uv pip install --no-cache-dir librosa "transformers[sklearn,sentencepiece,vision,testing]" +RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" RUN pip uninstall -y transformers \ No newline at end of file diff --git a/docker/torch-tf-light.dockerfile b/docker/torch-tf-light.dockerfile index f7122930b444ec..ac35b6be81f872 100644 --- a/docker/torch-tf-light.dockerfile +++ b/docker/torch-tf-light.dockerfile @@ -4,7 +4,7 @@ ARG REF=main RUN echo ${REF} USER root RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs -ENV VIRTUAL_ENV=/usr/local +ENV UV_PYTHON=/usr/local/bin/python RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index 3d9ddfb258d223..378a65d1bf37b8 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 +FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive @@ -9,11 +9,11 @@ SHELL ["sh", "-lc"] # The 
following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant # to be used as arguments for docker build (so far). -ARG PYTORCH='2.2.1' +ARG PYTORCH='2.3.0' # (not always a valid torch version) -ARG INTEL_TORCH_EXT='2.2.0' +ARG INTEL_TORCH_EXT='2.3.0' # Example: `cu102`, `cu113`, etc. -ARG CUDA='cu118' +ARG CUDA='cu121' RUN apt update RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs @@ -48,6 +48,13 @@ RUN python3 -m pip install --no-cache-dir decord av==9.2.0 # Some slow tests require bnb RUN python3 -m pip install --no-cache-dir bitsandbytes +# Some tests require quanto +RUN python3 -m pip install --no-cache-dir quanto + +# `quanto` will install `ninja` which leads to many `CUDA error: an illegal memory access ...` in some model tests +# (`deformable_detr`, `rwkv`, `mra`) +RUN python3 -m pip uninstall -y ninja + # For `dinat` model # The `XXX` part in `torchXXX` needs to match `PYTORCH` (to some extent) RUN python3 -m pip install --no-cache-dir natten==0.15.1+torch220$CUDA -f https://shi-labs.com/natten/wheels diff --git a/docker/transformers-pytorch-amd-gpu/Dockerfile b/docker/transformers-pytorch-amd-gpu/Dockerfile index 0b070c93a64f3d..da91906d621429 100644 --- a/docker/transformers-pytorch-amd-gpu/Dockerfile +++ b/docker/transformers-pytorch-amd-gpu/Dockerfile @@ -1,24 +1,19 @@ -FROM rocm/dev-ubuntu-20.04:5.6 +FROM rocm/dev-ubuntu-22.04:6.0.2 # rocm/pytorch has no version with 2.1.0 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive -ARG PYTORCH='2.1.0' -ARG TORCH_VISION='0.16.0' -ARG TORCH_AUDIO='2.1.0' -ARG ROCM='5.6' - RUN apt update && \ - apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-dev python3-pip ffmpeg && \ + apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-dev python3-pip python3-dev ffmpeg && \ apt clean && \ rm -rf /var/lib/apt/lists/* -RUN python3 -m pip install --no-cache-dir --upgrade pip +RUN python3 -m pip install --no-cache-dir --upgrade pip numpy -RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM +RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0 -RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0" +RUN python3 -m pip install --no-cache-dir --upgrade importlib-metadata setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0" ARG REF=main WORKDIR / @@ -35,5 +30,5 @@ RUN python3 -m pip uninstall -y tensorflow flax # this line must be added in order for python to be aware of transformers. RUN cd transformers && python3 setup.py develop -# Remove nvml as it is not compatible with ROCm -RUN python3 -m pip uninstall py3nvml pynvml -y +# Remove nvml as it is not compatible with ROCm. apex is not tested on NVIDIA either. 
+RUN python3 -m pip uninstall py3nvml pynvml apex -y diff --git a/docker/transformers-pytorch-gpu/Dockerfile b/docker/transformers-pytorch-gpu/Dockerfile index a45210e7d1148c..c9f77a78ce9b83 100644 --- a/docker/transformers-pytorch-gpu/Dockerfile +++ b/docker/transformers-pytorch-gpu/Dockerfile @@ -11,7 +11,7 @@ ARG REF=main RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF # If set to nothing, will install the latest version -ARG PYTORCH='2.1.1' +ARG PYTORCH='2.3.0' ARG TORCH_VISION='' ARG TORCH_AUDIO='' # Example: `cu102`, `cu113`, etc. diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index 47fcd11fd766d7..6d94dbee5aa0e9 100755 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -48,6 +48,9 @@ RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2 # Add hqq for quantization testing RUN python3 -m pip install --no-cache-dir hqq +# For GGUF tests +RUN python3 -m pip install --no-cache-dir gguf + # Add autoawq for quantization testing # >=v0.2.3 needed for compatibility with torch 2.2.1 RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp38-cp38-linux_x86_64.whl @@ -60,4 +63,4 @@ RUN python3 -m pip install git+https://github.com/NetEase-FuXi/EETQ.git # When installing in editable mode, `transformers` is not recognized as a package. # this line must be added in order for python to be aware of transformers. -RUN cd transformers && python3 setup.py develop \ No newline at end of file +RUN cd transformers && python3 setup.py develop diff --git a/docker/transformers-tensorflow-gpu/Dockerfile b/docker/transformers-tensorflow-gpu/Dockerfile index df9039a0c4d28e..adccee1ace4998 100644 --- a/docker/transformers-tensorflow-gpu/Dockerfile +++ b/docker/transformers-tensorflow-gpu/Dockerfile @@ -1,4 +1,4 @@ -FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 +FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04 LABEL maintainer="Hugging Face" ARG DEBIAN_FRONTEND=noninteractive diff --git a/docs/source/de/installation.md b/docs/source/de/installation.md index 55d0f2d8512d47..1bd34f73302b27 100644 --- a/docs/source/de/installation.md +++ b/docs/source/de/installation.md @@ -162,7 +162,7 @@ Transformers verwendet die Shell-Umgebungsvariablen `PYTORCH_TRANSFORMERS_CACHE` ## Offline Modus -Transformers ist in der Lage, in einer Firewall- oder Offline-Umgebung zu laufen, indem es nur lokale Dateien verwendet. Setzen Sie die Umgebungsvariable `TRANSFORMERS_OFFLINE=1`, um dieses Verhalten zu aktivieren. +Transformers ist in der Lage, in einer Firewall- oder Offline-Umgebung zu laufen, indem es nur lokale Dateien verwendet. Setzen Sie die Umgebungsvariable `HF_HUB_OFFLINE=1`, um dieses Verhalten zu aktivieren. @@ -179,7 +179,7 @@ python examples/pytorch/translation/run_translation.py --model_name_or_path goog Führen Sie das gleiche Programm in einer Offline-Instanz mit aus: ```bash -HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \ +HF_DATASETS_OFFLINE=1 HF_HUB_OFFLINE=1 \ python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ... 
``` diff --git a/docs/source/de/peft.md b/docs/source/de/peft.md index bdc0684d798d3a..eda8ce9435a055 100644 --- a/docs/source/de/peft.md +++ b/docs/source/de/peft.md @@ -86,10 +86,10 @@ model.load_adapter(peft_model_id) Die `bitsandbytes`-Integration unterstützt Datentypen mit 8bit und 4bit Genauigkeit, was für das Laden großer Modelle nützlich ist, weil es Speicher spart (lesen Sie den `bitsandbytes`-Integrations [guide](./quantization#bitsandbytes-integration), um mehr zu erfahren). Fügen Sie die Parameter `load_in_8bit` oder `load_in_4bit` zu [`~PreTrainedModel.from_pretrained`] hinzu und setzen Sie `device_map="auto"`, um das Modell effektiv auf Ihre Hardware zu verteilen: ```py -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig peft_model_id = "ybelkada/opt-350m-lora" -model = AutoModelForCausalLM.from_pretrained(peft_model_id, device_map="auto", load_in_8bit=True) +model = AutoModelForCausalLM.from_pretrained(peft_model_id, quantization_config=BitsAndBytesConfig(load_in_8bit=True)) ``` ## Einen neuen Adapter hinzufügen diff --git a/docs/source/en/_redirects.yml b/docs/source/en/_redirects.yml index b6575a6b02f205..ff70547c722841 100644 --- a/docs/source/en/_redirects.yml +++ b/docs/source/en/_redirects.yml @@ -1,3 +1,5 @@ # Optimizing inference perf_infer_gpu_many: perf_infer_gpu_one +transformers_agents: agents +quantization: quantization/overview diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index fbdf3c4f7bb515..f81e712cf5b4fa 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -135,16 +135,36 @@ title: Community resources - local: troubleshooting title: Troubleshoot - - local: hf_quantizer - title: Contribute new quantization method + - local: gguf + title: Interoperability with GGUF files title: Developer guides +- sections: + - local: quantization/overview + title: Getting started + - local: quantization/bitsandbytes + title: bitsandbytes + - local: quantization/gptq + title: GPTQ + - local: quantization/awq + title: AWQ + - local: quantization/aqlm + title: AQLM + - local: quantization/quanto + title: Quanto + - local: quantization/eetq + title: EETQ + - local: quantization/hqq + title: HQQ + - local: quantization/optimum + title: Optimum + - local: quantization/contribute + title: Contribute new quantization method + title: Quantization Methods - sections: - local: performance title: Overview - local: llm_optims title: LLM inference optimization - - local: quantization - title: Quantization - sections: - local: perf_train_gpu_one title: Methods and tools for efficient training on a single GPU @@ -386,6 +406,8 @@ title: I-BERT - local: model_doc/jamba title: Jamba + - local: model_doc/jetmoe + title: JetMoe - local: model_doc/jukebox title: Jukebox - local: model_doc/led @@ -784,6 +806,8 @@ title: OWL-ViT - local: model_doc/owlv2 title: OWLv2 + - local: model_doc/paligemma + title: PaliGemma - local: model_doc/perceiver title: Perceiver - local: model_doc/pix2struct @@ -804,6 +828,8 @@ title: TVP - local: model_doc/udop title: UDOP + - local: model_doc/video_llava + title: VideoLlava - local: model_doc/vilt title: ViLT - local: model_doc/vipllava diff --git a/docs/source/en/agents.md b/docs/source/en/agents.md index ae9e5db2b7897b..2cacaed5902c4d 100644 --- a/docs/source/en/agents.md +++ b/docs/source/en/agents.md @@ -28,8 +28,8 @@ An agent is a system that uses an LLM as its engine, and it has access to functi These *tools* are 
functions for performing a task, and they contain all necessary description for the agent to properly use them. The agent can be programmed to: -- devise a series of actions/tools and run them all at once like the `CodeAgent` for example -- plan and execute actions/tools one by one and wait for the outcome of each action before launching the next one like the `ReactJsonAgent` for example +- devise a series of actions/tools and run them all at once like the [`CodeAgent`] for example +- plan and execute actions/tools one by one and wait for the outcome of each action before launching the next one like the [`ReactJsonAgent`] for example ### Types of agents @@ -42,8 +42,8 @@ This agent has a planning step, then generates python code to execute all its ac This is the go-to agent to solve reasoning tasks, since the ReAct framework ([Yao et al., 2022](https://huggingface.co/papers/2210.03629)) makes it really efficient to think on the basis of its previous observations. We implement two versions of ReactJsonAgent: -- [`~ReactJsonAgent`] generates tool calls as a JSON in its output. -- [`~ReactCodeAgent`] is a new type of ReactJsonAgent that generates its tool calls as blobs of code, which works really well for LLMs that have strong coding performance. +- [`ReactJsonAgent`] generates tool calls as a JSON in its output. +- [`ReactCodeAgent`] is a new type of ReactJsonAgent that generates its tool calls as blobs of code, which works really well for LLMs that have strong coding performance. > [!TIP] > Read [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) blog post to learn more the ReAct agent. @@ -124,7 +124,7 @@ You could use any `llm_engine` method as long as: You also need a `tools` argument which accepts a list of `Tools`. You can provide an empty list for `tools`, but use the default toolbox with the optional argument `add_base_tools=True`. -Now you can create an agent, like `CodeAgent`, and run it. For convenience, we also provide the `HfEngine` class that uses `huggingface_hub.InferenceClient` under the hood. +Now you can create an agent, like [`CodeAgent`], and run it. For convenience, we also provide the [`HfEngine`] class that uses `huggingface_hub.InferenceClient` under the hood. ```python from transformers import CodeAgent, HfEngine @@ -139,7 +139,7 @@ agent.run( ``` This will be handy in case of emergency baguette need! -You can even leave the argument `llm_engine` undefined, and an [~HfEngine] will be created by default. +You can even leave the argument `llm_engine` undefined, and an [`HfEngine`] will be created by default. ```python from transformers import CodeAgent @@ -181,13 +181,27 @@ You can also run an agent consecutively for different tasks: each time the attri A Python interpreter executes the code on a set of inputs passed along with your tools. This should be safe because the only functions that can be called are the tools you provided (especially if it's only tools by Hugging Face) and the print function, so you're already limited in what can be executed. -The Python interpreter also doesn't allow any attribute lookup or imports (which shouldn't be needed for passing inputs/outputs to a small set of functions) so all the most obvious attacks shouldn't be an issue. +The Python interpreter also doesn't allow imports by default outside of a safe list, so all the most obvious attacks shouldn't be an issue. 
+You can still authorize additional imports by passing the authorized modules as a list of strings in the `additional_authorized_imports` argument upon initialization of your [`ReactCodeAgent`] or [`CodeAgent`]: + +```py +>>> from transformers import ReactCodeAgent + +>>> agent = ReactCodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4']) +>>> agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?") + +(...) +'Hugging Face – Blog' +``` The execution will stop at any code trying to perform an illegal operation or if there is a regular Python error with the code generated by the agent. +> [!WARNING] +> The LLM can generate arbitrary code that will then be executed: do not add any unsafe imports! + ### The system prompt -An agent, or rather the LLM that drives the agent, generates an output based on the system prompt. The system prompt can be customized and tailored to the intended task. For example, check the system prompt for the `ReactCodeAgent` (below version is slightly simplified). +An agent, or rather the LLM that drives the agent, generates an output based on the system prompt. The system prompt can be customized and tailored to the intended task. For example, check the system prompt for the [`ReactCodeAgent`] (the version below is slightly simplified). ```text You will be given a task to solve as best you can. @@ -246,7 +260,7 @@ of the available tools. A tool is an atomic function to be used by an agent. -You can for instance check the [~PythonInterpreterTool]: it has a name, a description, input descriptions, an output type, and a `__call__` method to perform the action. +You can for instance check the [`PythonInterpreterTool`]: it has a name, a description, input descriptions, an output type, and a `__call__` method to perform the action. When the agent is initialized, the tool attributes are used to generate a tool description which is baked into the agent's system prompt. This lets the agent know which tools it can use and why. @@ -259,7 +273,7 @@ Transformers comes with a default toolbox for empowering agents, that you can ad - **Speech to text**: given an audio recording of a person talking, transcribe the speech into text ([Whisper](./model_doc/whisper)) - **Text to speech**: convert text to speech ([SpeechT5](./model_doc/speecht5)) - **Translation**: translates a given sentence from source language to target language. -- **Python code interpreter**: runs your the LLM generated Python code in a secure environment. This tool will only be added to [~ReactJsonAgent] if you use `add_base_tools=True`, since code-based tools can already execute Python code +- **Python code interpreter**: runs the LLM-generated Python code in a secure environment. This tool will only be added to [`ReactJsonAgent`] if you use `add_base_tools=True`, since code-based tools can already execute Python code You can manually use a tool by calling the [`load_tool`] function and a task to perform. diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index 3d4829c3e37f18..b000cc06779918 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -173,6 +173,92 @@ your screen, one word at a time: An increasing sequence: one, two, three, four, five, six, seven, eight, nine, ten, eleven, ``` + +## KV Cache Quantization + +The `generate()` method supports caching keys and values to enhance efficiency and avoid re-computations.
However, the key and value +cache can occupy a large portion of memory, becoming a bottleneck for long-context generation, especially for Large Language Models. +Quantizing the cache when using `generate()` can significantly reduce memory requirements at the cost of speed. + +KV Cache quantization in `transformers` is largely inspired by the paper [KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache](https://arxiv.org/abs/2402.02750) and currently supports `quanto` and `HQQ` as backends. For more information on the inner workings, see the paper. + +To enable quantization of the key-value cache, one needs to indicate `cache_implementation="quantized"` in the `generation_config`. +Quantization-related arguments should be passed to the `generation_config` either as a `dict` or an instance of a [`QuantizedCacheConfig`] class. +One has to indicate which quantization backend to use in the [`QuantizedCacheConfig`]; the default is `quanto`. + + + +Cache quantization can be detrimental if the context length is short and there is enough GPU VRAM available to run without cache quantization. + + + + +```python +>>> import torch +>>> from transformers import AutoTokenizer, AutoModelForCausalLM + +>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") +>>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0") +>>> inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device) + +>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "backend": "quanto"}) +>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0]) +I like rock music because it's loud and energetic. It's a great way to express myself and rel + +>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20) +>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0]) +I like rock music because it's loud and energetic. I like to listen to it when I'm feeling +``` + +## Watermarking + +The `generate()` method supports watermarking the generated text by randomly marking a portion of tokens as "green". +When generating, the "green" tokens will have a small 'bias' value added to their logits, giving them a higher chance of being generated. +The watermarked text can be detected by calculating the proportion of "green" tokens in the text and estimating how likely it is +statistically to obtain that amount of "green" tokens for human-generated text. This watermarking strategy was proposed in the paper +["On the Reliability of Watermarks for Large Language Models"](https://arxiv.org/abs/2306.04634). For more information on +the inner functioning of watermarking, it is recommended to refer to the paper. + +Watermarking can be used with any generative model in `transformers` and does not require an extra classification model +to detect watermarked text. To trigger watermarking, pass in a [`WatermarkingConfig`] with the needed arguments directly to the +`.generate()` method or add it to the [`GenerationConfig`]. Watermarked text can later be detected with a [`WatermarkDetector`]. + + + + +The WatermarkDetector internally relies on the proportion of "green" tokens, and whether generated text follows the coloring pattern. +That is why it is recommended to strip off the prompt text if it is much longer than the generated text. +This can also have an effect when one sequence in the batch is a lot longer than the others, causing the other rows to be padded.
+Additionally, the detector **must** be initiated with identical watermark configuration arguments used when generating. + + + +Let's generate some text with watermarking. In the below code snippet, we set the bias to 2.5 which is a value that +will be added to "green" tokens' logits. After generating watermarked text, we can pass it directly to the `WatermarkDetector` +to check if the text is machine-generated (outputs `True` for machine-generated and `False` otherwise). + +```python +>>> from transformers import AutoTokenizer, AutoModelForCausalLM, WatermarkDetector, WatermarkingConfig + +>>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2") +>>> tok = AutoTokenizer.from_pretrained("openai-community/gpt2") +>>> tok.pad_token_id = tok.eos_token_id +>>> tok.padding_side = "left" + +>>> inputs = tok(["This is the beginning of a long story", "Alice and Bob are"], padding=True, return_tensors="pt") +>>> input_len = inputs["input_ids"].shape[-1] + +>>> watermarking_config = WatermarkingConfig(bias=2.5, seeding_scheme="selfhash") +>>> out = model.generate(**inputs, watermarking_config=watermarking_config, do_sample=False, max_length=20) + +>>> detector = WatermarkDetector(model_config=model.config, device="cpu", watermarking_config=watermarking_config) +>>> detection_out = detector(out, return_dict=True) +>>> detection_out.prediction +array([True, True]) +``` + + ## Decoding strategies Certain combinations of the `generate()` parameters, and ultimately `generation_config`, can be used to enable specific diff --git a/docs/source/en/gguf.md b/docs/source/en/gguf.md new file mode 100644 index 00000000000000..359ed4d5e1e8c6 --- /dev/null +++ b/docs/source/en/gguf.md @@ -0,0 +1,97 @@ + + +# GGUF and interaction with Transformers + +The GGUF file format is used to store models for inference with [GGML](https://github.com/ggerganov/ggml) and other +libraries that depend on it, like the very popular [llama.cpp](https://github.com/ggerganov/llama.cpp) or +[whisper.cpp](https://github.com/ggerganov/whisper.cpp). + +It is a file format [supported by the Hugging Face Hub](https://huggingface.co/docs/hub/en/gguf) with features +allowing for quick inspection of tensors and metadata within the file. + +This file format is designed as a "single-file-format" where a single file usually contains both the configuration +attributes, the tokenizer vocabulary and other attributes, as well as all tensors to be loaded in the model. These +files come in different formats according to the quantization type of the file. We briefly go over some of them +[here](https://huggingface.co/docs/hub/en/gguf#quantization-types). + +## Support within Transformers + +We have added the ability to load `gguf` files within `transformers` in order to offer further training/fine-tuning +capabilities to gguf models, before converting back those models to `gguf` to use within the `ggml` ecosystem. When +loading a model, we first dequantize it to fp32, before loading the weights to be used in PyTorch. + +> [!NOTE] +> The support is still very exploratory and we welcome contributions in order to solidify it across quantization types +> and model architectures. + +For now, here are the supported model architectures and quantization types: + +### Supported quantization types + +The initial supported quantization types are decided according to the popular quantized files that have been shared +on the Hub. 
+ +- F32 +- Q2_K +- Q3_K +- Q4_0 +- Q4_K +- Q5_K +- Q6_K +- Q8_0 + +We take example from the excellent [99991/pygguf](https://github.com/99991/pygguf) Python parser to dequantize the +weights. + +### Supported model architectures + +For now the supported model architectures are the architectures that have been very popular on the Hub, namely: + +- LLaMa +- Mistral +- Qwen2 + +## Example usage + +In order to load `gguf` files in `transformers`, you should specify the `gguf_file` argument to the `from_pretrained` +methods of both tokenizers and models. Here is how one would load a tokenizer and a model, which can be loaded +from the exact same file: + +```py +from transformers import AutoTokenizer, AutoModelForCausalLM + +model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF" +filename = "tinyllama-1.1b-chat-v1.0.Q6_K.gguf" + +tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename) +model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename) +``` + +Now you have access to the full, unquantized version of the model in the PyTorch ecosystem, where you can combine it +with a plethora of other tools. + +In order to convert back to a `gguf` file, we recommend using the +[`convert-hf-to-gguf.py` file](https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py) from llama.cpp. + +Here's how you would complete the script above to save the model and export it back to `gguf`: + +```py +tokenizer.save_pretrained('directory') +model.save_pretrained('directory') + +!python ${path_to_llama_cpp}/convert-hf-to-gguf.py ${directory} +``` diff --git a/docs/source/en/index.md b/docs/source/en/index.md index b12913a7222577..31d8b770ed8403 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -160,12 +160,13 @@ Flax), PyTorch, and/or TensorFlow. | [HerBERT](model_doc/herbert) | ✅ | ✅ | ✅ | | [Hubert](model_doc/hubert) | ✅ | ✅ | ❌ | | [I-BERT](model_doc/ibert) | ✅ | ❌ | ❌ | -| [IDEFICS](model_doc/idefics) | ✅ | ❌ | ❌ | +| [IDEFICS](model_doc/idefics) | ✅ | ✅ | ❌ | | [Idefics2](model_doc/idefics2) | ✅ | ❌ | ❌ | | [ImageGPT](model_doc/imagegpt) | ✅ | ❌ | ❌ | | [Informer](model_doc/informer) | ✅ | ❌ | ❌ | | [InstructBLIP](model_doc/instructblip) | ✅ | ❌ | ❌ | | [Jamba](model_doc/jamba) | ✅ | ❌ | ❌ | +| [JetMoe](model_doc/jetmoe) | ✅ | ❌ | ❌ | | [Jukebox](model_doc/jukebox) | ✅ | ❌ | ❌ | | [KOSMOS-2](model_doc/kosmos-2) | ✅ | ❌ | ❌ | | [LayoutLM](model_doc/layoutlm) | ✅ | ✅ | ❌ | @@ -200,7 +201,7 @@ Flax), PyTorch, and/or TensorFlow. | [Megatron-BERT](model_doc/megatron-bert) | ✅ | ❌ | ❌ | | [Megatron-GPT2](model_doc/megatron_gpt2) | ✅ | ✅ | ✅ | | [MGP-STR](model_doc/mgp-str) | ✅ | ❌ | ❌ | -| [Mistral](model_doc/mistral) | ✅ | ❌ | ✅ | +| [Mistral](model_doc/mistral) | ✅ | ✅ | ✅ | | [Mixtral](model_doc/mixtral) | ✅ | ❌ | ❌ | | [mLUKE](model_doc/mluke) | ✅ | ❌ | ❌ | | [MMS](model_doc/mms) | ✅ | ✅ | ✅ | @@ -230,6 +231,7 @@ Flax), PyTorch, and/or TensorFlow. | [OPT](model_doc/opt) | ✅ | ✅ | ✅ | | [OWL-ViT](model_doc/owlvit) | ✅ | ❌ | ❌ | | [OWLv2](model_doc/owlv2) | ✅ | ❌ | ❌ | +| [PaliGemma](model_doc/paligemma) | ✅ | ❌ | ❌ | | [PatchTSMixer](model_doc/patchtsmixer) | ✅ | ❌ | ❌ | | [PatchTST](model_doc/patchtst) | ✅ | ❌ | ❌ | | [Pegasus](model_doc/pegasus) | ✅ | ✅ | ✅ | @@ -303,6 +305,7 @@ Flax), PyTorch, and/or TensorFlow. 
| [UnivNet](model_doc/univnet) | ✅ | ❌ | ❌ | | [UPerNet](model_doc/upernet) | ✅ | ❌ | ❌ | | [VAN](model_doc/van) | ✅ | ❌ | ❌ | +| [VideoLlava](model_doc/video_llava) | ✅ | ❌ | ❌ | | [VideoMAE](model_doc/videomae) | ✅ | ❌ | ❌ | | [ViLT](model_doc/vilt) | ✅ | ❌ | ❌ | | [VipLlava](model_doc/vipllava) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/installation.md b/docs/source/en/installation.md index 7ece8eae44cabd..3ed4edf3d8ec5c 100644 --- a/docs/source/en/installation.md +++ b/docs/source/en/installation.md @@ -169,7 +169,7 @@ Pretrained models are downloaded and locally cached at: `~/.cache/huggingface/hu ## Offline mode -Run 🤗 Transformers in a firewalled or offline environment with locally cached files by setting the environment variable `TRANSFORMERS_OFFLINE=1`. +Run 🤗 Transformers in a firewalled or offline environment with locally cached files by setting the environment variable `HF_HUB_OFFLINE=1`. @@ -178,7 +178,7 @@ Add [🤗 Datasets](https://huggingface.co/docs/datasets/) to your offline train ```bash -HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \ +HF_DATASETS_OFFLINE=1 HF_HUB_OFFLINE=1 \ python examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small --dataset_name wmt16 --dataset_config ro-en ... ``` diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md index 19b80914c90b69..5bf8b5c4a0b36f 100644 --- a/docs/source/en/internal/generation_utils.md +++ b/docs/source/en/internal/generation_utils.md @@ -209,6 +209,10 @@ generation. [[autodoc]] WhisperTimeStampLogitsProcessor - __call__ +[[autodoc]] WatermarkLogitsProcessor + - __call__ + + ### TensorFlow [[autodoc]] TFForcedBOSTokenLogitsProcessor @@ -356,6 +360,12 @@ A [`Constraint`] can be used to force the generation to include specific tokens [[autodoc]] Cache - update +[[autodoc]] CacheConfig + - update + +[[autodoc]] QuantizedCacheConfig + - validate + [[autodoc]] DynamicCache - update - get_seq_length @@ -363,6 +373,14 @@ A [`Constraint`] can be used to force the generation to include specific tokens - to_legacy_cache - from_legacy_cache +[[autodoc]] QuantizedCache + - update + - get_seq_length + +[[autodoc]] QuantoQuantizedCache + +[[autodoc]] HQQQuantizedCache + [[autodoc]] SinkCache - update - get_seq_length @@ -371,4 +389,11 @@ A [`Constraint`] can be used to force the generation to include specific tokens [[autodoc]] StaticCache - update - get_seq_length - - reorder_cache + - reset + + +## Watermark Utils + +[[autodoc]] WatermarkDetector + - __call__ + diff --git a/docs/source/en/llm_optims.md b/docs/source/en/llm_optims.md index 4b44c1d78c81f0..5e49f0e1ebd3ab 100644 --- a/docs/source/en/llm_optims.md +++ b/docs/source/en/llm_optims.md @@ -29,7 +29,7 @@ To optimize this, you can use a kv-cache to store the past keys and values inste The *static kv-cache* solves this issue by pre-allocating the kv-cache size to a maximum value which allows you to combine it with torch.compile for up to a 4x speed up. > [!WARNING] -> Currently, only [Command R](./model_doc/cohere), [Gemma](./model_doc/gemma) and [Llama](./model_doc/llama2) models support static kv-cache and torch.compile. +> Currently, only [Llama](./model_doc/llama2) and a few other models support static kv-cache and torch.compile. Check [this issue](https://github.com/huggingface/transformers/issues/28981) for a live model compatibility list. For this example, let's load the [Gemma](https://hf.co/google/gemma-2b) model. 
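The llm_optims.md hunk above points readers at the static kv-cache and torch.compile combination. As a minimal, hedged sketch of that flow (the Gemma checkpoint and the compile flags are illustrative assumptions taken from the surrounding guide, not part of this patch), usage looks roughly like:

```python
# Hedged sketch of the static kv-cache + torch.compile flow referenced in llm_optims.md.
# Checkpoint name and compile options are assumptions for illustration only.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", torch_dtype=torch.float16, device_map="auto")

# Pre-allocate the kv-cache to a fixed size so the compiled graph sees static shapes.
model.generation_config.cache_implementation = "static"
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)

inputs = tokenizer("The theory of special relativity states ", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, do_sample=False, max_new_tokens=20)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```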
diff --git a/docs/source/en/main_classes/text_generation.md b/docs/source/en/main_classes/text_generation.md index dec524d257137f..e2c5ce9c0ba0ce 100644 --- a/docs/source/en/main_classes/text_generation.md +++ b/docs/source/en/main_classes/text_generation.md @@ -41,6 +41,8 @@ like token streaming. - validate - get_generation_mode +[[autodoc]] generation.WatermarkingConfig + ## GenerationMixin [[autodoc]] generation.GenerationMixin diff --git a/docs/source/en/model_doc/audio-spectrogram-transformer.md b/docs/source/en/model_doc/audio-spectrogram-transformer.md index 3eac3781667eb4..d83c3bbb6cf2fe 100644 --- a/docs/source/en/model_doc/audio-spectrogram-transformer.md +++ b/docs/source/en/model_doc/audio-spectrogram-transformer.md @@ -43,6 +43,34 @@ the authors compute the stats for a downstream dataset. - Note that the AST needs a low learning rate (the authors use a 10 times smaller learning rate compared to their CNN model proposed in the [PSLA paper](https://arxiv.org/abs/2102.01243)) and converges quickly, so please search for a suitable learning rate and learning rate scheduler for your task. +### Using Scaled Dot Product Attention (SDPA) + +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) +page for more information. + +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. + +``` +from transformers import ASTForAudioClassification +model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593", attn_implementation="sdpa", torch_dtype=torch.float16) +... +``` + +For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). + +On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `MIT/ast-finetuned-audioset-10-10-0.4593` model, we saw the following speedups during inference. + +| Batch size | Average inference time (ms), eager mode | Average inference time (ms), sdpa model | Speed up, Sdpa / Eager (x) | +|--------------|-------------------------------------------|-------------------------------------------|------------------------------| +| 1 | 27 | 6 | 4.5 | +| 2 | 12 | 6 | 2 | +| 4 | 21 | 8 | 2.62 | +| 8 | 40 | 14 | 2.86 | + ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with the Audio Spectrogram Transformer. diff --git a/docs/source/en/model_doc/blip.md b/docs/source/en/model_doc/blip.md index bc122c942a67a5..fa06191834f898 100644 --- a/docs/source/en/model_doc/blip.md +++ b/docs/source/en/model_doc/blip.md @@ -66,6 +66,8 @@ The original code can be found [here](https://github.com/salesforce/BLIP). ## BlipModel +`BlipModel` is going to be deprecated in future versions, please use `BlipForConditionalGeneration`, `BlipForImageTextRetrieval` or `BlipForQuestionAnswering` depending on your usecase. 
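To make the deprecation note above concrete, here is a minimal, hedged sketch of loading one of the suggested task-specific classes instead of `BlipModel` (the checkpoint and image URL are illustrative assumptions, not part of this patch):

```python
# Hedged sketch: image captioning with BlipForConditionalGeneration instead of BlipModel.
# The checkpoint name and image URL are assumptions for illustration only.
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(images=image, return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=20)
print(processor.decode(out[0], skip_special_tokens=True))
```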
+ [[autodoc]] BlipModel - forward - get_text_features diff --git a/docs/source/en/model_doc/code_llama.md b/docs/source/en/model_doc/code_llama.md index cd32a38f5a6ac9..a0e7f6366bb924 100644 --- a/docs/source/en/model_doc/code_llama.md +++ b/docs/source/en/model_doc/code_llama.md @@ -24,7 +24,7 @@ The abstract from the paper is the following: *We release Code Llama, a family of large language models for code based on Llama 2 providing state-of-the-art performance among open models, infilling capabilities, support for large input contexts, and zero-shot instruction following ability for programming tasks. We provide multiple flavors to cover a wide range of applications: foundation models (Code Llama), Python specializations (Code Llama - Python), and instruction-following models (Code Llama - Instruct) with 7B, 13B and 34B parameters each. All models are trained on sequences of 16k tokens and show improvements on inputs with up to 100k tokens. 7B and 13B Code Llama and Code Llama - Instruct variants support infilling based on surrounding content. Code Llama reaches state-of-the-art performance among open models on several code benchmarks, with scores of up to 53% and 55% on HumanEval and MBPP, respectively. Notably, Code Llama - Python 7B outperforms Llama 2 70B on HumanEval and MBPP, and all our models outperform every other publicly available model on MultiPL-E. We release Code Llama under a permissive license that allows for both research and commercial use.* -Check out all Code Llama model checkpoints [here](https://huggingface.co/models?search=code_llama) and the officially released ones in the [codellama org](https://huggingface.co/codellama). +Check out all Code Llama model checkpoints [here](https://huggingface.co/models?search=code_llama) and the officially released ones in the [Meta Llama org](https://huggingface.co/meta-llama). This model was contributed by [ArthurZucker](https://huggingface.co/ArthurZ). The original code of the authors can be found [here](https://github.com/facebookresearch/llama). @@ -62,8 +62,8 @@ After conversion, the model and tokenizer can be loaded via: ```python >>> from transformers import LlamaForCausalLM, CodeLlamaTokenizer ->>> tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf") ->>> model = LlamaForCausalLM.from_pretrained("codellama/CodeLlama-7b-hf") +>>> tokenizer = CodeLlamaTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf") +>>> model = LlamaForCausalLM.from_pretrained("meta-llama/CodeLlama-7b-hf") >>> PROMPT = '''def remove_non_ascii(s: str) -> str: ... """ ... return result @@ -95,7 +95,7 @@ If you only want the infilled part: >>> from transformers import pipeline >>> import torch ->>> generator = pipeline("text-generation",model="codellama/CodeLlama-7b-hf",torch_dtype=torch.float16, device_map="auto") +>>> generator = pipeline("text-generation",model="meta-llama/CodeLlama-7b-hf",torch_dtype=torch.float16, device_map="auto") >>> generator('def remove_non_ascii(s: str) -> str:\n """ \n return result', max_new_tokens = 128) [{'generated_text': 'def remove_non_ascii(s: str) -> str:\n """ \n return resultRemove non-ASCII characters from a string. 
"""\n result = ""\n for c in s:\n if ord(c) < 128:\n result += c'}] ``` diff --git a/docs/source/en/model_doc/dbrx.md b/docs/source/en/model_doc/dbrx.md index d60a4926eb1853..fb53742d054162 100644 --- a/docs/source/en/model_doc/dbrx.md +++ b/docs/source/en/model_doc/dbrx.md @@ -31,8 +31,7 @@ We used curriculum learning for pretraining, changing the data mix during traini More detailed information about DBRX Instruct and DBRX Base can be found in our [technical blog post](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm). - -This model was contributed by [eitan-turok](https://huggingface.co/eitanturok) and [abhi-db](https://huggingface.co/abhi-db). The original code can be found [here](https://github.com/databricks/dbrx), though this may not be up to date. +This model was contributed by [eitan-turok](https://huggingface.co/eitanturok) and [abhi-db](https://huggingface.co/abhi-db). The original code can be found [here](https://github.com/databricks/dbrx-instruct), though this may not be up to date. ## Usage Examples diff --git a/docs/source/en/model_doc/deit.md b/docs/source/en/model_doc/deit.md index 7d9918a45eeeb6..6a4e141facaeac 100644 --- a/docs/source/en/model_doc/deit.md +++ b/docs/source/en/model_doc/deit.md @@ -68,6 +68,34 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). The Tenso *facebook/deit-base-patch16-384*. Note that one should use [`DeiTImageProcessor`] in order to prepare images for the model. +### Using Scaled Dot Product Attention (SDPA) + +PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function +encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the +[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) +or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention) +page for more information. + +SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set +`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. + +``` +from transformers import DeiTForImageClassification +model = DeiTForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224", attn_implementation="sdpa", torch_dtype=torch.float16) +... +``` + +For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`). + +On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `facebook/deit-base-distilled-patch16-224` model, we saw the following speedups during inference. + +| Batch size | Average inference time (ms), eager mode | Average inference time (ms), sdpa model | Speed up, Sdpa / Eager (x) | +|--------------|-------------------------------------------|-------------------------------------------|------------------------------| +| 1 | 8 | 6 | 1.33 | +| 2 | 9 | 6 | 1.5 | +| 4 | 9 | 6 | 1.5 | +| 8 | 8 | 6 | 1.33 | + ## Resources A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DeiT. diff --git a/docs/source/en/model_doc/deta.md b/docs/source/en/model_doc/deta.md index cdda22af7bbf97..996142bc59d6b5 100644 --- a/docs/source/en/model_doc/deta.md +++ b/docs/source/en/model_doc/deta.md @@ -16,6 +16,14 @@ rendered properly in your Markdown viewer. 
# DETA + + +This model is in maintenance mode only, we don't accept any new PRs changing its code. +If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2. +You can do so by running the following command: `pip install -U transformers==4.40.2`. + + + ## Overview The DETA model was proposed in [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl. diff --git a/docs/source/en/model_doc/efficientformer.md b/docs/source/en/model_doc/efficientformer.md index 92ba90a9e5ed97..24b20793b03c9b 100644 --- a/docs/source/en/model_doc/efficientformer.md +++ b/docs/source/en/model_doc/efficientformer.md @@ -16,28 +16,36 @@ rendered properly in your Markdown viewer. # EfficientFormer + + +This model is in maintenance mode only, we don't accept any new PRs changing its code. +If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2. +You can do so by running the following command: `pip install -U transformers==4.40.2`. + + + ## Overview -The EfficientFormer model was proposed in [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) +The EfficientFormer model was proposed in [EfficientFormer: Vision Transformers at MobileNet Speed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Eric Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren. EfficientFormer proposes a dimension-consistent pure transformer that can be run on mobile devices for dense prediction tasks like image classification, object detection and semantic segmentation. The abstract from the paper is the following: -*Vision Transformers (ViT) have shown rapid progress in computer vision tasks, achieving promising results on various benchmarks. -However, due to the massive number of parameters and model design, e.g., attention mechanism, ViT-based models are generally -times slower than lightweight convolutional networks. Therefore, the deployment of ViT for real-time applications is particularly -challenging, especially on resource-constrained hardware such as mobile devices. Recent efforts try to reduce the computation -complexity of ViT through network architecture search or hybrid design with MobileNet block, yet the inference speed is still -unsatisfactory. This leads to an important question: can transformers run as fast as MobileNet while obtaining high performance? -To answer this, we first revisit the network architecture and operators used in ViT-based models and identify inefficient designs. -Then we introduce a dimension-consistent pure transformer (without MobileNet blocks) as a design paradigm. -Finally, we perform latency-driven slimming to get a series of final models dubbed EfficientFormer. -Extensive experiments show the superiority of EfficientFormer in performance and speed on mobile devices. -Our fastest model, EfficientFormer-L1, achieves 79.2% top-1 accuracy on ImageNet-1K with only 1.6 ms inference latency on -iPhone 12 (compiled with CoreML), which { runs as fast as MobileNetV2×1.4 (1.6 ms, 74.7% top-1),} and our largest model, -EfficientFormer-L7, obtains 83.3% accuracy with only 7.0 ms latency. Our work proves that properly designed transformers can +*Vision Transformers (ViT) have shown rapid progress in computer vision tasks, achieving promising results on various benchmarks. 
+However, due to the massive number of parameters and model design, e.g., attention mechanism, ViT-based models are generally +times slower than lightweight convolutional networks. Therefore, the deployment of ViT for real-time applications is particularly +challenging, especially on resource-constrained hardware such as mobile devices. Recent efforts try to reduce the computation +complexity of ViT through network architecture search or hybrid design with MobileNet block, yet the inference speed is still +unsatisfactory. This leads to an important question: can transformers run as fast as MobileNet while obtaining high performance? +To answer this, we first revisit the network architecture and operators used in ViT-based models and identify inefficient designs. +Then we introduce a dimension-consistent pure transformer (without MobileNet blocks) as a design paradigm. +Finally, we perform latency-driven slimming to get a series of final models dubbed EfficientFormer. +Extensive experiments show the superiority of EfficientFormer in performance and speed on mobile devices. +Our fastest model, EfficientFormer-L1, achieves 79.2% top-1 accuracy on ImageNet-1K with only 1.6 ms inference latency on +iPhone 12 (compiled with CoreML), which { runs as fast as MobileNetV2×1.4 (1.6 ms, 74.7% top-1),} and our largest model, +EfficientFormer-L7, obtains 83.3% accuracy with only 7.0 ms latency. Our work proves that properly designed transformers can reach extremely low latency on mobile devices while maintaining high performance.* This model was contributed by [novice03](https://huggingface.co/novice03) and [Bearnardd](https://huggingface.co/Bearnardd). @@ -93,4 +101,4 @@ The original code can be found [here](https://github.com/snap-research/Efficient - call - \ No newline at end of file + diff --git a/docs/source/en/model_doc/ernie_m.md b/docs/source/en/model_doc/ernie_m.md index a99332cb655ac5..85254693501c80 100644 --- a/docs/source/en/model_doc/ernie_m.md +++ b/docs/source/en/model_doc/ernie_m.md @@ -16,6 +16,14 @@ rendered properly in your Markdown viewer. # ErnieM + + +This model is in maintenance mode only, we don't accept any new PRs changing its code. +If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2. +You can do so by running the following command: `pip install -U transformers==4.40.2`. + + + ## Overview The ErnieM model was proposed in [ERNIE-M: Enhanced Multilingual Representation by Aligning diff --git a/docs/source/en/model_doc/gemma.md b/docs/source/en/model_doc/gemma.md index f55995b6d85b6a..abd077af8da170 100644 --- a/docs/source/en/model_doc/gemma.md +++ b/docs/source/en/model_doc/gemma.md @@ -60,6 +60,11 @@ This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ), [ [[autodoc]] GemmaForSequenceClassification - forward +## GemmaForTokenClassification + +[[autodoc]] GemmaForTokenClassification + - forward + ## FlaxGemmaModel [[autodoc]] FlaxGemmaModel diff --git a/docs/source/en/model_doc/gptsan-japanese.md b/docs/source/en/model_doc/gptsan-japanese.md index 1e6b1b6e1cf6d7..108e59048d5d52 100644 --- a/docs/source/en/model_doc/gptsan-japanese.md +++ b/docs/source/en/model_doc/gptsan-japanese.md @@ -16,6 +16,14 @@ rendered properly in your Markdown viewer. # GPTSAN-japanese + + +This model is in maintenance mode only, we don't accept any new PRs changing its code. +If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2. 
+You can do so by running the following command: `pip install -U transformers==4.40.2`. + + + ## Overview The GPTSAN-japanese model was released in the repository by Toshiyuki Sakamoto (tanreinama). diff --git a/docs/source/en/model_doc/graphormer.md b/docs/source/en/model_doc/graphormer.md index 08e3f5fb3e9b5a..d01bf04debf9dd 100644 --- a/docs/source/en/model_doc/graphormer.md +++ b/docs/source/en/model_doc/graphormer.md @@ -1,7 +1,7 @@ + +# JetMoe + +## Overview + +**JetMoe-8B** is an 8B Mixture-of-Experts (MoE) language model developed by [Yikang Shen](https://scholar.google.com.hk/citations?user=qff5rRYAAAAJ) and [MyShell](https://myshell.ai/). +JetMoe project aims to provide a LLaMA2-level performance and efficient language model with a limited budget. +To achieve this goal, JetMoe uses a sparsely activated architecture inspired by the [ModuleFormer](https://arxiv.org/abs/2306.04640). +Each JetMoe block consists of two MoE layers: Mixture of Attention Heads and Mixture of MLP Experts. +Given the input tokens, it activates a subset of its experts to process them. +This sparse activation schema enables JetMoe to achieve much better training throughput than similar size dense models. +The training throughput of JetMoe-8B is around 100B tokens per day on a cluster of 96 H100 GPUs with a straightforward 3-way pipeline parallelism strategy. + +This model was contributed by [Yikang Shen](https://huggingface.co/YikangS). + + +## JetMoeConfig + +[[autodoc]] JetMoeConfig + +## JetMoeModel + +[[autodoc]] JetMoeModel + - forward + +## JetMoeForCausalLM + +[[autodoc]] JetMoeForCausalLM + - forward + +## JetMoeForSequenceClassification + +[[autodoc]] JetMoeForSequenceClassification + - forward diff --git a/docs/source/en/model_doc/jukebox.md b/docs/source/en/model_doc/jukebox.md index 578a8a91dd02ea..12f273b71e972c 100644 --- a/docs/source/en/model_doc/jukebox.md +++ b/docs/source/en/model_doc/jukebox.md @@ -15,6 +15,14 @@ rendered properly in your Markdown viewer. --> # Jukebox + + +This model is in maintenance mode only, we don't accept any new PRs changing its code. +If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2. +You can do so by running the following command: `pip install -U transformers==4.40.2`. + + + ## Overview The Jukebox model was proposed in [Jukebox: A generative model for music](https://arxiv.org/pdf/2005.00341.pdf) @@ -27,7 +35,7 @@ The abstract from the paper is the following: *We introduce Jukebox, a model that generates music with singing in the raw audio domain. We tackle the long context of raw audio using a multiscale VQ-VAE to compress it to discrete codes, and modeling those using autoregressive Transformers. We show that the combined model at scale can generate high-fidelity and diverse songs with coherence up to multiple minutes. We can condition on artist and genre to steer the musical and vocal style, and on unaligned lyrics to make the singing more controllable. We are releasing thousands of non cherry-picked samples, along with model weights and code.* As shown on the following figure, Jukebox is made of 3 `priors` which are decoder only models. They follow the architecture described in [Generating Long Sequences with Sparse Transformers](https://arxiv.org/abs/1904.10509), modified to support longer context length. -First, a autoencoder is used to encode the text lyrics. Next, the first (also called `top_prior`) prior attends to the last hidden states extracted from the lyrics encoder. 
The priors are linked to the previous priors respectively via an `AudioConditioner` module. The`AudioConditioner` upsamples the outputs of the previous prior to raw tokens at a certain audio frame per second resolution. +First, a autoencoder is used to encode the text lyrics. Next, the first (also called `top_prior`) prior attends to the last hidden states extracted from the lyrics encoder. The priors are linked to the previous priors respectively via an `AudioConditioner` module. The`AudioConditioner` upsamples the outputs of the previous prior to raw tokens at a certain audio frame per second resolution. The metadata such as *artist, genre and timing* are passed to each prior, in the form of a start token and positional embedding for the timing data. The hidden states are mapped to the closest codebook vector from the VQVAE in order to convert them to raw audio. ![JukeboxModel](https://gist.githubusercontent.com/ArthurZucker/92c1acaae62ebf1b6a951710bdd8b6af/raw/c9c517bf4eff61393f6c7dec9366ef02bdd059a3/jukebox.svg) diff --git a/docs/source/en/model_doc/llama.md b/docs/source/en/model_doc/llama.md index 915d5ecc70b554..2f0eb63da00a84 100644 --- a/docs/source/en/model_doc/llama.md +++ b/docs/source/en/model_doc/llama.md @@ -121,6 +121,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] LlamaForQuestionAnswering - forward +## LlamaForTokenClassification + +[[autodoc]] LlamaForTokenClassification + - forward + ## FlaxLlamaModel [[autodoc]] FlaxLlamaModel diff --git a/docs/source/en/model_doc/llama3.md b/docs/source/en/model_doc/llama3.md index d8c682c80cd194..067d2e9ba934d5 100644 --- a/docs/source/en/model_doc/llama3.md +++ b/docs/source/en/model_doc/llama3.md @@ -82,4 +82,4 @@ pipeline("Hey how are you doing today?") ``` ## Resources -A ton of cool resources are already available on the documentation page of [~llama2], inviting contributors to add new resourses curated for Llama3 here! 🤗 +A ton of cool resources are already available on the documentation page of [~llama2], inviting contributors to add new resources curated for Llama3 here! 🤗 diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md index a2a3913fcad7b8..a4a1419ee00ac8 100644 --- a/docs/source/en/model_doc/llava_next.md +++ b/docs/source/en/model_doc/llava_next.md @@ -68,6 +68,8 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/ ## Usage example +### Single image inference + Here's how to load the model and perform inference in half-precision (`torch.float16`): ```python @@ -94,6 +96,45 @@ output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True)) ``` +### Multi image inference + +LLaVa-Next can perform inference with multiple images as input, where images either belong to the same prompt or different prompts (in batched inference). 
Here is how you can do it:
+
+```python
+import requests
+from PIL import Image
+import torch
+from transformers import AutoProcessor, LlavaNextForConditionalGeneration
+
+# Load the model in half-precision
+model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto")
+processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
+
+# Get three different images
+url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+image_stop = Image.open(requests.get(url, stream=True).raw)
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image_cats = Image.open(requests.get(url, stream=True).raw)
+
+url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
+image_snowman = Image.open(requests.get(url, stream=True).raw)
+
+# Prepare a batched prompt, where the first one is a multi-turn conversation and the second is not
+prompt = [
+    "[INST] <image>\nWhat is shown in this image? [/INST] There is a red stop sign in the image. [INST] <image>\nWhat about this image? How many cats do you see? [/INST]",
+    "[INST] <image>\nWhat is shown in this image? [/INST]"
+]
+
+# We can simply feed images in the order they have to be used in the text prompt
+# Each "<image>" token uses one image, leaving the next for the subsequent "<image>" tokens
+inputs = processor(text=prompt, images=[image_stop, image_cats, image_snowman], padding=True, return_tensors="pt").to(model.device)
+
+# Generate
+generate_ids = model.generate(**inputs, max_new_tokens=30)
+processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+```
+
 ## Model optimization
 
 ### Quantization using Bitsandbytes
diff --git a/docs/source/en/model_doc/mask2former.md b/docs/source/en/model_doc/mask2former.md
index bd5ab80728eb48..4faeed50311f69 100644
--- a/docs/source/en/model_doc/mask2former.md
+++ b/docs/source/en/model_doc/mask2former.md
@@ -41,6 +41,7 @@ This model was contributed by [Shivalika Singh](https://huggingface.co/shivi) an
 A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Mask2Former.
 
 - Demo notebooks regarding inference + fine-tuning Mask2Former on custom data can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/Mask2Former).
+- Scripts for finetuning [`Mask2Former`] with [`Trainer`] or [Accelerate](https://huggingface.co/docs/accelerate/index) can be found [here](https://github.com/huggingface/transformers/tree/main/examples/pytorch/instance-segmentation).
 
 If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we will review it. The resource should ideally demonstrate something new instead of duplicating an existing resource.
 
diff --git a/docs/source/en/model_doc/maskformer.md b/docs/source/en/model_doc/maskformer.md
index 4d31b2829d10f2..a0199f380ce647 100644
--- a/docs/source/en/model_doc/maskformer.md
+++ b/docs/source/en/model_doc/maskformer.md
@@ -51,6 +51,7 @@ This model was contributed by [francesco](https://huggingface.co/francesco). The
 
 - All notebooks that illustrate inference as well as fine-tuning on custom data with MaskFormer can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/MaskFormer).
+- Scripts for finetuning [`MaskFormer`] with [`Trainer`] or [Accelerate](https://huggingface.co/docs/accelerate/index) can be found [here](https://github.com/huggingface/transformers/tree/main/examples/pytorch/instance-segmentation). ## MaskFormer specific outputs diff --git a/docs/source/en/model_doc/mega.md b/docs/source/en/model_doc/mega.md index 4ce62ca45a1d74..5545f5e19c47e3 100644 --- a/docs/source/en/model_doc/mega.md +++ b/docs/source/en/model_doc/mega.md @@ -16,12 +16,20 @@ rendered properly in your Markdown viewer. # MEGA + + +This model is in maintenance mode only, we don't accept any new PRs changing its code. +If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2. +You can do so by running the following command: `pip install -U transformers==4.40.2`. + + + ## Overview The MEGA model was proposed in [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer. -MEGA proposes a new approach to self-attention with each encoder layer having a multi-headed exponential moving average in addition to a single head of standard dot-product attention, giving the attention mechanism -stronger positional biases. This allows MEGA to perform competitively to Transformers on standard benchmarks including LRA -while also having significantly fewer parameters. MEGA's compute efficiency allows it to scale to very long sequences, making it an +MEGA proposes a new approach to self-attention with each encoder layer having a multi-headed exponential moving average in addition to a single head of standard dot-product attention, giving the attention mechanism +stronger positional biases. This allows MEGA to perform competitively to Transformers on standard benchmarks including LRA +while also having significantly fewer parameters. MEGA's compute efficiency allows it to scale to very long sequences, making it an attractive option for long-document NLP tasks. The abstract from the paper is the following: @@ -34,8 +42,8 @@ The original code can be found [here](https://github.com/facebookresearch/mega). ## Usage tips -- MEGA can perform quite well with relatively few parameters. See Appendix D in the MEGA paper for examples of architectural specs which perform well in various settings. If using MEGA as a decoder, be sure to set `bidirectional=False` to avoid errors with default bidirectional. -- Mega-chunk is a variant of mega that reduces time and spaces complexity from quadratic to linear. Utilize chunking with MegaConfig.use_chunking and control chunk size with MegaConfig.chunk_size +- MEGA can perform quite well with relatively few parameters. See Appendix D in the MEGA paper for examples of architectural specs which perform well in various settings. If using MEGA as a decoder, be sure to set `bidirectional=False` to avoid errors with default bidirectional. +- Mega-chunk is a variant of mega that reduces time and spaces complexity from quadratic to linear. 
Utilize chunking with MegaConfig.use_chunking and control chunk size with MegaConfig.chunk_size ## Implementation Notes diff --git a/docs/source/en/model_doc/mistral.md b/docs/source/en/model_doc/mistral.md index 0ab214206165f1..17ce15b2b8c9b9 100644 --- a/docs/source/en/model_doc/mistral.md +++ b/docs/source/en/model_doc/mistral.md @@ -203,6 +203,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] MistralForSequenceClassification - forward +## MistralForTokenClassification + +[[autodoc]] MistralForTokenClassification + - forward + ## FlaxMistralModel [[autodoc]] FlaxMistralModel @@ -211,4 +216,19 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h ## FlaxMistralForCausalLM [[autodoc]] FlaxMistralForCausalLM - - __call__ \ No newline at end of file + - __call__ + +## TFMistralModel + +[[autodoc]] TFMistralModel + - call + +## TFMistralForCausalLM + +[[autodoc]] TFMistralForCausalLM + - call + +## TFMistralForSequenceClassification + +[[autodoc]] TFMistralForSequenceClassification + - call \ No newline at end of file diff --git a/docs/source/en/model_doc/mixtral.md b/docs/source/en/model_doc/mixtral.md index 942b040c3f2fd5..b93acdec581525 100644 --- a/docs/source/en/model_doc/mixtral.md +++ b/docs/source/en/model_doc/mixtral.md @@ -204,3 +204,8 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h [[autodoc]] MixtralForSequenceClassification - forward + +## MixtralForTokenClassification + +[[autodoc]] MixtralForTokenClassification + - forward diff --git a/docs/source/en/model_doc/nat.md b/docs/source/en/model_doc/nat.md index ecb61ccb0a3397..02c2e466cc4a7b 100644 --- a/docs/source/en/model_doc/nat.md +++ b/docs/source/en/model_doc/nat.md @@ -16,6 +16,14 @@ rendered properly in your Markdown viewer. # Neighborhood Attention Transformer + + +This model is in maintenance mode only, we don't accept any new PRs changing its code. +If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2. +You can do so by running the following command: `pip install -U transformers==4.40.2`. + + + ## Overview NAT was proposed in [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) diff --git a/docs/source/en/model_doc/nezha.md b/docs/source/en/model_doc/nezha.md index 872f576f1286eb..976722592cad22 100644 --- a/docs/source/en/model_doc/nezha.md +++ b/docs/source/en/model_doc/nezha.md @@ -16,6 +16,14 @@ rendered properly in your Markdown viewer. # Nezha + + +This model is in maintenance mode only, we don't accept any new PRs changing its code. +If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2. +You can do so by running the following command: `pip install -U transformers==4.40.2`. + + + ## Overview The Nezha model was proposed in [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei et al. @@ -25,8 +33,8 @@ The abstract from the paper is the following: *The pre-trained language models have achieved great successes in various natural language understanding (NLU) tasks due to its capacity to capture the deep contextualized information in text by pre-training on large-scale corpora. 
In this technical report, we present our practice of pre-training language models named NEZHA (NEural contextualiZed -representation for CHinese lAnguage understanding) on Chinese corpora and finetuning for the Chinese NLU tasks. -The current version of NEZHA is based on BERT with a collection of proven improvements, which include Functional +representation for CHinese lAnguage understanding) on Chinese corpora and finetuning for the Chinese NLU tasks. +The current version of NEZHA is based on BERT with a collection of proven improvements, which include Functional Relative Positional Encoding as an effective positional encoding scheme, Whole Word Masking strategy, Mixed Precision Training and the LAMB Optimizer in training the models. The experimental results show that NEZHA achieves the state-of-the-art performances when finetuned on several representative Chinese tasks, including @@ -85,4 +93,4 @@ This model was contributed by [sijunhe](https://huggingface.co/sijunhe). The ori ## NezhaForQuestionAnswering [[autodoc]] NezhaForQuestionAnswering - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/owlv2.md b/docs/source/en/model_doc/owlv2.md index 75fab0853a9778..1b4e92bc4eb110 100644 --- a/docs/source/en/model_doc/owlv2.md +++ b/docs/source/en/model_doc/owlv2.md @@ -64,8 +64,8 @@ OWLv2 is, just like its predecessor [OWL-ViT](owlvit), a zero-shot text-conditio >>> for box, score, label in zip(boxes, scores, labels): ... box = [round(i, 2) for i in box.tolist()] ... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") -Detected a photo of a cat with confidence 0.614 at location [341.67, 17.54, 642.32, 278.51] -Detected a photo of a cat with confidence 0.665 at location [6.75, 38.97, 326.62, 354.85] +Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35] +Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13] ``` ## Resources diff --git a/docs/source/en/model_doc/paligemma.md b/docs/source/en/model_doc/paligemma.md new file mode 100644 index 00000000000000..48debe593f97a9 --- /dev/null +++ b/docs/source/en/model_doc/paligemma.md @@ -0,0 +1,78 @@ + + +# PaliGemma + +## Overview + +The PaliGemma model was proposed in [PaliGemma – Google's Cutting-Edge Open Vision Language Model](https://huggingface.co/blog/paligemma) by Google. It is a 3B vision-language model composed by a [SigLIP](siglip) vision encoder and a [Gemma](gemma) language decoder linked by a multimodal linear projection. It cuts an image into a fixed number of VIT tokens and prepends it to an optional prompt. One particularity is that the model uses full block attention on all the image tokens plus the input text tokens. It comes in 3 resolutions, 224x224, 448x448 and 896x896 with 3 base models, with 55 fine-tuned versions for different tasks, and 2 mix models. + + + + PaliGemma architecture. Taken from the blog post. + +This model was contributed by [Molbap](https://huggingface.co/Molbap). + +## Usage tips + +Inference with PaliGemma can be performed as follows: + +```python +from transformers import AutoProcessor, PaliGemmaForConditionalGeneration + +model_id = "google/paligemma-3b-mix-224" +model = PaliGemmaForConditionalGeneration.from_pretrained(model_id) +processor = AutoProcessor.from_pretrained(model_id) + +prompt = "What is on the flower?" 
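+
+# `Image` and `requests` are used below to fetch and open the input image but are not
+# imported above; adding them here keeps the snippet runnable as-is.
+from PIL import Image
+import requests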
+image_file = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg?download=true" +raw_image = Image.open(requests.get(image_file, stream=True).raw) +inputs = processor(prompt, raw_image, return_tensors="pt") +output = model.generate(**inputs, max_new_tokens=20) + +print(processor.decode(output[0], skip_special_tokens=True)[len(prompt):]) +``` + +- PaliGemma is not meant for conversational use, and it works best when fine-tuning to a specific use case. Some downstream tasks on which PaliGemma can be fine-tuned include image captioning, visual question answering (VQA), object detection, referring expression segmentation and document understanding. +- One can use `PaliGemmaProcessor` to prepare images, text and optional labels for the model. When fine-tuning a PaliGemma model, the `suffix` argument can be passed to the processor which creates the `labels` for the model: + +```python +prompt = "What is on the flower?" +answer = "a bee" +inputs = processor(text=prompt, images=raw_image, suffix=answer, return_tensors="pt") +``` + +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with PaliGemma. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +- A blog post introducing all the features of PaliGemma can be found [here](https://huggingface.co/blog/paligemma). +- Demo notebooks on how to fine-tune PaliGemma for VQA with the Trainer API along with inference can be found [here](https://github.com/huggingface/notebooks/tree/main/examples/paligemma). +- Demo notebooks on how to fine-tune PaliGemma on a custom dataset (receipt image -> JSON) along with inference can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/PaliGemma). 🌎 + +## PaliGemmaConfig + +[[autodoc]] PaliGemmaConfig + +## PaliGemmaProcessor + +[[autodoc]] PaliGemmaProcessor + +## PaliGemmaForConditionalGeneration + +[[autodoc]] PaliGemmaForConditionalGeneration + - forward diff --git a/docs/source/en/model_doc/persimmon.md b/docs/source/en/model_doc/persimmon.md index fe9e66a0b7175e..7a105ac5543d60 100644 --- a/docs/source/en/model_doc/persimmon.md +++ b/docs/source/en/model_doc/persimmon.md @@ -96,3 +96,8 @@ The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece. T [[autodoc]] PersimmonForSequenceClassification - forward + +## PersimmonForTokenClassification + +[[autodoc]] PersimmonForTokenClassification + - forward diff --git a/docs/source/en/model_doc/qdqbert.md b/docs/source/en/model_doc/qdqbert.md index 19b829d0bc5d19..ca718f34af4a32 100644 --- a/docs/source/en/model_doc/qdqbert.md +++ b/docs/source/en/model_doc/qdqbert.md @@ -16,6 +16,14 @@ rendered properly in your Markdown viewer. # QDQBERT + + +This model is in maintenance mode only, we don't accept any new PRs changing its code. +If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2. +You can do so by running the following command: `pip install -U transformers==4.40.2`. 
+ + + ## Overview The QDQBERT model can be referenced in [Integer Quantization for Deep Learning Inference: Principles and Empirical diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md index 5f9e5dba22b844..ac0e25e02c35f9 100644 --- a/docs/source/en/model_doc/qwen2.md +++ b/docs/source/en/model_doc/qwen2.md @@ -80,3 +80,8 @@ In the following, we demonstrate how to use `Qwen2-7B-Chat-beta` for the inferen [[autodoc]] Qwen2ForSequenceClassification - forward + +## Qwen2ForTokenClassification + +[[autodoc]] Qwen2ForTokenClassification + - forward diff --git a/docs/source/en/model_doc/qwen2_moe.md b/docs/source/en/model_doc/qwen2_moe.md index 8a546c4016ad5e..9c6dc80beb61e5 100644 --- a/docs/source/en/model_doc/qwen2_moe.md +++ b/docs/source/en/model_doc/qwen2_moe.md @@ -75,3 +75,8 @@ In the following, we demonstrate how to use `Qwen1.5-MoE-A2.7B-Chat` for the inf [[autodoc]] Qwen2MoeForSequenceClassification - forward + +## Qwen2MoeForTokenClassification + +[[autodoc]] Qwen2MoeForTokenClassification + - forward diff --git a/docs/source/en/model_doc/realm.md b/docs/source/en/model_doc/realm.md index a8227bc83c7318..558e83c08b06a6 100644 --- a/docs/source/en/model_doc/realm.md +++ b/docs/source/en/model_doc/realm.md @@ -16,6 +16,14 @@ rendered properly in your Markdown viewer. # REALM + + +This model is in maintenance mode only, we don't accept any new PRs changing its code. +If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2. +You can do so by running the following command: `pip install -U transformers==4.40.2`. + + + ## Overview The REALM model was proposed in [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. It's a @@ -86,4 +94,4 @@ This model was contributed by [qqaatw](https://huggingface.co/qqaatw). The origi [[autodoc]] RealmForOpenQA - block_embedding_to - - forward \ No newline at end of file + - forward diff --git a/docs/source/en/model_doc/sam.md b/docs/source/en/model_doc/sam.md index 2fc06193a774aa..12a87eb5bc8514 100644 --- a/docs/source/en/model_doc/sam.md +++ b/docs/source/en/model_doc/sam.md @@ -81,10 +81,10 @@ processor = SamProcessor.from_pretrained("facebook/sam-vit-huge") img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png" raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") mask_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png" -segmentation_map = Image.open(requests.get(mask_url, stream=True).raw).convert("RGB") +segmentation_map = Image.open(requests.get(mask_url, stream=True).raw).convert("1") input_points = [[[450, 600]]] # 2D location of a window in the image -inputs = processor(raw_image, input_points=input_points, segmentation_maps=mask, return_tensors="pt").to(device) +inputs = processor(raw_image, input_points=input_points, segmentation_maps=segmentation_map, return_tensors="pt").to(device) with torch.no_grad(): outputs = model(**inputs) diff --git a/docs/source/en/model_doc/speech_to_text_2.md b/docs/source/en/model_doc/speech_to_text_2.md index 6648e67f629d3c..fc2d0357c546c7 100644 --- a/docs/source/en/model_doc/speech_to_text_2.md +++ b/docs/source/en/model_doc/speech_to_text_2.md @@ -16,6 +16,14 @@ rendered properly in your Markdown viewer. # Speech2Text2 + + + This model is in maintenance mode only, we don't accept any new PRs changing its code. 
+ If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2. + You can do so by running the following command: `pip install -U transformers==4.40.2`. + + + ## Overview The Speech2Text2 model is used together with [Wav2Vec2](wav2vec2) for Speech Translation models proposed in diff --git a/docs/source/en/model_doc/stablelm.md b/docs/source/en/model_doc/stablelm.md index 6a50995ca086e8..09c0e5855c3a1d 100644 --- a/docs/source/en/model_doc/stablelm.md +++ b/docs/source/en/model_doc/stablelm.md @@ -104,3 +104,8 @@ Now, to run the model with Flash Attention 2, refer to the snippet below: [[autodoc]] StableLmForSequenceClassification - forward + +## StableLmForTokenClassification + +[[autodoc]] StableLmForTokenClassification + - forward diff --git a/docs/source/en/model_doc/starcoder2.md b/docs/source/en/model_doc/starcoder2.md index 9e2e547b8c3eae..1d107b3855564a 100644 --- a/docs/source/en/model_doc/starcoder2.md +++ b/docs/source/en/model_doc/starcoder2.md @@ -66,3 +66,8 @@ These ready-to-use checkpoints can be downloaded and used via the HuggingFace Hu [[autodoc]] Starcoder2ForSequenceClassification - forward + +## Starcoder2ForTokenClassification + +[[autodoc]] Starcoder2ForTokenClassification + - forward diff --git a/docs/source/en/model_doc/superpoint.md b/docs/source/en/model_doc/superpoint.md index 56e28622bde9ff..b9aab2f1b929f2 100644 --- a/docs/source/en/model_doc/superpoint.md +++ b/docs/source/en/model_doc/superpoint.md @@ -38,12 +38,17 @@ to repeatedly detect a much richer set of interest points than the initial pre-a traditional corner detector. The final system gives rise to state-of-the-art homography estimation results on HPatches when compared to LIFT, SIFT and ORB.* -## How to use + + + SuperPoint overview. Taken from the original paper. + +## Usage tips Here is a quick example of using the model to detect interest points in an image: ```python -from transformers import AutoImageProcessor, AutoModel +from transformers import AutoImageProcessor, SuperPointForKeypointDetection import torch from PIL import Image import requests @@ -52,7 +57,7 @@ url = "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(url, stream=True).raw) processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint") -model = AutoModel.from_pretrained("magic-leap-community/superpoint") +model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint") inputs = processor(image, return_tensors="pt") outputs = model(**inputs) @@ -64,7 +69,7 @@ You can also feed multiple images to the model. Due to the nature of SuperPoint, you will need to use the mask attribute to retrieve the respective information : ```python -from transformers import AutoImageProcessor, AutoModel +from transformers import AutoImageProcessor, SuperPointForKeypointDetection import torch from PIL import Image import requests @@ -77,7 +82,7 @@ image_2 = Image.open(requests.get(url_image_2, stream=True).raw) images = [image_1, image_2] processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint") -model = AutoModel.from_pretrained("magic-leap-community/superpoint") +model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint") inputs = processor(images, return_tensors="pt") outputs = model(**inputs) @@ -103,6 +108,12 @@ cv2.imwrite("output_image.png", image) This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille). 
The original code can be found [here](https://github.com/magicleap/SuperPointPretrainedNetwork).
 
+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SuperPoint. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
+- A notebook showcasing inference and visualization with SuperPoint can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SuperPoint/Inference_with_SuperPoint_to_detect_interest_points_in_an_image.ipynb). 🌎
+
 ## SuperPointConfig
 
 [[autodoc]] SuperPointConfig
 
diff --git a/docs/source/en/model_doc/tvlt.md b/docs/source/en/model_doc/tvlt.md
index f09ea8af863c9a..0a0f50e4731569 100644
--- a/docs/source/en/model_doc/tvlt.md
+++ b/docs/source/en/model_doc/tvlt.md
@@ -16,6 +16,14 @@ rendered properly in your Markdown viewer.
 
 # TVLT
 
+<Tip warning={true}>
+
+This model is in maintenance mode only, we don't accept any new PRs changing its code.
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
+You can do so by running the following command: `pip install -U transformers==4.40.2`.
+
+</Tip>
+
 ## Overview
 
 The TVLT model was proposed in [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156)
@@ -60,7 +68,7 @@ The original code can be found [here](https://github.com/zinengtang/TVLT). This
 
 [[autodoc]] TvltFeatureExtractor
     - __call__
-    
+
 ## TvltModel
 
 [[autodoc]] TvltModel
diff --git a/docs/source/en/model_doc/video_llava.md b/docs/source/en/model_doc/video_llava.md
new file mode 100644
index 00000000000000..307c55bb2cef63
--- /dev/null
+++ b/docs/source/en/model_doc/video_llava.md
@@ -0,0 +1,199 @@
+
+
+# Video-LLaVA
+
+## Overview
+
+Video-LLaVA is an open-source multimodal LLM trained by fine-tuning LLaMA/Vicuna on multimodal instruction-following data generated by LLaVA-1.5 and VideoChat. It is an auto-regressive language model, based on the transformer architecture. Video-LLaVA unifies visual representations in the language feature space, enabling an LLM to perform visual reasoning on both images and videos simultaneously.
+
+
+The Video-LLaVA model was proposed in [Video-LLaVA: Learning United Visual Representation by Alignment Before Projection](https://arxiv.org/abs/2311.10122) by Bin Lin, Yang Ye, Bin Zhu, Jiaxi Cui, Munan Ning, Peng Jin, Li Yuan.
+
+The abstract from the paper is the following:
+
+*The Large Vision-Language Model (LVLM) has enhanced the performance of various downstream tasks in
+visual-language understanding. Most existing approaches
+encode images and videos into separate feature spaces,
+which are then fed as inputs to large language models.
+However, due to the lack of unified tokenization for images and videos, namely misalignment before projection, it
+becomes challenging for a Large Language Model (LLM)
+to learn multi-modal interactions from several poor projection layers. In this work, we unify visual representation into the language feature space to advance the foundational LLM towards a unified LVLM. As a result, we establish a simple but robust LVLM baseline, Video-LLaVA,
+which learns from a mixed dataset of images and videos,
+mutually enhancing each other.
Video-LLaVA achieves superior performances on a broad range of 9 image benchmarks across 5 image question-answering datasets and 4 +image benchmark toolkits. Additionally, our Video-LLaVA +also outperforms Video-ChatGPT by 5.8%, 9.9%, 18.6%, +and 10.1% on MSRVTT, MSVD, TGIF, and ActivityNet, respectively. Notably, extensive experiments demonstrate that +Video-LLaVA mutually benefits images and videos within +a unified visual representation, outperforming models designed specifically for images or videos. We aim for this +work to provide modest insights into the multi-modal inputs +for the LLM* + +## Usage tips: + +- We advise users to use padding_side="left" when computing batched generation as it leads to more accurate results. Simply make sure to call processor.tokenizer.padding_side = "left" before generating. + +- Note the model has not been explicitly trained to process multiple images/videos in the same prompt, although this is technically possible, you may experience inaccurate results. + +- Note that the video inputs should have exactly 8 frames at the input, since the models were trained in that setting. + +This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay). +The original code can be found [here](https://github.com/PKU-YuanGroup/Video-LLaVA). + + +## Usage example + +### Single Media Mode + +The model can accept both images and videos as input. Here's an example code for inference in half-precision (`torch.float16`): + +```python +import av +import torch +import numpy as np +from transformers import VideoLlavaForConditionalGeneration, VideoLlavaProcessor + +def read_video_pyav(container, indices): + ''' + Decode the video with PyAV decoder. + Args: + container (`av.container.input.InputContainer`): PyAV container. + indices (`List[int]`): List of frame indices to decode. + Returns: + result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3). + ''' + frames = [] + container.seek(0) + start_index = indices[0] + end_index = indices[-1] + for i, frame in enumerate(container.decode(video=0)): + if i > end_index: + break + if i >= start_index and i in indices: + frames.append(frame) + return np.stack([x.to_ndarray(format="rgb24") for x in frames]) + +# Load the model in half-precision +model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-LLaVA-7B-hf", torch_dtype=torch.float16, device_map="auto") +processor = VideoLlavaProcessor.from_pretrained("LanguageBind/Video-LLaVA-7B-hf") + +# Load the video as an np.arrau, sampling uniformly 8 frames +video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset") +container = av.open(video_path) +total_frames = container.streams.video[0].frames +indices = np.arange(0, total_frames, total_frames / 8).astype(int) +video = read_video_pyav(container, indices) + +# For better results, we recommend to prompt the model in the following format +prompt = "USER: