fix: add cors_allow_origin to cli (#162)
OlivierDehaene authored Feb 22, 2024
1 parent 9d35f82 commit 00a17ea
Showing 64 changed files with 176 additions and 118 deletions.
2 changes: 1 addition & 1 deletion .cargo/config.toml
@@ -5,4 +5,4 @@ rustflags = ["-C", "target-cpu=native"]
rustflags = ["-C", "target-feature=+simd128"]

[target.x86_64-apple-darwin]
rustflags = ["-C", "target-feature=-avx,-avx2"]
rustflags = ["-C", "target-feature=-avx,-avx2"]
10 changes: 5 additions & 5 deletions .github/ISSUE_TEMPLATE/bug-report.yml
@@ -5,14 +5,14 @@ body:
id: system-info
attributes:
label: System Info
-description: |
+description: |
Please share your system info with us (`text-generation-launcher --env` if installed locally).
-The full command line used that causes issues:
+The full command line used that causes issues:
OS version:
Rust version (if self-compiling, `cargo version`):
Model being used (`curl 127.0.0.1:8080/info | jq`):
If local model please explicit the kind of model and/or equivalents.
-Hardware used (GPUs, how many, on which cloud) (`nvidia-smi`):
+Hardware used (GPUs, how many, on which cloud) (`nvidia-smi`):
Deployment specificities (Kubernetes, EKS, AKS, any particular deployments):
The current version being used:
@@ -52,11 +52,11 @@ body:
placeholder: |
Steps to reproduce the behavior:
1.
2.
3.
- type: textarea
id: expected-behavior
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/feature-request.yml
@@ -19,7 +19,7 @@ body:
label: Motivation
description: |
Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too.
- type: textarea
id: contribution
1 change: 0 additions & 1 deletion .github/workflows/build_86.yaml
@@ -131,4 +131,3 @@
tags: ${{ steps.meta-86-grpc.outputs.tags }}
labels: ${{ steps.meta-86-grpc.outputs.labels }}
cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-86,mode=max

1 change: 0 additions & 1 deletion .github/workflows/build_89.yaml
@@ -131,4 +131,3 @@
tags: ${{ steps.meta-89-grpc.outputs.tags }}
labels: ${{ steps.meta-89-grpc.outputs.labels }}
cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-89,mode=max

3 changes: 0 additions & 3 deletions .github/workflows/build_90.yaml
@@ -131,6 +131,3 @@
tags: ${{ steps.meta-90-grpc.outputs.tags }}
labels: ${{ steps.meta-90-grpc.outputs.labels }}
cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-90,mode=max



1 change: 0 additions & 1 deletion .github/workflows/build_cpu.yaml
@@ -129,4 +129,3 @@
tags: ${{ steps.meta-cpu-grpc.outputs.tags }}
labels: ${{ steps.meta-cpu-grpc.outputs.labels }}
cache-from: type=registry,ref=registry.internal.huggingface.tech/api-inference/text-embeddings-inference:cache-cpu,mode=max

2 changes: 1 addition & 1 deletion .github/workflows/build_documentation.yml
@@ -19,4 +19,4 @@ jobs:
languages: en
secrets:
token: ${{ secrets.HUGGINGFACE_PUSH }}
-hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
+hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
2 changes: 1 addition & 1 deletion .github/workflows/build_pr_documentation.yml
@@ -17,4 +17,4 @@ jobs:
pr_number: ${{ github.event.number }}
package: text-embeddings-inference
additional_args: --not_python_module
-languages: en
+languages: en
68 changes: 68 additions & 0 deletions .github/workflows/liniting.yaml
@@ -0,0 +1,68 @@
name: Linting Tests

on:
  pull_request:

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  run_tests:
    runs-on: ubuntu-latest

    env:
      SCCACHE_GHA_ENABLED: "on"
      RUSTC_WRAPPER: /usr/local/bin/sccache
      SCCACHE: 0.3.3

    steps:
      - uses: actions/checkout@v2
      - name: Install Rust
        uses: actions-rs/toolchain@v1
        with:
          # Released on: 28 December, 2023
          # Branched from master on: 10 November, 2023
          # https://releases.rs/docs/1.75.0/
          toolchain: 1.75.0
          override: true
          components: rustfmt, clippy
      - name: Install Protoc
        uses: arduino/setup-protoc@v1
      - name: Clean unused files
        run: |
          sudo rm -rf /usr/local/lib/android # will release about 10 GB if you don't need Android
          sudo rm -rf /usr/share/dotnet # will release about 20GB if you don't need .NET
      - name: Install sccache
        run: |
          curl -fsSL https://github.com/mozilla/sccache/releases/download/v$SCCACHE/sccache-v$SCCACHE-x86_64-unknown-linux-musl.tar.gz | tar -xzv --strip-components=1 -C /usr/local/bin sccache-v$SCCACHE-x86_64-unknown-linux-musl/sccache
          chmod +x /usr/local/bin/sccache
      - name: configure sccache
        uses: actions/github-script@v6
        with:
          script: |
            core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || '');
            core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
            core.exportVariable('SCCACHE_GHA_CACHE_TO', 'sccache-${{runner.os}}-${{github.ref_name}}');
            core.exportVariable('SCCACHE_GHA_CACHE_FROM', 'sccache-${{runner.os}}-main,sccache-${{runner.os}}-');
      - name: cargo registry cache
        uses: actions/cache@v3
        with:
          key: cargo-${{ runner.os }}-${{ hashFiles('**/Cargo.toml') }}-${{ github.sha }}
          restore-keys: |
            cargo-${{ runner.os }}-${{ hashFiles('**/Cargo.toml') }}-
            cargo-${{ runner.os }}-
          path: |
            ~/.cargo/registry
            ~/.cargo/git
      - name: Build
        run: |
          cargo build
      - name: Pre-commit checks
        run: |
          pip install pre-commit
          pre-commit install
          pre-commit run --all-files
      - name: sccache stats
        run: |
          /usr/local/bin/sccache --show-stats
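To reproduce what this workflow checks locally before pushing, a minimal sketch (assuming Rust 1.75 with rustfmt and clippy, protoc, and a Python environment with pip are already available):

```shell
# Same toolchain components as the CI job
rustup toolchain install 1.75.0 --component rustfmt --component clippy

# Warm the target directory so the cargo-check and clippy hooks run faster
cargo build

# Run the same hooks as the "Pre-commit checks" step
pip install pre-commit
pre-commit install
pre-commit run --all-files
```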
2 changes: 1 addition & 1 deletion .github/workflows/upload_pr_documentation.yml
@@ -13,4 +13,4 @@ jobs:
package_name: text-embeddings-inference
secrets:
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
-comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
+comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
14 changes: 14 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,14 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: check-yaml
      - id: end-of-file-fixer
      - id: trailing-whitespace
        exclude: docs/source/basic_tutorials/launcher.md
  - repo: https://github.com/doublify/pre-commit-rust
    rev: v1.0
    hooks:
      - id: fmt
      - id: cargo-check
      - id: clippy
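As a usage sketch, hooks from this config can also be run individually by id (the ids are the ones defined above):

```shell
# Only the Rust formatting hook
pre-commit run fmt --all-files

# Only the whitespace fixers
pre-commit run trailing-whitespace --all-files
pre-commit run end-of-file-fixer --all-files
```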
2 changes: 1 addition & 1 deletion Dockerfile
@@ -1,4 +1,4 @@
-FROM lukemathwalker/cargo-chef:latest-rust-1.73-bookworm AS chef
+FROM lukemathwalker/cargo-chef:latest-rust-1.75-bookworm AS chef
WORKDIR /usr/src

ENV SCCACHE=0.5.4
1 change: 0 additions & 1 deletion Makefile
@@ -9,4 +9,3 @@ integration-tests-review:

cuda-integration-tests-review:
cargo insta test --review --features "text-embeddings-backend-candle/cuda text-embeddings-backend-candle/flash-attn text-embeddings-router/candle-cuda" --release

24 changes: 12 additions & 12 deletions README.md
@@ -67,7 +67,7 @@ with absolute positions in `text-embeddings-inference`.

Examples of supported models:

-| MTEB Rank | Model Type | Model ID |
+| MTEB Rank | Model Type | Model ID |
|-----------|-------------|----------------------------------------------------------------------------------------|
| 1 | Bert | [BAAI/bge-large-en-v1.5](https://hf.co/BAAI/bge-large-en-v1.5) |
| 2 | | [BAAI/bge-base-en-v1.5](https://hf.co/BAAI/bge-base-en-v1.5) |
@@ -130,21 +130,21 @@ Usage: text-embeddings-router [OPTIONS]
Options:
--model-id <MODEL_ID>
-The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `thenlper/gte-base`.
-Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of
+The name of the model to load. Can be a MODEL_ID as listed on <https://hf.co/models> like `thenlper/gte-base`.
+Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of
transformers
[env: MODEL_ID=]
[default: thenlper/gte-base]
--revision <REVISION>
-The actual revision of the model if you're referring to a model on the hub. You can use a specific commit id
+The actual revision of the model if you're referring to a model on the hub. You can use a specific commit id
or a branch like `refs/pr/2`
[env: REVISION=]
--tokenization-workers <TOKENIZATION_WORKERS>
-Optionally control the number of tokenizer workers used for payload tokenization, validation and truncation.
+Optionally control the number of tokenizer workers used for payload tokenization, validation and truncation.
Default to the number of CPU cores on the machine
[env: TOKENIZATION_WORKERS=]
@@ -158,7 +158,7 @@ Options:
--pooling <POOLING>
Optionally control the pooling method for embedding models.
-If `pooling` is not set, the pooling configuration will be parsed from the model `1_Pooling/config.json`
+If `pooling` is not set, the pooling configuration will be parsed from the model `1_Pooling/config.json`
configuration.
If `pooling` is set, it will override the model pooling configuration
@@ -167,8 +167,8 @@ Options:
[possible values: cls, mean]
--max-concurrent-requests <MAX_CONCURRENT_REQUESTS>
-The maximum amount of concurrent requests for this particular deployment.
-Having a low limit will refuse clients requests instead of having them wait for too long and is usually good
+The maximum amount of concurrent requests for this particular deployment.
+Having a low limit will refuse clients requests instead of having them wait for too long and is usually good
to handle backpressure correctly
[env: MAX_CONCURRENT_REQUESTS=]
@@ -181,7 +181,7 @@ Options:
For `max_batch_tokens=1000`, you could fit `10` queries of `total_tokens=100` or a single query of `1000` tokens.
-Overall this number should be the largest possible until the model is compute bound. Since the actual memory
+Overall this number should be the largest possible until the model is compute bound. Since the actual memory
overhead depends on the model implementation, text-embeddings-inference cannot infer this number automatically.
[env: MAX_BATCH_TOKENS=]
@@ -216,14 +216,14 @@ Options:
[default: 3000]
--uds-path <UDS_PATH>
-The name of the unix socket some text-embeddings-inference backends will use as they communicate internally
+The name of the unix socket some text-embeddings-inference backends will use as they communicate internally
with gRPC
[env: UDS_PATH=]
[default: /tmp/text-embeddings-inference-server]
--huggingface-hub-cache <HUGGINGFACE_HUB_CACHE>
-The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk
+The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk
for instance
[env: HUGGINGFACE_HUB_CACHE=/data]
@@ -321,7 +321,7 @@ You can also use classic Sequence Classification models like `SamLowe/roberta-base-go_emotions`
model=SamLowe/roberta-base-go_emotions
volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run

-docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.6 --model-id $model
+docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:0.6 --model-id $model
```

Once you have deployed the model you can use the `predict` endpoint to get the emotions most associated with an input:
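A minimal request sketch for that endpoint, assuming the container started above is listening on port 8080 (the input string is illustrative):

```shell
curl 127.0.0.1:8080/predict \
    -X POST \
    -d '{"inputs":"I like you."}' \
    -H 'Content-Type: application/json'
```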
1 change: 0 additions & 1 deletion backends/candle/Cargo.toml
@@ -44,4 +44,3 @@ cuda = ["candle/cuda", "candle-nn/cuda", "dep:candle-cublaslt", "dep:candle-laye
flash-attn-v1 = ["dep:candle-flash-attn-v1", "cuda"]
flash-attn = ["dep:candle-flash-attn", "cuda"]
static-linking = ["candle-cublaslt?/static-linking"]

2 changes: 1 addition & 1 deletion backends/candle/src/models/bert.rs
@@ -389,7 +389,7 @@ impl BertClassificationHead {
impl ClassificationHead for BertClassificationHead {
    fn forward(&self, hidden_states: &Tensor) -> Result<Tensor> {
        let _enter = self.span.enter();
-       let hidden_states = self.output.forward(&hidden_states)?;
+       let hidden_states = self.output.forward(hidden_states)?;
        Ok(hidden_states)
    }
}
@@ -86,4 +86,3 @@ expression: predictions_batch
- -6.64832
- -7.4060283
- 3.046496

@@ -30,4 +30,3 @@ expression: predictions_single
- -6.6477957
- -7.406438
- 3.0466576

1 change: 0 additions & 1 deletion backends/candle/tests/snapshots/test_bert__mini_batch.snap
@@ -1154,4 +1154,3 @@ expression: embeddings_batch
- 0.54164237
- 0.28229737
- 0.27705735

@@ -1154,4 +1154,3 @@ expression: pooled_embeddings_batch
- 0.13335083
- -0.58064204
- 0.059797622

@@ -8451,4 +8451,3 @@ expression: raw_embeddings_batch
- 0.8651217
- 0.25980273
- 0.19818383

@@ -386,4 +386,3 @@ expression: embeddings_single
- 0.54166216
- 0.28228587
- 0.27675694

@@ -387,4 +387,3 @@ expression: embeddings_single
- 0.13335083
- -0.58064204
- 0.059797622

@@ -2690,4 +2690,3 @@ expression: embeddings_single
- 0.8651217
- 0.25980273
- 0.19818383

@@ -86,4 +86,3 @@ expression: predictions_batch
- -6.6484375
- -7.40625
- 3.046875

@@ -30,4 +30,3 @@ expression: predictions_single
- -6.6484375
- -7.40625
- 3.046875

@@ -1154,4 +1154,3 @@ expression: embeddings_batch
- 0.5415039
- 0.28173828
- 0.2770996

@@ -1154,4 +1154,3 @@ expression: pooled_embeddings_batch
- 0.13317871
- -0.5805664
- 0.05984497

@@ -8451,4 +8451,3 @@ expression: raw_embeddings_batch
- 0.8647461
- 0.25854492
- 0.19787598

@@ -386,4 +386,3 @@ expression: embeddings_single
- 0.5415039
- 0.28295898
- 0.2763672

@@ -387,4 +387,3 @@ expression: embeddings_single
- 0.13330078
- -0.5800781
- 0.059539795

@@ -2691,4 +2691,3 @@ expression: embeddings_single
- 0.8642578
- 0.25952148
- 0.19750977

@@ -1538,4 +1538,3 @@ expression: embeddings_batch
- 0.089175105
- 0.37251982
- 0.009899339

@@ -515,4 +515,3 @@ expression: embeddings_single
- 0.08928977
- 0.37260336
- 0.010012831

1 change: 0 additions & 1 deletion backends/candle/tests/snapshots/test_jina__jina_batch.snap
@@ -1538,4 +1538,3 @@ expression: embeddings_batch
- 0.089175105
- 0.37251982
- 0.009899339

@@ -514,4 +514,3 @@ expression: embeddings_single
- 0.08928977
- 0.37260336
- 0.010012831

2 changes: 1 addition & 1 deletion backends/grpc-client/src/pb/.gitignore
@@ -1 +1 @@
-*.rs
+*.rs