From 7b85d8c041dcd8d961b51c4d578a1da5639b712b Mon Sep 17 00:00:00 2001 From: OlivierDehaene <23298448+OlivierDehaene@users.noreply.github.com> Date: Thu, 29 Feb 2024 15:46:07 +0100 Subject: [PATCH] fix: docs --- README.md | 22 +++++++++++----------- docs/source/en/private_models.md | 2 +- docs/source/en/quick_tour.md | 6 +++--- docs/source/en/supported_models.md | 12 ++++++------ 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index 001a1e6f..4036de85 100644 --- a/README.md +++ b/README.md @@ -97,7 +97,7 @@ model=BAAI/bge-large-en-v1.5 revision=refs/pr/5 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run -docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.0 --model-id $model --revision $revision +docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.1 --model-id $model --revision $revision ``` And then you can make requests like @@ -245,13 +245,13 @@ Text Embeddings Inference ships with multiple Docker images that you can use to | Architecture | Image | |-------------------------------------|-------------------------------------------------------------------------| -| CPU | ghcr.io/huggingface/text-embeddings-inference:cpu-1.0 | +| CPU | ghcr.io/huggingface/text-embeddings-inference:cpu-1.1 | | Volta | NOT SUPPORTED | -| Turing (T4, RTX 2000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:turing-1.0 (experimental) | -| Ampere 80 (A100, A30) | ghcr.io/huggingface/text-embeddings-inference:1.0 | -| Ampere 86 (A10, A40, ...) | ghcr.io/huggingface/text-embeddings-inference:86-1.0 | -| Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-1.0 | -| Hopper (H100) | ghcr.io/huggingface/text-embeddings-inference:hopper-1.0 (experimental) | +| Turing (T4, RTX 2000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:turing-1.1 (experimental) | +| Ampere 80 (A100, A30) | ghcr.io/huggingface/text-embeddings-inference:1.1 | +| Ampere 86 (A10, A40, ...) | ghcr.io/huggingface/text-embeddings-inference:86-1.1 | +| Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-1.1 | +| Hopper (H100) | ghcr.io/huggingface/text-embeddings-inference:hopper-1.1 (experimental) | **Warning**: Flash Attention is turned off by default for the Turing image as it suffers from precision issues. You can turn Flash Attention v1 ON by using the `USE_FLASH_ATTENTION=True` environment variable. @@ -280,7 +280,7 @@ model= volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run token= -docker run --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.0 --model-id $model +docker run --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.1 --model-id $model ``` ### Using Re-rankers models @@ -298,7 +298,7 @@ model=BAAI/bge-reranker-large revision=refs/pr/4 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run -docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.0 --model-id $model --revision $revision +docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.1 --model-id $model --revision $revision ``` And then you can rank the similarity between a query and a list of texts with: @@ -318,7 +318,7 @@ You can also use classic Sequence Classification models like `SamLowe/roberta-ba model=SamLowe/roberta-base-go_emotions volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run -docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.0 --model-id $model +docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.1 --model-id $model ``` Once you have deployed the model you can use the `predict` endpoint to get the emotions most associated with an input: @@ -347,7 +347,7 @@ model=BAAI/bge-large-en-v1.5 revision=refs/pr/5 volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run -docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.0-grpc --model-id $model --revision $revision +docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.1-grpc --model-id $model --revision $revision ``` ```shell diff --git a/docs/source/en/private_models.md b/docs/source/en/private_models.md index ae3e4049..f8aeb6a7 100644 --- a/docs/source/en/private_models.md +++ b/docs/source/en/private_models.md @@ -37,5 +37,5 @@ model= volume=$PWD/data token= -docker run --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.0 --model-id $model +docker run --gpus all -e HUGGING_FACE_HUB_TOKEN=$token -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.1 --model-id $model ``` diff --git a/docs/source/en/quick_tour.md b/docs/source/en/quick_tour.md index 1ad13541..b91456d2 100644 --- a/docs/source/en/quick_tour.md +++ b/docs/source/en/quick_tour.md @@ -34,7 +34,7 @@ model=BAAI/bge-large-en-v1.5 revision=refs/pr/5 volume=$PWD/data -docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.0 --model-id $model --revision $revision +docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.1 --model-id $model --revision $revision ``` @@ -69,7 +69,7 @@ model=BAAI/bge-reranker-large revision=refs/pr/4 volume=$PWD/data -docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.0 --model-id $model --revision $revision +docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.1 --model-id $model --revision $revision ``` Once you have deployed a model you can use the `rerank` endpoint to rank the similarity between a query and a list @@ -90,7 +90,7 @@ You can also use classic Sequence Classification models like `SamLowe/roberta-ba model=SamLowe/roberta-base-go_emotions volume=$PWD/data -docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.0 --model-id $model +docker run --gpus all -p 8080:80 -v $volume:/data --pull always ghcr.io/huggingface/text-embeddings-inference:1.1 --model-id $model ``` Once you have deployed the model you can use the `predict` endpoint to get the emotions most associated with an input: diff --git a/docs/source/en/supported_models.md b/docs/source/en/supported_models.md index 9240ca80..232c6bf4 100644 --- a/docs/source/en/supported_models.md +++ b/docs/source/en/supported_models.md @@ -63,13 +63,13 @@ Find the appropriate Docker image for your hardware in the following table: | Architecture | Image | |-------------------------------------|--------------------------------------------------------------------------| -| CPU | ghcr.io/huggingface/text-embeddings-inference:cpu-1.0 | +| CPU | ghcr.io/huggingface/text-embeddings-inference:cpu-1.1 | | Volta | NOT SUPPORTED | -| Turing (T4, RTX 2000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:turing-1.0 (experimental) | -| Ampere 80 (A100, A30) | ghcr.io/huggingface/text-embeddings-inference:1.0 | -| Ampere 86 (A10, A40, ...) | ghcr.io/huggingface/text-embeddings-inference:86-1.0 | -| Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-1.0 | -| Hopper (H100) | ghcr.io/huggingface/text-embeddings-inference:hopper-1.0 (experimental) | +| Turing (T4, RTX 2000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:turing-1.1 (experimental) | +| Ampere 80 (A100, A30) | ghcr.io/huggingface/text-embeddings-inference:1.1 | +| Ampere 86 (A10, A40, ...) | ghcr.io/huggingface/text-embeddings-inference:86-1.1 | +| Ada Lovelace (RTX 4000 series, ...) | ghcr.io/huggingface/text-embeddings-inference:89-1.1 | +| Hopper (H100) | ghcr.io/huggingface/text-embeddings-inference:hopper-1.1 (experimental) | **Warning**: Flash Attention is turned off by default for the Turing image as it suffers from precision issues. You can turn Flash Attention v1 ON by using the `USE_FLASH_ATTENTION=True` environment variable.