diff --git a/README.md b/README.md
index cc75d4a6..5fd00bbb 100644
--- a/README.md
+++ b/README.md
@@ -107,6 +107,23 @@ search_result = client.query(
 print(search_result)
 ```
 
+FastEmbed can also utilize the GPU for faster embeddings. To enable GPU support, install:
+```bash
+pip install 'qdrant-client[fastembed-gpu]'
+```
+
+```python
+from qdrant_client import QdrantClient
+
+# Initialize the client
+client = QdrantClient(":memory:") # or QdrantClient(path="path/to/db")
+client.set_model(client.DEFAULT_EMBEDDING_MODEL, providers=["CUDAExecutionProvider", "CPUExecutionProvider"])
+```
+
+> Note: `fastembed-gpu` and `fastembed` are mutually exclusive. You can only install one of them.
+>
+> If you previously installed `fastembed`, you might need to start from a fresh environment to install `fastembed-gpu`.
+
 ## Connect to Qdrant server
 
 To connect to Qdrant server, simply specify host and port:
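The README example passes bare provider names. The `OnnxProvider` type imported in the client code below also allows `(name, options)` pairs, which onnxruntime sessions accept alongside plain strings. A minimal sketch under that assumption; the `device_id` option is illustrative and comes from the CUDA Execution Provider configuration page linked in the new docstrings:

```python
from qdrant_client import QdrantClient

client = QdrantClient(":memory:")

# Pin the embedding model to the second GPU; "CPUExecutionProvider" stays
# last as a fallback for machines where CUDA is not available.
client.set_model(
    client.DEFAULT_EMBEDDING_MODEL,
    providers=[
        ("CUDAExecutionProvider", {"device_id": 1}),  # (name, options) pair
        "CPUExecutionProvider",
    ],
)
```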
"sha256:b2cd54f2b0a05e6bc9ab30182b859364d30115a19c31be24aa2edef40be00277"}, + {file = "onnxruntime_gpu-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:bdffcced8a5f6275c0df202220e9232138b336f868cd671c9d2c571e834d2a80"}, + {file = "onnxruntime_gpu-1.17.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a1c871e8d0ae4121ea6528fc9410a5a7cbc5e43714b30521d5514fd10b987c83"}, + {file = "onnxruntime_gpu-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:9a0a94eda080e9f4a8e5035fdf0b3c24f5533e7861d88833a94493e63fca0812"}, + {file = "onnxruntime_gpu-1.17.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:624fdb65a632833f13de36854855818680be4f77942d8114524491d58f60d3ab"}, + {file = "onnxruntime_gpu-1.17.1-cp38-cp38-win_amd64.whl", hash = "sha256:29fa78d232bbb5a5be3a3e0a022148a7b3df2ca66b4c21a11eef56e6f22859e9"}, + {file = "onnxruntime_gpu-1.17.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:b0f8c70f2f9aeae825f3a397cc0c5f45124f9ae7c173263cf13c495982b0b99a"}, + {file = "onnxruntime_gpu-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:b1a27a104334461b690e4fc62775e1e71c68936399874932225d7fea21a0c261"}, +] + +[package.dependencies] +coloredlogs = "*" +flatbuffers = "*" +numpy = ">=1.21.6" +packaging = "*" +protobuf = "*" +sympy = "*" + [[package]] name = "packaging" version = "24.0" @@ -3035,8 +3085,9 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [extras] fastembed = ["fastembed"] +fastembed-gpu = ["fastembed-gpu"] [metadata] lock-version = "2.0" python-versions = ">=3.8" -content-hash = "d9f322012dac6de2c4f537befddade3c059b8f2dcd9420ff742450739da4dc03" +content-hash = "5a438b3f01f44b4b86d95f7c71e9bf3cae3a67316dc27cb657f4539e08570bb3" diff --git a/pyproject.toml b/pyproject.toml index 5686be50..3a634038 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,10 @@ grpcio-tools = ">=1.41.0" urllib3 = ">=1.26.14,<3" portalocker = "^2.7.0" fastembed = [ - { version = "0.2.6", optional = true, python = "<3.13" } + { version = "0.2.7", optional = true, python = "<3.13" } +] +fastembed-gpu = [ + { version = "0.2.7", optional = true, python = "<3.13" } ] [tool.poetry.group.dev.dependencies] @@ -58,6 +61,7 @@ types-protobuf = "^4.21.0.5" [tool.poetry.extras] fastembed = ["fastembed"] +fastembed-gpu = ["fastembed-gpu"] [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/qdrant_client/async_qdrant_fastembed.py b/qdrant_client/async_qdrant_fastembed.py index 888d8e8a..89b48d8b 100644 --- a/qdrant_client/async_qdrant_fastembed.py +++ b/qdrant_client/async_qdrant_fastembed.py @@ -12,7 +12,7 @@ import uuid import warnings from itertools import tee -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union from qdrant_client.async_client_base import AsyncQdrantBase from qdrant_client.conversions import common_types as types @@ -21,11 +21,12 @@ from qdrant_client.hybrid.fusion import reciprocal_rank_fusion try: - from fastembed import TextEmbedding - from fastembed.sparse.sparse_text_embedding import SparseTextEmbedding + from fastembed import SparseTextEmbedding, TextEmbedding + from fastembed.common import OnnxProvider except ImportError: TextEmbedding = None SparseTextEmbedding = None + OnnxProvider = None SUPPORTED_EMBEDDING_MODELS: Dict[str, Tuple[int, models.Distance]] = ( { model["model"]: (model["dim"], models.Distance.COSINE) @@ -51,8 +52,7 @@ def __init__(self, **kwargs: Any): self._embedding_model_name: Optional[str] = None 
diff --git a/qdrant_client/async_qdrant_fastembed.py b/qdrant_client/async_qdrant_fastembed.py
index 888d8e8a..89b48d8b 100644
--- a/qdrant_client/async_qdrant_fastembed.py
+++ b/qdrant_client/async_qdrant_fastembed.py
@@ -12,7 +12,7 @@ import uuid
 import warnings
 from itertools import tee
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union
 
 from qdrant_client.async_client_base import AsyncQdrantBase
 from qdrant_client.conversions import common_types as types
@@ -21,11 +21,12 @@ from qdrant_client.hybrid.fusion import reciprocal_rank_fusion
 
 try:
-    from fastembed import TextEmbedding
-    from fastembed.sparse.sparse_text_embedding import SparseTextEmbedding
+    from fastembed import SparseTextEmbedding, TextEmbedding
+    from fastembed.common import OnnxProvider
 except ImportError:
     TextEmbedding = None
     SparseTextEmbedding = None
+    OnnxProvider = None
 
 SUPPORTED_EMBEDDING_MODELS: Dict[str, Tuple[int, models.Distance]] = (
     {
         model["model"]: (model["dim"], models.Distance.COSINE)
@@ -51,8 +52,7 @@ def __init__(self, **kwargs: Any):
         self._embedding_model_name: Optional[str] = None
         self._sparse_embedding_model_name: Optional[str] = None
         try:
-            from fastembed import TextEmbedding
-            from fastembed.sparse.sparse_text_embedding import SparseTextEmbedding
+            from fastembed import SparseTextEmbedding, TextEmbedding
 
             self.__class__._FASTEMBED_INSTALLED = True
         except ImportError:
@@ -75,6 +75,7 @@ def set_model(
         max_length: Optional[int] = None,
         cache_dir: Optional[str] = None,
         threads: Optional[int] = None,
+        providers: Optional[Sequence["OnnxProvider"]] = None,
         **kwargs: Any,
     ) -> None:
         """
@@ -86,6 +87,9 @@ def set_model(
                 Can be set using the `FASTEMBED_CACHE_PATH` env variable.
                 Defaults to `fastembed_cache` in the system's temp directory.
             threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
+            providers: The list of onnx providers (with or without options) to use. Defaults to None.
+                Example configuration:
+                https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#configuration-options
         Raises:
             ValueError: If embedding model is not supported.
             ImportError: If fastembed is not installed.
@@ -100,7 +104,11 @@ def set_model(
                 stacklevel=2,
             )
         self._get_or_init_model(
-            model_name=embedding_model_name, cache_dir=cache_dir, threads=threads, **kwargs
+            model_name=embedding_model_name,
+            cache_dir=cache_dir,
+            threads=threads,
+            providers=providers,
+            **kwargs,
         )
         self._embedding_model_name = embedding_model_name
@@ -109,6 +117,7 @@ def set_sparse_model(
         embedding_model_name: Optional[str],
         cache_dir: Optional[str] = None,
         threads: Optional[int] = None,
+        providers: Optional[Sequence["OnnxProvider"]] = None,
     ) -> None:
         """
         Set sparse embedding model to use for hybrid search over documents in combination with dense embeddings.
@@ -119,6 +128,9 @@ def set_sparse_model(
                 Can be set using the `FASTEMBED_CACHE_PATH` env variable.
                 Defaults to `fastembed_cache` in the system's temp directory.
             threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
+            providers: The list of onnx providers (with or without options) to use. Defaults to None.
+                Example configuration:
+                https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#configuration-options
         Raises:
             ValueError: If embedding model is not supported.
             ImportError: If fastembed is not installed.
@@ -128,7 +140,10 @@
         """
         if embedding_model_name is not None:
             self._get_or_init_sparse_model(
-                model_name=embedding_model_name, cache_dir=cache_dir, threads=threads
+                model_name=embedding_model_name,
+                cache_dir=cache_dir,
+                threads=threads,
+                providers=providers,
             )
         self._sparse_embedding_model_name = embedding_model_name
@@ -155,6 +170,7 @@ def _get_or_init_model(
         model_name: str,
         cache_dir: Optional[str] = None,
         threads: Optional[int] = None,
+        providers: Optional[Sequence["OnnxProvider"]] = None,
         **kwargs: Any,
     ) -> "TextEmbedding":
         if model_name in cls.embedding_models:
@@ -165,7 +181,11 @@ def _get_or_init_model(
                 f"Unsupported embedding model: {model_name}. Supported models: {SUPPORTED_EMBEDDING_MODELS}"
             )
         cls.embedding_models[model_name] = TextEmbedding(
-            model_name=model_name, cache_dir=cache_dir, threads=threads, **kwargs
+            model_name=model_name,
+            cache_dir=cache_dir,
+            threads=threads,
+            providers=providers,
+            **kwargs,
         )
         return cls.embedding_models[model_name]
@@ -175,6 +195,7 @@ def _get_or_init_sparse_model(
         model_name: str,
         cache_dir: Optional[str] = None,
         threads: Optional[int] = None,
+        providers: Optional[Sequence["OnnxProvider"]] = None,
         **kwargs: Any,
     ) -> "SparseTextEmbedding":
         if model_name in cls.sparse_embedding_models:
@@ -185,7 +206,11 @@ def _get_or_init_sparse_model(
                 f"Unsupported embedding model: {model_name}. Supported models: {SUPPORTED_SPARSE_EMBEDDING_MODELS}"
             )
         cls.sparse_embedding_models[model_name] = SparseTextEmbedding(
-            model_name=model_name, cache_dir=cache_dir, threads=threads, **kwargs
+            model_name=model_name,
+            cache_dir=cache_dir,
+            threads=threads,
+            providers=providers,
+            **kwargs,
         )
         return cls.sparse_embedding_models[model_name]
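The async mixin keeps `set_model`/`set_sparse_model` synchronous (they only configure the local ONNX session); only the Qdrant operations themselves are awaited. A usage sketch, assuming the fastembed convenience methods `add` and `query` exposed by `AsyncQdrantClient`:

```python
import asyncio

from qdrant_client import AsyncQdrantClient


async def main() -> None:
    client = AsyncQdrantClient(":memory:")
    # Model configuration is synchronous even on the async client.
    client.set_model(
        client.DEFAULT_EMBEDDING_MODEL,
        providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
    )
    await client.add(collection_name="demo", documents=["GPU inference", "CPU fallback"])
    print(await client.query(collection_name="demo", query_text="inference"))


asyncio.run(main())
```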
diff --git a/qdrant_client/qdrant_fastembed.py b/qdrant_client/qdrant_fastembed.py
index 8850f4ac..d45d3378 100644
--- a/qdrant_client/qdrant_fastembed.py
+++ b/qdrant_client/qdrant_fastembed.py
@@ -1,7 +1,7 @@
 import uuid
 import warnings
 from itertools import tee
-from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union
 
 from qdrant_client.client_base import QdrantBase
 from qdrant_client.conversions import common_types as types
@@ -10,11 +10,12 @@ from qdrant_client.hybrid.fusion import reciprocal_rank_fusion
 
 try:
-    from fastembed import TextEmbedding
-    from fastembed.sparse.sparse_text_embedding import SparseTextEmbedding
+    from fastembed import SparseTextEmbedding, TextEmbedding
+    from fastembed.common import OnnxProvider
 except ImportError:
     TextEmbedding = None
     SparseTextEmbedding = None
+    OnnxProvider = None
 
 
 SUPPORTED_EMBEDDING_MODELS: Dict[str, Tuple[int, models.Distance]] = (
@@ -45,8 +46,7 @@ def __init__(self, **kwargs: Any):
         self._embedding_model_name: Optional[str] = None
         self._sparse_embedding_model_name: Optional[str] = None
         try:
-            from fastembed import TextEmbedding  # noqa: F401
-            from fastembed.sparse.sparse_text_embedding import SparseTextEmbedding
+            from fastembed import SparseTextEmbedding, TextEmbedding  # noqa: F401
 
             self.__class__._FASTEMBED_INSTALLED = True
         except ImportError:
@@ -70,6 +70,7 @@ def set_model(
         max_length: Optional[int] = None,
         cache_dir: Optional[str] = None,
         threads: Optional[int] = None,
+        providers: Optional[Sequence["OnnxProvider"]] = None,
         **kwargs: Any,
     ) -> None:
         """
@@ -81,6 +82,9 @@ def set_model(
                 Can be set using the `FASTEMBED_CACHE_PATH` env variable.
                 Defaults to `fastembed_cache` in the system's temp directory.
             threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
+            providers: The list of onnx providers (with or without options) to use. Defaults to None.
+                Example configuration:
+                https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#configuration-options
         Raises:
             ValueError: If embedding model is not supported.
             ImportError: If fastembed is not installed.
@@ -101,6 +105,7 @@ def set_model(
             model_name=embedding_model_name,
             cache_dir=cache_dir,
             threads=threads,
+            providers=providers,
             **kwargs,
         )
         self._embedding_model_name = embedding_model_name
@@ -110,6 +115,7 @@ def set_sparse_model(
         embedding_model_name: Optional[str],
         cache_dir: Optional[str] = None,
         threads: Optional[int] = None,
+        providers: Optional[Sequence["OnnxProvider"]] = None,
     ) -> None:
         """
         Set sparse embedding model to use for hybrid search over documents in combination with dense embeddings.
@@ -120,6 +126,9 @@ def set_sparse_model(
                 Can be set using the `FASTEMBED_CACHE_PATH` env variable.
                 Defaults to `fastembed_cache` in the system's temp directory.
             threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
+            providers: The list of onnx providers (with or without options) to use. Defaults to None.
+                Example configuration:
+                https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#configuration-options
         Raises:
             ValueError: If embedding model is not supported.
             ImportError: If fastembed is not installed.
@@ -132,6 +141,7 @@ def set_sparse_model(
                 model_name=embedding_model_name,
                 cache_dir=cache_dir,
                 threads=threads,
+                providers=providers,
             )
         self._sparse_embedding_model_name = embedding_model_name
@@ -163,6 +173,7 @@ def _get_or_init_model(
         model_name: str,
         cache_dir: Optional[str] = None,
         threads: Optional[int] = None,
+        providers: Optional[Sequence["OnnxProvider"]] = None,
         **kwargs: Any,
     ) -> "TextEmbedding":
         if model_name in cls.embedding_models:
@@ -179,6 +190,7 @@ def _get_or_init_model(
             model_name=model_name,
             cache_dir=cache_dir,
             threads=threads,
+            providers=providers,
             **kwargs,
         )
         return cls.embedding_models[model_name]
@@ -189,6 +201,7 @@ def _get_or_init_sparse_model(
         model_name: str,
         cache_dir: Optional[str] = None,
         threads: Optional[int] = None,
+        providers: Optional[Sequence["OnnxProvider"]] = None,
         **kwargs: Any,
     ) -> "SparseTextEmbedding":
         if model_name in cls.sparse_embedding_models:
@@ -205,6 +218,7 @@ def _get_or_init_sparse_model(
             model_name=model_name,
             cache_dir=cache_dir,
             threads=threads,
+            providers=providers,
             **kwargs,
        )
        return cls.sparse_embedding_models[model_name]
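Since `set_sparse_model` gained the same `providers` parameter, a hybrid dense-plus-sparse setup can run both ONNX sessions on the GPU. A sketch; the sparse model name here is only an example and has to be one of `SUPPORTED_SPARSE_EMBEDDING_MODELS`:

```python
from qdrant_client import QdrantClient

client = QdrantClient(":memory:")

gpu_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
# Dense and sparse models each get their own onnxruntime session, so the
# providers sequence is passed to both.
client.set_model(client.DEFAULT_EMBEDDING_MODEL, providers=gpu_providers)
client.set_sparse_model("prithivida/Splade_PP_en_v1", providers=gpu_providers)

# With both models set, add() stores dense and sparse vectors, and query()
# fuses the two result lists (reciprocal rank fusion, per the import above).
client.add(collection_name="hybrid-demo", documents=["fast retrieval", "hybrid search"])
print(client.query(collection_name="hybrid-demo", query_text="retrieval"))
```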