
[ENH]: FastEmbed embedding function support #1986

Closed · wants to merge 14 commits
1 change: 1 addition & 0 deletions chromadb/test/ef/test_ef.py
@@ -30,6 +30,7 @@ def test_get_builtins_holds() -> None:
"SentenceTransformerEmbeddingFunction",
"Text2VecEmbeddingFunction",
"ChromaLangchainEmbeddingFunction",
"FastEmbedEmbeddingFunction",
}

assert expected_builtins == embedding_functions.get_builtins()
15 changes: 15 additions & 0 deletions chromadb/test/ef/test_fastembed_ef.py
@@ -0,0 +1,15 @@
import pytest

from chromadb.utils.embedding_functions.fastembed_embedding_function import (
FastEmbedEmbeddingFunction,
)

# Skip the test if the 'fastembed' package is not installed
fastembed = pytest.importorskip("fastembed", reason="fastembed not installed")


def test_fastembed() -> None:
ef = FastEmbedEmbeddingFunction(model_name="BAAI/bge-small-en-v1.5")
embeddings = ef(["Here is an article about llamas...", "this is another article"])
assert len(embeddings) == 2
assert len(embeddings[0]) == 384
73 changes: 73 additions & 0 deletions chromadb/utils/embedding_functions/fastembed_embedding_function.py
@@ -0,0 +1,73 @@
from typing import Any, Optional, cast

from chromadb.api.types import Documents, EmbeddingFunction, Embeddings


class FastEmbedEmbeddingFunction(EmbeddingFunction[Documents]):
"""
This class is used to generate embeddings for a list of texts using FastEmbed - https://qdrant.github.io/fastembed/.
Find the list of supported models at https://qdrant.github.io/fastembed/examples/Supported_Models/.
"""

def __init__(
self,
model_name: str = "BAAI/bge-small-en-v1.5",
batch_size: int = 256,
cache_dir: Optional[str] = None,
threads: Optional[int] = None,
parallel: Optional[int] = None,
**kwargs: Any,
) -> None:
"""
Initialize fastembed.TextEmbedding

Args:
model_name (str): The name of the model to use. Defaults to `"BAAI/bge-small-en-v1.5"`.
batch_size (int): Batch size for encoding. Higher values will use more memory, but be faster.\
Defaults to 256.
cache_dir (str, optional): The path to the model cache directory.\
Can also be set using the `FASTEMBED_CACHE_PATH` env variable.
threads (int, optional): The number of threads single onnxruntime session can use.
parallel (int, optional): If `>1`, data-parallel encoding will be used, recommended for offline encoding of large datasets.\
If `0`, use all available cores.\
If `None`, don't use data-parallel processing, use default onnxruntime threading instead.\
Defaults to None.
**kwargs: Additional options to pass to fastembed.TextEmbedding

Raises:
ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
"""
try:
from fastembed import TextEmbedding
except ImportError:
raise ValueError(
"The 'fastembed' package is not installed. Please install it with `pip install fastembed`"
)
self._batch_size = batch_size
self._parallel = parallel
self._model = TextEmbedding(
model_name=model_name, cache_dir=cache_dir, threads=threads, **kwargs
)

def __call__(self, input: Documents) -> Embeddings:
"""
Get the embeddings for a list of texts.

Args:
input (Documents): A list of texts to get embeddings for.

Returns:
Embeddings: The embeddings for the texts.

Example:
>>> fastembed_ef = FastEmbedEmbeddingFunction(model_name="sentence-transformers/all-MiniLM-L6-v2")
>>> texts = ["Hello, world!", "How are you?"]
>>> embeddings = fastembed_ef(texts)
"""
embeddings = self._model.embed(
input, batch_size=self._batch_size, parallel=self._parallel
)
return cast(
Embeddings,
[embedding.tolist() for embedding in embeddings],
)
1 change: 1 addition & 0 deletions docs/docs.trychroma.com/pages/guides/embeddings.md
@@ -18,6 +18,7 @@ Chroma provides lightweight wrappers around popular embedding providers, making
| [Instructor](/integrations/instructor) | ✅ | ➖ |
| [Hugging Face Embedding Server](/integrations/hugging-face-server) | ✅ | ✅ |
| [Jina AI](/integrations/jinaai) | ✅ | ✅ |
| [FastEmbed](/integrations/fastembed) | ✅ | ➖ |

We welcome pull requests to add new Embedding Functions to the community.

29 changes: 29 additions & 0 deletions docs/docs.trychroma.com/pages/integrations/fastembed.md
@@ -0,0 +1,29 @@
---
title: FastEmbed
---

# FastEmbed

[FastEmbed](https://qdrant.github.io/fastembed/) is a lightweight, CPU-first Python library built for embedding generation.

This embedding function requires the `fastembed` package. To install it, run:

```sh
pip install fastembed
```

You can find a list of all the supported models [here](https://qdrant.github.io/fastembed/examples/Supported_Models/).

## Example usage

Using the default `BAAI/bge-small-en-v1.5` model:

```python
from chromadb.utils.embedding_functions.fastembed_embedding_function import FastEmbedEmbeddingFunction
ef = FastEmbedEmbeddingFunction()
```
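For context, every Chroma embedding function, including `FastEmbedEmbeddingFunction`, follows the same callable protocol: it takes a list of documents and returns one fixed-length vector per document. A minimal, dependency-free sketch of that contract (the `ToyEmbeddingFunction` class and its character-code scheme are hypothetical, for illustration only — not a real embedding model):

```python
from typing import List


class ToyEmbeddingFunction:
    """Hypothetical stand-in showing the embedding-function call shape:
    list of documents in, one fixed-length float vector per document out."""

    def __init__(self, dim: int = 4) -> None:
        self._dim = dim

    def __call__(self, input: List[str]) -> List[List[float]]:
        out = []
        for text in input:
            # Deterministic toy "embedding": normalized character codes,
            # truncated or zero-padded to the target dimension.
            codes = [ord(c) / 255.0 for c in text[: self._dim]]
            codes += [0.0] * (self._dim - len(codes))
            out.append(codes)
        return out


ef = ToyEmbeddingFunction()
vectors = ef(["hello", "hi"])
# two vectors, each of length 4
```

A real `FastEmbedEmbeddingFunction` instance is called the same way, but returns model-generated vectors (384-dimensional for the default `BAAI/bge-small-en-v1.5`).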

You can also configure the cache directory, number of threads, and other FastEmbed options.

```python
from chromadb.utils.embedding_functions import FastEmbedEmbeddingFunction
ef = FastEmbedEmbeddingFunction(model_name="nomic-ai/nomic-embed-text-v1.5", cache_dir="models_cache", threads=5)
```
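The `batch_size` parameter controls how many documents are encoded per inference call: larger batches trade memory for throughput. A rough pure-Python sketch of the grouping this implies (illustrative only, not fastembed's actual internals):

```python
from typing import List


def batched(texts: List[str], batch_size: int) -> List[List[str]]:
    """Split documents into fixed-size groups, mirroring how a
    batch_size parameter partitions input before encoding."""
    return [texts[i : i + batch_size] for i in range(0, len(texts), batch_size)]


batches = batched([f"doc {i}" for i in range(10)], batch_size=4)
# groups of sizes 4, 4, 2
```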