Team/hypothesis tests (#474)

Merges the team/hypothesis-tests branch into main, which adds a robust property-based testing suite to Chroma. lfg.
chroma-core · May 6, 2023 · f9b8f7c · f9b8f7c
1 parent 25e2cff
commit f9b8f7c
Show file tree

Hide file tree

Showing 38 changed files with 3,222 additions and 556 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -1,3 +1,9 @@
 venv
 .git
-examples
+examples
+clients
+.hypothesis
+__pycache__
+.vscode
+*.egg-info
+.pytest_cache
diff --git a/.github/workflows/chroma-integration-test.yml b/.github/workflows/chroma-integration-test.yml
@@ -0,0 +1,37 @@
+name: Chroma Integration Tests
+
+on:
+  push:
+    branches:
+      - main
+      - team/hypothesis-tests
+  pull_request:
+    branches:
+      - main
+      - team/hypothesis-tests
+
+jobs:
+  test:
+    strategy:
+      matrix:
+        python: ['3.7']
+        platform: [ubuntu-latest]
+        testfile: ["--ignore-glob 'chromadb/test/property/*'",
+                   "chromadb/test/property/test_add.py",
+                   "chromadb/test/property/test_collections.py",
+                   "chromadb/test/property/test_cross_version_persist.py",
+                   "chromadb/test/property/test_embeddings.py",
+                   "chromadb/test/property/test_filtering.py",
+                   "chromadb/test/property/test_persist.py"]
+    runs-on: ${{ matrix.platform }}
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python }}
+    - name: Install test dependencies
+      run: python -m pip install -r requirements.txt && python -m pip install -r requirements_dev.txt
+    - name: Integration Test
+      run: bin/integration-test ${{ matrix.testfile }}
diff --git a/.github/workflows/chroma-test.yml b/.github/workflows/chroma-test.yml
@@ -4,16 +4,26 @@ on:
   push:
     branches:
       - main
+      - team/hypothesis-tests
   pull_request:
     branches:
       - main
+      - team/hypothesis-tests
 
 jobs:
   test:
+    timeout-minutes: 90
     strategy:
       matrix:
-        python: ['3.10']
+        python: ['3.7', '3.8', '3.9', '3.10']
         platform: [ubuntu-latest]
+        testfile: ["--ignore-glob 'chromadb/test/property/*'",
+                   "chromadb/test/property/test_add.py",
+                   "chromadb/test/property/test_collections.py",
+                   "chromadb/test/property/test_cross_version_persist.py",
+                   "chromadb/test/property/test_embeddings.py",
+                   "chromadb/test/property/test_filtering.py",
+                   "chromadb/test/property/test_persist.py"]
     runs-on: ${{ matrix.platform }}
     steps:
     - name: Checkout
@@ -25,6 +35,4 @@ jobs:
     - name: Install test dependencies
       run: python -m pip install -r requirements.txt && python -m pip install -r requirements_dev.txt
     - name: Test
-      run: python -m pytest
-    - name: Integration Test
-      run: bin/integration-test
+      run: python -m pytest ${{ matrix.testfile }}
diff --git a/.github/workflows/pr-review-checklist.yml b/.github/workflows/pr-review-checklist.yml
@@ -0,0 +1,37 @@
+name: PR Review Checklist
+
+on:
+  pull_request_target:
+    types: 
+      - opened
+
+jobs:
+  PR-Comment:
+    runs-on: ubuntu-latest
+    steps:
+    - name: PR Comment
+      uses: actions/github-script@v2
+      with:
+        github-token: ${{secrets.GITHUB_TOKEN}}
+        script: |
+          github.issues.createComment({
+            issue_number: ${{ github.event.number }},
+            owner: context.repo.owner,
+            repo: context.repo.repo,
+            body: `# Reviewer Checklist
+            Please leverage this checklist to ensure your code review is thorough before approving
+            ## Testing, Bugs, Errors, Logs, Documentation
+            - [ ] Can you think of any use case in which the code does not behave as intended? Have they been tested?
+            - [ ] Can you think of any inputs or external events that could break the code? Is user input validated and safe? Have they been tested?
+            - [ ] If appropriate, are there adequate property based tests?
+            - [ ] If appropriate, are there adequate unit tests?
+            - [ ] Should any logging, debugging, tracing information be added or removed?
+            - [ ] Are error messages user-friendly?
+            - [ ] Have all documentation changes needed been made?
+            - [ ] Have all non-obvious changes been commented?
+            ## System Compatibility
+            - [ ] Are there any potential impacts on other parts of the system or backward compatibility?
+            - [ ] Does this change intersect with any items on our roadmap, and if so, is there a plan for fitting them together?
+            ## Quality
+            - [ ] Is this code of an unexpectedly high quality (Readability, Modularity, Intuitiveness)?`
+          })
diff --git a/.gitignore b/.gitignore
@@ -21,4 +21,4 @@ dist
 .terraform.lock.hcl
 terraform.tfstate
 .hypothesis/
-.idea
+.idea
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -1,23 +1,28 @@
 {
-    "git.ignoreLimitWarning": true,
-    "editor.rulers": [
-        120
-    ],
-    "editor.formatOnSave": true,
-    "python.formatting.provider": "black",
-    "files.exclude": {
-        "**/__pycache__": true,
-        "**/.ipynb_checkpoints": true,
-        "**/.pytest_cache": true,
-        "**/chroma.egg-info": true
-    },
-    "python.analysis.typeCheckingMode": "basic",
-    "python.linting.flake8Enabled": true,
-    "python.linting.enabled": true,
-    "python.linting.flake8Args": [
-        "--extend-ignore=E203",
-        "--extend-ignore=E501",
-        "--extend-ignore=E503",
-        "--max-line-length=88",
-    ],
-}
+  "git.ignoreLimitWarning": true,
+  "editor.rulers": [
+    120
+  ],
+  "editor.formatOnSave": true,
+  "python.formatting.provider": "black",
+  "files.exclude": {
+    "**/__pycache__": true,
+    "**/.ipynb_checkpoints": true,
+    "**/.pytest_cache": true,
+    "**/chroma.egg-info": true
+  },
+  "python.analysis.typeCheckingMode": "basic",
+  "python.linting.flake8Enabled": true,
+  "python.linting.enabled": true,
+  "python.linting.flake8Args": [
+    "--extend-ignore=E203",
+    "--extend-ignore=E501",
+    "--extend-ignore=E503",
+    "--max-line-length=88"
+  ],
+  "python.testing.pytestArgs": [
+    "."
+  ],
+  "python.testing.unittestEnabled": false,
+  "python.testing.pytestEnabled": true
+}
diff --git a/README.md b/README.md
@@ -13,10 +13,10 @@
   </a> |
   <a href="https://github.com/chroma-core/chroma/blob/master/LICENSE" target="_blank">
       <img src="https://img.shields.io/static/v1?label=license&message=Apache 2.0&color=white" alt="License">
-  </a> | 
+  </a> |
   <a href="https://docs.trychroma.com/" target="_blank">
       Docs
-  </a> | 
+  </a> |
   <a href="https://www.trychroma.com/" target="_blank">
       Homepage
   </a>
@@ -30,19 +30,19 @@ pip install chromadb # python client
 
 The core API is only 4 functions (run our [💡 Google Colab](https://colab.research.google.com/drive/1QEzFyqnoFxq7LUGyP1vzR4iLt9PpCDXv?usp=sharing) or [Replit template](https://replit.com/@swyx/BasicChromaStarter?v=1)):
 
-```python 
+```python
 import chromadb
 # setup Chroma in-memory, for easy prototyping. Can add persistence easily!
 client = chromadb.Client()
 
 # Create collection. get_collection, get_or_create_collection, delete_collection also available!
-collection = client.create_collection("all-my-documents") 
+collection = client.create_collection("all-my-documents")
 
 # Add docs to the collection. Can also update and delete. Row-based API coming soon!
 collection.add(
     documents=["This is document1", "This is document2"], # we handle tokenization, embedding, and indexing automatically. You can skip that and add your own embeddings as well
     metadatas=[{"source": "notion"}, {"source": "google-docs"}], # filter on these!
-    ids=["doc1", "doc2"], # unique for each doc 
+    ids=["doc1", "doc2"], # unique for each doc
 )
 
 # Query/search 2 most similar results. You can also .get by id
@@ -66,23 +66,23 @@ results = collection.query(
 For example, the `"Chat your data"` use case:
 1. Add documents to your database. You can pass in your own embeddings, embedding function, or let Chroma embed them for you.
 2. Query relevant documents with natural language.
-3. Compose documents into the context window of an LLM like `GPT3` for additional summarization or analysis. 
+3. Compose documents into the context window of an LLM like `GPT3` for additional summarization or analysis.
 
 ## Embeddings?
 
 What are embeddings?
 
 - [Read the guide from OpenAI](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings)
-- __Literal__: Embedding something turns it from image/text/audio into a list of numbers. 🖼️ or 📄 => `[1.2, 2.1, ....]`. This process makes documents "understandable" to a machine learning model. 
-- __By analogy__: An embedding represents the essence of a document. This enables documents and queries with the same essence to be "near" each other and therefore easy to find. 
+- __Literal__: Embedding something turns it from image/text/audio into a list of numbers. 🖼️ or 📄 => `[1.2, 2.1, ....]`. This process makes documents "understandable" to a machine learning model.
+- __By analogy__: An embedding represents the essence of a document. This enables documents and queries with the same essence to be "near" each other and therefore easy to find.
 - __Technical__: An embedding is the latent-space position of a document at a layer of a deep neural network. For models trained specifically to embed data, this is the last layer.
 - __A small example__: If you search your photos for "famous bridge in San Francisco". By embedding this query and comparing it to the embeddings of your photos and their metadata - it should return photos of the Golden Gate Bridge.
 
 Embeddings databases (also known as **vector databases**) store embeddings and allow you to search by nearest neighbors rather than by substrings like a traditional database. By default, Chroma uses [Sentence Transformers](https://docs.trychroma.com/embeddings#default-sentence-transformers) to embed for you but you can also use OpenAI embeddings, Cohere (multilingual) embeddings, or your own.
 
 ## Get involved
 
-Chroma is a rapidly developing project. We welcome PR contributors and ideas for how to improve the project. 
+Chroma is a rapidly developing project. We welcome PR contributors and ideas for how to improve the project.
 - [Join the conversation on Discord](https://discord.gg/MMeYNTmh3x)
 - [Review the roadmap and contribute your ideas](https://docs.trychroma.com/roadmap)
 - [Grab an issue and open a PR](https://github.com/chroma-core/chroma/issues)

diff --git a/bin/integration-test b/bin/integration-test
@@ -12,15 +12,15 @@ trap cleanup EXIT
 
 docker compose -f docker-compose.test.yml up --build -d
 
-export CHROMA_INTEGRATION_TEST=1
+export CHROMA_INTEGRATION_TEST_ONLY=1
 export CHROMA_API_IMPL=rest
 export CHROMA_SERVER_HOST=localhost
 export CHROMA_SERVER_HTTP_PORT=8000
 
-python -m pytest
+echo testing: python -m pytest "$@"
+python -m pytest "$@"
 
 cd clients/js
 yarn
 yarn test:run
-cd ../..
-
+cd ../..
diff --git a/chromadb/api/__init__.py b/chromadb/api/__init__.py
@@ -61,7 +61,8 @@ def create_collection(
         Args:
             name (str): The name of the collection to create. The name must be unique.
             metadata (Optional[Dict], optional): A dictionary of metadata to associate with the collection. Defaults to None.
-            get_or_create (bool, optional): If True, will return the collection if it already exists. Defaults to False.
+            get_or_create (bool, optional): If True, will return the collection if it already exists,
+                and update the metadata (if applicable). Defaults to False.
             embedding_function (Optional[Callable], optional): A function that takes documents and returns an embedding. Defaults to None.
 
         Returns:
@@ -82,8 +83,11 @@ def delete_collection(
         """
 
     @abstractmethod
-    def get_or_create_collection(self, name: str, metadata: Optional[Dict] = None) -> Collection:
-        """Calls create_collection with get_or_create=True
+    def get_or_create_collection(
+        self, name: str, metadata: Optional[Dict] = None
+    ) -> Collection:
+        """Calls create_collection with get_or_create=True.
+           If the collection exists, but with different metadata, the metadata will be replaced.
 
         Args:
             name (str): The name of the collection to create. The name must be unique.
@@ -141,7 +145,7 @@ def _add(
         ⚠️ It is recommended to use the more specific methods below when possible.
 
         Args:
-            collection_name (Union[str, Sequence[str]]): The model space(s) to add the embeddings to
+            collection_name (Union[str, Sequence[str]]): The collection(s) to add the embeddings to
             embedding (Sequence[Sequence[float]]): The sequence of embeddings to add
             metadata (Optional[Union[Dict, Sequence[Dict]]], optional): The metadata to associate with the embeddings. Defaults to None.
             documents (Optional[Union[str, Sequence[str]]], optional): The documents to associate with the embeddings. Defaults to None.
@@ -162,17 +166,40 @@ def _update(
         ⚠️ It is recommended to use the more specific methods below when possible.
 
         Args:
-            collection_name (Union[str, Sequence[str]]): The model space(s) to add the embeddings to
+            collection_name (Union[str, Sequence[str]]): The collection(s) to add the embeddings to
             embedding (Sequence[Sequence[float]]): The sequence of embeddings to add
         """
         pass
 
+    @abstractmethod
+    def _upsert(
+        self,
+        collection_name: str,
+        ids: IDs,
+        embeddings: Optional[Embeddings] = None,
+        metadatas: Optional[Metadatas] = None,
+        documents: Optional[Documents] = None,
+        increment_index: bool = True,
+    ):
+        """Add or update entries in the embedding store.
+        If an entry with the same id already exists, it will be updated, otherwise it will be added.
+
+        Args:
+            collection_name (str): The collection to add the embeddings to
+            ids (IDs): The ids to associate with the embeddings.
+            embeddings (Optional[Embeddings], optional): The sequence of embeddings to add. Defaults to None.
+            metadatas (Optional[Union[Dict, Sequence[Dict]]], optional): The metadata to associate with the embeddings. Defaults to None.
+            documents (Optional[Union[str, Sequence[str]]], optional): The documents to associate with the embeddings. Defaults to None.
+            increment_index (bool, optional): If True, will incrementally add to the ANN index of the collection. Defaults to True.
+        """
+        pass
+
     @abstractmethod
     def _count(self, collection_name: str) -> int:
         """Returns the number of embeddings in the database
 
         Args:
-            collection_name (str): The model space to count the embeddings in.
+            collection_name (str): The collection to count the embeddings in.
 
         Returns:
             int: The number of embeddings in the collection
@@ -278,14 +305,19 @@ def raw_sql(self, sql: str) -> pd.DataFrame:
 
     @abstractmethod
     def create_index(self, collection_name: Optional[str] = None) -> bool:
-        """Creates an index for the given model space
+        """Creates an index for the given collection
         ⚠️ This method should not be used directly.
 
         Args:
-            collection_name (Optional[str], optional): The model space to create the index for. Uses the client's model space if None. Defaults to None.
+            collection_name (Optional[str], optional): The collection to create the index for. Uses the client's collection if None. Defaults to None.
 
         Returns:
             bool: True if the index was created successfully
 
         """
         pass
+
+    @abstractmethod
+    def persist(self):
+        """Persist the database to disk"""
+        pass