Skip to content

Commit

Permalink
Team/hypothesis tests (#474)
Browse files Browse the repository at this point in the history
Merges the team/hypothesis-tests branch into main, which adds a robust property-based testing suite to Chroma. lfg.
  • Loading branch information
HammadB authored May 6, 2023
1 parent 25e2cff commit f9b8f7c
Show file tree
Hide file tree
Showing 38 changed files with 3,222 additions and 556 deletions.
8 changes: 7 additions & 1 deletion .dockerignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
venv
.git
examples
examples
clients
.hypothesis
__pycache__
.vscode
*.egg-info
.pytest_cache
37 changes: 37 additions & 0 deletions .github/workflows/chroma-integration-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
name: Chroma Integration Tests

on:
push:
branches:
- main
- team/hypothesis-tests
pull_request:
branches:
- main
- team/hypothesis-tests

jobs:
test:
strategy:
matrix:
python: ['3.7']
platform: [ubuntu-latest]
testfile: ["--ignore-glob 'chromadb/test/property/*'",
"chromadb/test/property/test_add.py",
"chromadb/test/property/test_collections.py",
"chromadb/test/property/test_cross_version_persist.py",
"chromadb/test/property/test_embeddings.py",
"chromadb/test/property/test_filtering.py",
"chromadb/test/property/test_persist.py"]
runs-on: ${{ matrix.platform }}
steps:
- name: Checkout
uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python }}
- name: Install test dependencies
run: python -m pip install -r requirements.txt && python -m pip install -r requirements_dev.txt
- name: Integration Test
run: bin/integration-test ${{ matrix.testfile }}
16 changes: 12 additions & 4 deletions .github/workflows/chroma-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,26 @@ on:
push:
branches:
- main
- team/hypothesis-tests
pull_request:
branches:
- main
- team/hypothesis-tests

jobs:
test:
timeout-minutes: 90
strategy:
matrix:
python: ['3.10']
python: ['3.7', '3.8', '3.9', '3.10']
platform: [ubuntu-latest]
testfile: ["--ignore-glob 'chromadb/test/property/*'",
"chromadb/test/property/test_add.py",
"chromadb/test/property/test_collections.py",
"chromadb/test/property/test_cross_version_persist.py",
"chromadb/test/property/test_embeddings.py",
"chromadb/test/property/test_filtering.py",
"chromadb/test/property/test_persist.py"]
runs-on: ${{ matrix.platform }}
steps:
- name: Checkout
Expand All @@ -25,6 +35,4 @@ jobs:
- name: Install test dependencies
run: python -m pip install -r requirements.txt && python -m pip install -r requirements_dev.txt
- name: Test
run: python -m pytest
- name: Integration Test
run: bin/integration-test
run: python -m pytest ${{ matrix.testfile }}
37 changes: 37 additions & 0 deletions .github/workflows/pr-review-checklist.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
name: PR Review Checklist

on:
pull_request_target:
types:
- opened

jobs:
PR-Comment:
runs-on: ubuntu-latest
steps:
- name: PR Comment
uses: actions/github-script@v2
with:
github-token: ${{secrets.GITHUB_TOKEN}}
script: |
github.issues.createComment({
issue_number: ${{ github.event.number }},
owner: context.repo.owner,
repo: context.repo.repo,
body: `# Reviewer Checklist
Please leverage this checklist to ensure your code review is thorough before approving
## Testing, Bugs, Errors, Logs, Documentation
- [ ] Can you think of any use case in which the code does not behave as intended? Have they been tested?
- [ ] Can you think of any inputs or external events that could break the code? Is user input validated and safe? Have they been tested?
- [ ] If appropriate, are there adequate property based tests?
- [ ] If appropriate, are there adequate unit tests?
- [ ] Should any logging, debugging, tracing information be added or removed?
- [ ] Are error messages user-friendly?
- [ ] Have all documentation changes needed been made?
- [ ] Have all non-obvious changes been commented?
## System Compatibility
- [ ] Are there any potential impacts on other parts of the system or backward compatibility?
- [ ] Does this change intersect with any items on our roadmap, and if so, is there a plan for fitting them together?
## Quality
- [ ] Is this code of an unexpectedly high quality (Readability, Modularity, Intuitiveness)?`
})
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,4 @@ dist
.terraform.lock.hcl
terraform.tfstate
.hypothesis/
.idea
.idea
49 changes: 27 additions & 22 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,23 +1,28 @@
{
"git.ignoreLimitWarning": true,
"editor.rulers": [
120
],
"editor.formatOnSave": true,
"python.formatting.provider": "black",
"files.exclude": {
"**/__pycache__": true,
"**/.ipynb_checkpoints": true,
"**/.pytest_cache": true,
"**/chroma.egg-info": true
},
"python.analysis.typeCheckingMode": "basic",
"python.linting.flake8Enabled": true,
"python.linting.enabled": true,
"python.linting.flake8Args": [
"--extend-ignore=E203",
"--extend-ignore=E501",
"--extend-ignore=E503",
"--max-line-length=88",
],
}
"git.ignoreLimitWarning": true,
"editor.rulers": [
120
],
"editor.formatOnSave": true,
"python.formatting.provider": "black",
"files.exclude": {
"**/__pycache__": true,
"**/.ipynb_checkpoints": true,
"**/.pytest_cache": true,
"**/chroma.egg-info": true
},
"python.analysis.typeCheckingMode": "basic",
"python.linting.flake8Enabled": true,
"python.linting.enabled": true,
"python.linting.flake8Args": [
"--extend-ignore=E203",
"--extend-ignore=E501",
"--extend-ignore=E503",
"--max-line-length=88"
],
"python.testing.pytestArgs": [
"."
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}
18 changes: 9 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
</a> |
<a href="https://github.com/chroma-core/chroma/blob/master/LICENSE" target="_blank">
<img src="https://img.shields.io/static/v1?label=license&message=Apache 2.0&color=white" alt="License">
</a> |
</a> |
<a href="https://docs.trychroma.com/" target="_blank">
Docs
</a> |
</a> |
<a href="https://www.trychroma.com/" target="_blank">
Homepage
</a>
Expand All @@ -30,19 +30,19 @@ pip install chromadb # python client

The core API is only 4 functions (run our [💡 Google Colab](https://colab.research.google.com/drive/1QEzFyqnoFxq7LUGyP1vzR4iLt9PpCDXv?usp=sharing) or [Replit template](https://replit.com/@swyx/BasicChromaStarter?v=1)):

```python
```python
import chromadb
# setup Chroma in-memory, for easy prototyping. Can add persistence easily!
client = chromadb.Client()

# Create collection. get_collection, get_or_create_collection, delete_collection also available!
collection = client.create_collection("all-my-documents")
collection = client.create_collection("all-my-documents")

# Add docs to the collection. Can also update and delete. Row-based API coming soon!
collection.add(
documents=["This is document1", "This is document2"], # we handle tokenization, embedding, and indexing automatically. You can skip that and add your own embeddings as well
metadatas=[{"source": "notion"}, {"source": "google-docs"}], # filter on these!
ids=["doc1", "doc2"], # unique for each doc
ids=["doc1", "doc2"], # unique for each doc
)

# Query/search 2 most similar results. You can also .get by id
Expand All @@ -66,23 +66,23 @@ results = collection.query(
For example, the `"Chat your data"` use case:
1. Add documents to your database. You can pass in your own embeddings, embedding function, or let Chroma embed them for you.
2. Query relevant documents with natural language.
3. Compose documents into the context window of an LLM like `GPT3` for additional summarization or analysis.
3. Compose documents into the context window of an LLM like `GPT3` for additional summarization or analysis.

## Embeddings?

What are embeddings?

- [Read the guide from OpenAI](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings)
- __Literal__: Embedding something turns it from image/text/audio into a list of numbers. 🖼️ or 📄 => `[1.2, 2.1, ....]`. This process makes documents "understandable" to a machine learning model.
- __By analogy__: An embedding represents the essence of a document. This enables documents and queries with the same essence to be "near" each other and therefore easy to find.
- __Literal__: Embedding something turns it from image/text/audio into a list of numbers. 🖼️ or 📄 => `[1.2, 2.1, ....]`. This process makes documents "understandable" to a machine learning model.
- __By analogy__: An embedding represents the essence of a document. This enables documents and queries with the same essence to be "near" each other and therefore easy to find.
- __Technical__: An embedding is the latent-space position of a document at a layer of a deep neural network. For models trained specifically to embed data, this is the last layer.
- __A small example__: If you search your photos for "famous bridge in San Francisco", then by embedding this query and comparing it to the embeddings of your photos and their metadata, the search should return photos of the Golden Gate Bridge.

Embeddings databases (also known as **vector databases**) store embeddings and allow you to search by nearest neighbors rather than by substrings like a traditional database. By default, Chroma uses [Sentence Transformers](https://docs.trychroma.com/embeddings#default-sentence-transformers) to embed for you but you can also use OpenAI embeddings, Cohere (multilingual) embeddings, or your own.

## Get involved

Chroma is a rapidly developing project. We welcome PR contributors and ideas for how to improve the project.
Chroma is a rapidly developing project. We welcome PR contributors and ideas for how to improve the project.
- [Join the conversation on Discord](https://discord.gg/MMeYNTmh3x)
- [Review the roadmap and contribute your ideas](https://docs.trychroma.com/roadmap)
- [Grab an issue and open a PR](https://github.com/chroma-core/chroma/issues)
Expand Down
8 changes: 4 additions & 4 deletions bin/integration-test
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,15 @@ trap cleanup EXIT

docker compose -f docker-compose.test.yml up --build -d

export CHROMA_INTEGRATION_TEST=1
export CHROMA_INTEGRATION_TEST_ONLY=1
export CHROMA_API_IMPL=rest
export CHROMA_SERVER_HOST=localhost
export CHROMA_SERVER_HTTP_PORT=8000

python -m pytest
echo testing: python -m pytest "$@"
python -m pytest "$@"

cd clients/js
yarn
yarn test:run
cd ../..

cd ../..
48 changes: 40 additions & 8 deletions chromadb/api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ def create_collection(
Args:
name (str): The name of the collection to create. The name must be unique.
metadata (Optional[Dict], optional): A dictionary of metadata to associate with the collection. Defaults to None.
get_or_create (bool, optional): If True, will return the collection if it already exists. Defaults to False.
get_or_create (bool, optional): If True, will return the collection if it already exists,
and update the metadata (if applicable). Defaults to False.
embedding_function (Optional[Callable], optional): A function that takes documents and returns an embedding. Defaults to None.
Returns:
Expand All @@ -82,8 +83,11 @@ def delete_collection(
"""

@abstractmethod
def get_or_create_collection(self, name: str, metadata: Optional[Dict] = None) -> Collection:
"""Calls create_collection with get_or_create=True
def get_or_create_collection(
self, name: str, metadata: Optional[Dict] = None
) -> Collection:
"""Calls create_collection with get_or_create=True.
If the collection exists, but with different metadata, the metadata will be replaced.
Args:
name (str): The name of the collection to create. The name must be unique.
Expand Down Expand Up @@ -141,7 +145,7 @@ def _add(
⚠️ It is recommended to use the more specific methods below when possible.
Args:
collection_name (Union[str, Sequence[str]]): The model space(s) to add the embeddings to
collection_name (Union[str, Sequence[str]]): The collection(s) to add the embeddings to
embedding (Sequence[Sequence[float]]): The sequence of embeddings to add
metadata (Optional[Union[Dict, Sequence[Dict]]], optional): The metadata to associate with the embeddings. Defaults to None.
documents (Optional[Union[str, Sequence[str]]], optional): The documents to associate with the embeddings. Defaults to None.
Expand All @@ -162,17 +166,40 @@ def _update(
⚠️ It is recommended to use the more specific methods below when possible.
Args:
collection_name (Union[str, Sequence[str]]): The model space(s) to add the embeddings to
collection_name (Union[str, Sequence[str]]): The collection(s) to add the embeddings to
embedding (Sequence[Sequence[float]]): The sequence of embeddings to add
"""
pass

@abstractmethod
def _upsert(
self,
collection_name: str,
ids: IDs,
embeddings: Optional[Embeddings] = None,
metadatas: Optional[Metadatas] = None,
documents: Optional[Documents] = None,
increment_index: bool = True,
):
"""Add or update entries in the embedding store.
If an entry with the same id already exists, it will be updated, otherwise it will be added.
This is an abstract declaration; concrete API implementations provide the behavior.
Args:
collection_name (str): The collection to add the embeddings to
ids (IDs): The ids of the entries to add or update. Required (no default).
embeddings (Optional[Embeddings], optional): The embeddings to add or update. Defaults to None.
metadatas (Optional[Metadatas], optional): The metadata to associate with the embeddings. Defaults to None.
documents (Optional[Documents], optional): The documents to associate with the embeddings. Defaults to None.
increment_index (bool, optional): If True, will incrementally add to the ANN index of the collection. Defaults to True.
"""
pass

@abstractmethod
def _count(self, collection_name: str) -> int:
"""Returns the number of embeddings in the database
Args:
collection_name (str): The model space to count the embeddings in.
collection_name (str): The collection to count the embeddings in.
Returns:
int: The number of embeddings in the collection
Expand Down Expand Up @@ -278,14 +305,19 @@ def raw_sql(self, sql: str) -> pd.DataFrame:

@abstractmethod
def create_index(self, collection_name: Optional[str] = None) -> bool:
"""Creates an index for the given model space
"""Creates an index for the given collection
⚠️ This method should not be used directly.
Args:
collection_name (Optional[str], optional): The model space to create the index for. Uses the client's model space if None. Defaults to None.
collection_name (Optional[str], optional): The collection to create the index for. Uses the client's collection if None. Defaults to None.
Returns:
bool: True if the index was created successfully
"""
pass

@abstractmethod
def persist(self):
"""Persist the database to disk.
This is an abstract declaration with no behavior of its own; concrete API implementations must override it."""
pass
Loading

0 comments on commit f9b8f7c

Please sign in to comment.