From 734b133909f4d2e0e159c02c9447efbd627facbd Mon Sep 17 00:00:00 2001 From: Ben Eggers <64657842+beggers@users.noreply.github.com> Date: Fri, 13 Oct 2023 08:32:50 -0700 Subject: [PATCH] Further posthog improvements (and a little .gitignore) (#1222) ## Description of changes *Summarize the changes made by this PR.* - Improvements & Bug fixes - Increase add() batch size. - Flatten server context so we can do things like group by version. - Add a few things to `.gitignore` which seem to be created by test runs. - New functionality - Batch query() calls (batch size = 20, where we started for add()). - Change `collection.get()` so its fields are `int` instead of `bool` since I imagine we'll eventually batch it as well. ## Test plan *How are these changes tested?* - [x] Tests pass locally with `pytest` for python, `yarn test` for js - [x] Tested locally by printing posthog events: ```python >>> import chromadb >>> chroma_client = chromadb.Client() bf9b885c-b86e-4194-97b4-d9701d293cce ClientStartEvent {'batch_size': 1, 'chroma_version': '0.4.14', 'server_context': 'None', 'chroma_api_impl': 'chromadb.api.segment.SegmentAPI', 'is_persistent': False, 'chroma_server_ssl_enabled': False} >>> collection = chroma_client.create_collection(name="my_collection") bf9b885c-b86e-4194-97b4-d9701d293cce ClientCreateCollectionEvent {'batch_size': 1, 'collection_uuid': '50de2cd2-06ba-442d-82c1-ae0d94b620e4', 'embedding_function': 'ONNXMiniLM_L6_V2', 'chroma_version': '0.4.14', 'server_context': 'None', 'chroma_api_impl': 'chromadb.api.segment.SegmentAPI', 'is_persistent': False, 'chroma_server_ssl_enabled': False} >>> collection.add( ... documents=["This is a document", "This is another document"], ... metadatas=[{"source": "my_source"}, {"source": "my_source"}], ... ids=["id1", "id2"] ... ) bf9b885c-b86e-4194-97b4-d9701d293cce CollectionAddEvent {'batch_size': 1, 'collection_uuid': '50de2cd2-06ba-442d-82c1-ae0d94b620e4', 'add_amount': 2, 'with_documents': 2, 'with_metadata': 2, 'chroma_version': '0.4.14', 'server_context': 'None', 'chroma_api_impl': 'chromadb.api.segment.SegmentAPI', 'is_persistent': False, 'chroma_server_ssl_enabled': False} >>> for i in range(41): ... results = collection.query( ... query_texts=["This is a query document"], ... n_results=2 ... ) ... bf9b885c-b86e-4194-97b4-d9701d293cce CollectionQueryEvent {'batch_size': 1, 'collection_uuid': '50de2cd2-06ba-442d-82c1-ae0d94b620e4', 'query_amount': 1, 'with_metadata_filter': 1, 'with_document_filter': 1, 'n_results': 2, 'include_metadatas': 1, 'include_documents': 1, 'include_distances': 1, 'chroma_version': '0.4.14', 'server_context': 'None', 'chroma_api_impl': 'chromadb.api.segment.SegmentAPI', 'is_persistent': False, 'chroma_server_ssl_enabled': False} bf9b885c-b86e-4194-97b4-d9701d293cce CollectionQueryEvent {'batch_size': 20, 'collection_uuid': '50de2cd2-06ba-442d-82c1-ae0d94b620e4', 'query_amount': 20, 'with_metadata_filter': 20, 'with_document_filter': 20, 'n_results': 40, 'include_metadatas': 20, 'include_documents': 20, 'include_distances': 20, 'chroma_version': '0.4.14', 'server_context': 'None', 'chroma_api_impl': 'chromadb.api.segment.SegmentAPI', 'is_persistent': False, 'chroma_server_ssl_enabled': False} bf9b885c-b86e-4194-97b4-d9701d293cce CollectionQueryEvent {'batch_size': 20, 'collection_uuid': '50de2cd2-06ba-442d-82c1-ae0d94b620e4', 'query_amount': 20, 'with_metadata_filter': 20, 'with_document_filter': 20, 'n_results': 40, 'include_metadatas': 20, 'include_documents': 20, 'include_distances': 20, 'chroma_version': '0.4.14', 'server_context': 'None', 'chroma_api_impl': 'chromadb.api.segment.SegmentAPI', 'is_persistent': False, 'chroma_server_ssl_enabled': False} >>> for i in range(100): ... collection.add(documents=[str(i)], ids=[str(i)]) ... bf9b885c-b86e-4194-97b4-d9701d293cce CollectionAddEvent {'batch_size': 100, 'collection_uuid': '50de2cd2-06ba-442d-82c1-ae0d94b620e4', 'add_amount': 100, 'with_documents': 100, 'with_metadata': 0, 'chroma_version': '0.4.14', 'server_context': 'None', 'chroma_api_impl': 'chromadb.api.segment.SegmentAPI', 'is_persistent': False, 'chroma_server_ssl_enabled': False} ``` Also confirmed that `collection.get()` spits out an event every time it's called. It also spits out a bunch of data so I'll elide it from this PR description. ## Documentation Changes *Are all docstrings for user-facing APIs updated if required? Do we need to make documentation changes in the [docs repository](https://github.com/chroma-core/docs)?* No docs change needed -- we're not collecting anything new or changing anything significant about how we collect. --- .gitignore | 3 ++ chromadb/api/segment.py | 20 +++++++------ chromadb/telemetry/events.py | 56 +++++++++++++++++++++++++---------- chromadb/telemetry/posthog.py | 2 +- 4 files changed, 56 insertions(+), 25 deletions(-) diff --git a/.gitignore b/.gitignore index fd4f8aa8a97..316c32cb664 100644 --- a/.gitignore +++ b/.gitignore @@ -14,7 +14,10 @@ index_data # Default configuration for persist_directory in chromadb/config.py # Currently it's located in "./chroma/" chroma/ +chroma_test_data +server.htpasswd +.venv venv .env .chroma diff --git a/chromadb/api/segment.py b/chromadb/api/segment.py index d23139759d9..85dca1d8532 100644 --- a/chromadb/api/segment.py +++ b/chromadb/api/segment.py @@ -408,13 +408,14 @@ def _get( if "documents" in include: documents = [_doc(m) for m in metadatas] + ids_amount = len(ids) if ids else 0 self._telemetry_client.capture( CollectionGetEvent( collection_uuid=str(collection_id), - ids_count=len(ids) if ids else 0, + ids_count=ids_amount, limit=limit if limit else 0, - include_metadata="metadatas" in include, - include_documents="documents" in include, + include_metadata=ids_amount if "metadatas" in include else 0, + include_documents=ids_amount if "documents" in include else 0, ) ) @@ -571,16 +572,17 @@ def _query( doc_list = [_doc(m) for m in metadata_list] documents.append(doc_list) # type: ignore + query_amount = len(query_embeddings) self._telemetry_client.capture( CollectionQueryEvent( collection_uuid=str(collection_id), - query_amount=len(query_embeddings), + query_amount=query_amount, n_results=n_results, - with_metadata_filter=where is not None, - with_document_filter=where_document is not None, - include_metadatas="metadatas" in include, - include_documents="documents" in include, - include_distances="distances" in include, + with_metadata_filter=query_amount if where is not None else 0, + with_document_filter=query_amount if where_document is not None else 0, + include_metadatas=query_amount if "metadatas" in include else 0, + include_documents=query_amount if "documents" in include else 0, + include_distances=query_amount if "distances" in include else 0, ) ) diff --git a/chromadb/telemetry/events.py b/chromadb/telemetry/events.py index 34c6264fcc9..e662cd85fa7 100644 --- a/chromadb/telemetry/events.py +++ b/chromadb/telemetry/events.py @@ -26,7 +26,8 @@ def __init__(self, collection_uuid: str, embedding_function: str): class CollectionAddEvent(TelemetryEvent): - max_batch_size: ClassVar[int] = 20 + max_batch_size: ClassVar[int] = 100 + batch_size: int collection_uuid: str add_amount: int with_documents: int @@ -89,25 +90,28 @@ def __init__( class CollectionQueryEvent(TelemetryEvent): + max_batch_size: ClassVar[int] = 20 + batch_size: int collection_uuid: str query_amount: int - with_metadata_filter: bool - with_document_filter: bool + with_metadata_filter: int + with_document_filter: int n_results: int - include_metadatas: bool - include_documents: bool - include_distances: bool + include_metadatas: int + include_documents: int + include_distances: int def __init__( self, collection_uuid: str, query_amount: int, - with_metadata_filter: bool, - with_document_filter: bool, + with_metadata_filter: int, + with_document_filter: int, n_results: int, - include_metadatas: bool, - include_documents: bool, - include_distances: bool, + include_metadatas: int, + include_documents: int, + include_distances: int, + batch_size: int = 1, ): super().__init__() self.collection_uuid = collection_uuid @@ -118,22 +122,44 @@ def __init__( self.include_metadatas = include_metadatas self.include_documents = include_documents self.include_distances = include_distances + self.batch_size = batch_size + + @property + def batch_key(self) -> str: + return self.collection_uuid + self.name + + def batch(self, other: "TelemetryEvent") -> "CollectionQueryEvent": + if not self.batch_key == other.batch_key: + raise ValueError("Cannot batch events") + other = cast(CollectionQueryEvent, other) + total_amount = self.query_amount + other.query_amount + return CollectionQueryEvent( + collection_uuid=self.collection_uuid, + query_amount=total_amount, + with_metadata_filter=self.with_metadata_filter + other.with_metadata_filter, + with_document_filter=self.with_document_filter + other.with_document_filter, + n_results=self.n_results + other.n_results, + include_metadatas=self.include_metadatas + other.include_metadatas, + include_documents=self.include_documents + other.include_documents, + include_distances=self.include_distances + other.include_distances, + batch_size=self.batch_size + other.batch_size, + ) class CollectionGetEvent(TelemetryEvent): collection_uuid: str ids_count: int limit: int - include_metadata: bool - include_documents: bool + include_metadata: int + include_documents: int def __init__( self, collection_uuid: str, ids_count: int, limit: int, - include_metadata: bool, - include_documents: bool, + include_metadata: int, + include_documents: int, ): super().__init__() self.collection_uuid = collection_uuid diff --git a/chromadb/telemetry/posthog.py b/chromadb/telemetry/posthog.py index 184904531ef..21676b9fbe7 100644 --- a/chromadb/telemetry/posthog.py +++ b/chromadb/telemetry/posthog.py @@ -49,7 +49,7 @@ def _direct_capture(self, event: TelemetryEvent) -> None: posthog.capture( self.user_id, event.name, - {**(event.properties), "chroma_context": self.context}, + {**event.properties, **self.context}, ) except Exception as e: logger.error(f"Failed to send telemetry event {event.name}: {e}")