From d036595fbaf95130abe22fc9f69761c1956cd0c4 Mon Sep 17 00:00:00 2001 From: Luke VanderHart Date: Sun, 16 Apr 2023 12:00:37 -0400 Subject: [PATCH 01/10] valiation to prevent dup ID inserts --- chromadb/api/local.py | 4 ++++ chromadb/api/types.py | 3 +++ chromadb/test/property/test_embeddings.py | 22 +++++++++++++++++----- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/chromadb/api/local.py b/chromadb/api/local.py index db275f344ee..f01b15e5dc8 100644 --- a/chromadb/api/local.py +++ b/chromadb/api/local.py @@ -126,6 +126,10 @@ def _add( increment_index: bool = True, ): + existing_ids = set(self._get(collection_name, ids=ids, include=[])["ids"]) + if len(existing_ids) > 0: + raise ValueError(f"IDs {existing_ids} already exist in collection {collection_name}") + collection_uuid = self._db.get_collection_uuid_from_name(collection_name) added_uuids = self._db.add( collection_uuid, diff --git a/chromadb/api/types.py b/chromadb/api/types.py index 39d50d723d4..8b4f5a477e9 100644 --- a/chromadb/api/types.py +++ b/chromadb/api/types.py @@ -84,6 +84,9 @@ def validate_ids(ids: IDs) -> IDs: for id in ids: if not isinstance(id, str): raise ValueError(f"Expected ID to be a str, got {id}") + if len(ids) != len(set(ids)): + dups = set([x for x in ids if ids.count(x) > 1]) + raise ValueError(f"Expected IDs to be unique, found duplicates for: {dups}") return ids diff --git a/chromadb/test/property/test_embeddings.py b/chromadb/test/property/test_embeddings.py index 6a503307d5c..1649f9e49b6 100644 --- a/chromadb/test/property/test_embeddings.py +++ b/chromadb/test/property/test_embeddings.py @@ -92,9 +92,14 @@ def add_embeddings(self, embedding_set): if len(self.embeddings["ids"]) > 0: trace("add_more_embeddings") - self.collection.add(**embedding_set) - self._add_embeddings(embedding_set) - return multiple(*embedding_set["ids"]) + if set(embedding_set["ids"]).intersection(set(self.embeddings["ids"])): + with pytest.raises(ValueError): + self.collection.add(**embedding_set) + return multiple() + else: + self.collection.add(**embedding_set) + self._add_embeddings(embedding_set) + return multiple(*embedding_set["ids"]) @precondition(lambda self: len(self.embeddings["ids"]) > 20) @rule(ids=st.lists(consumes(embedding_ids), min_size=1, max_size=20)) @@ -195,13 +200,20 @@ def test_multi_add(api): assert coll.count() == 1 - results = coll.query(query_embeddings=[[0.0]], n_results=2) - assert results["ids"] == [["a"]] + results = coll.get() + assert results["ids"] == ["a"] coll.delete(ids=["a"]) assert coll.count() == 0 +def test_dup_add(api): + api.reset() + coll = api.create_collection(name="foo") + with pytest.raises(ValueError): + coll.add(ids=["a", "a"], embeddings=[[0.0], [1.1]]) + + def test_escape_chars_in_ids(api): api.reset() id = "\x1f" From 1a460403ecf2551161681638ef2663b13abb5de0 Mon Sep 17 00:00:00 2001 From: Luke VanderHart Date: Sun, 16 Apr 2023 12:35:23 -0400 Subject: [PATCH 02/10] add JS validation & tests --- clients/js/src/index.ts | 8 ++++++++ clients/js/test/client.test.ts | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/clients/js/src/index.ts b/clients/js/src/index.ts index 4ec2a0dd973..fdaf35bd725 100644 --- a/clients/js/src/index.ts +++ b/clients/js/src/index.ts @@ -154,6 +154,14 @@ export class Collection { ); } + const uniqueIds = new Set(idsArray); + if (uniqueIds.size !== idsArray.length) { + const duplicateIds = idsArray.filter((item, index) => idsArray.indexOf(item) !== index); + throw new Error( + `Expected IDs to be unique, found duplicates for: ${duplicateIds}`, + ); + } + const response = await this.api.add({ collectionName: this.name, addEmbedding: { diff --git a/clients/js/test/client.test.ts b/clients/js/test/client.test.ts index b631eef3c35..00e9c402dba 100644 --- a/clients/js/test/client.test.ts +++ b/clients/js/test/client.test.ts @@ -196,4 +196,36 @@ test('wrong code returns an error', async () => { const results = await collection.get(undefined, { "test": { "$contains": "hello" } }); expect(results.error).toBeDefined() expect(results.error).toBe("ValueError('Expected one of $gt, $lt, $gte, $lte, $ne, $eq, got $contains')") +}) + +test('it should return an error when inserting duplicate IDs', async () => { + await chroma.reset() + const collection = await chroma.createCollection('test') + const ids = ['test1', 'test2', 'test3'] + const embeddings = [ + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + [10, 9, 8, 7, 6, 5, 4, 3, 2, 1] + ] + const metadatas = [{ test: 'test1' }, { test: 'test2' }, { test: 'test3' }] + await collection.add(ids, embeddings, metadatas) + const results = await collection.add(ids, embeddings, metadatas); + expect(results.error).toBeDefined() + expect(results.error).toContain("ValueError") +}) + +test('it should return an error when inserting duplicate IDs in the same batch', async () => { + await chroma.reset() + const collection = await chroma.createCollection('test') + const ids = ['test1', 'test2', 'test3', 'test1'] + const embeddings = [ + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], + [10, 9, 8, 7, 6, 5, 4, 3, 2, 1], + [10, 9, 8, 7, 6, 5, 4, 3, 2, 1] + ] + const metadatas = [{ test: 'test1' }, { test: 'test2' }, { test: 'test3' }, { test: 'test4' }] + const results = await collection.add(ids, embeddings, metadatas); + expect(results.error).toBeDefined() + expect(results.error).toContain("duplicate") }) \ No newline at end of file From 8671504dcb5dd4720df5786bcbd4cf59d7a0b8f3 Mon Sep 17 00:00:00 2001 From: Luke VanderHart Date: Mon, 17 Apr 2023 10:10:31 -0400 Subject: [PATCH 03/10] use unique IDs in unit tests --- chromadb/test/test_api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/chromadb/test/test_api.py b/chromadb/test/test_api.py index 8c010545711..ce5a460750a 100644 --- a/chromadb/test/test_api.py +++ b/chromadb/test/test_api.py @@ -212,7 +212,7 @@ def test_heartbeat(api_fixture, request): batch_records = { "embeddings": [[1.1, 2.3, 3.2], [1.2, 2.24, 3.2]], - "ids": ["https://example.com", "https://example.com"], + "ids": ["https://example.com/1", "https://example.com/2"], } @@ -251,7 +251,7 @@ def test_get_or_create(api_fixture, request): minimal_records = { "embeddings": [[1.1, 2.3, 3.2], [1.2, 2.24, 3.2]], - "ids": ["https://example.com", "https://example.com"], + "ids": ["https://example.com/1", "https://example.com/2"], } From c5b096e714b18d2c78b0bd02503999d8d051bd54 Mon Sep 17 00:00:00 2001 From: Luke VanderHart Date: Mon, 17 Apr 2023 10:53:44 -0400 Subject: [PATCH 04/10] fix js test to handle local validation --- clients/js/test/client.test.ts | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/clients/js/test/client.test.ts b/clients/js/test/client.test.ts index 00e9c402dba..722a4c5c654 100644 --- a/clients/js/test/client.test.ts +++ b/clients/js/test/client.test.ts @@ -214,7 +214,7 @@ test('it should return an error when inserting duplicate IDs', async () => { expect(results.error).toContain("ValueError") }) -test('it should return an error when inserting duplicate IDs in the same batch', async () => { +test('validation errors when inserting duplicate IDs in the same batch', async () => { await chroma.reset() const collection = await chroma.createCollection('test') const ids = ['test1', 'test2', 'test3', 'test1'] @@ -225,7 +225,9 @@ test('it should return an error when inserting duplicate IDs in the same batch', [10, 9, 8, 7, 6, 5, 4, 3, 2, 1] ] const metadatas = [{ test: 'test1' }, { test: 'test2' }, { test: 'test3' }, { test: 'test4' }] - const results = await collection.add(ids, embeddings, metadatas); - expect(results.error).toBeDefined() - expect(results.error).toContain("duplicate") + try { + await collection.add(ids, embeddings, metadatas); + } catch (e: any) { + expect(e.message).toMatch('duplicates') + } }) \ No newline at end of file From 25d451f9e70e98b9bc37dd4bcd97999420cf55c1 Mon Sep 17 00:00:00 2001 From: Luke VanderHart Date: Mon, 17 Apr 2023 13:39:19 -0400 Subject: [PATCH 05/10] ensure that documents are populated for updates --- chromadb/test/property/strategies.py | 21 +++++++++++---------- chromadb/test/property/test_embeddings.py | 5 ++++- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/chromadb/test/property/strategies.py b/chromadb/test/property/strategies.py index 55004f4bad9..53fa63623f9 100644 --- a/chromadb/test/property/strategies.py +++ b/chromadb/test/property/strategies.py @@ -1,6 +1,6 @@ import hypothesis import hypothesis.strategies as st -from typing import Optional, Sequence, TypedDict, cast +from typing import Optional, Sequence, TypedDict, Callable, List, cast import hypothesis.extra.numpy as npst import numpy as np import chromadb.api.types as types @@ -105,11 +105,11 @@ def create_embeddings(dim: int, count: int, dtype: np.dtype): ).astype(dtype) -def documents_strategy(count: int): +def documents_strategy(count: int) -> st.SearchStrategy[Optional[List[str]]]: # TODO: Handle non-unique documents # TODO: Handle empty string documents return st.one_of( - st.lists(st.text(min_size=1), min_size=count, max_size=count, unique=True), st.none() + st.none(), st.lists(st.text(min_size=1), min_size=count, max_size=count, unique=True) ) @@ -122,11 +122,8 @@ def metadata_strategy(): ) -def metadatas_strategy(count: int): - return st.one_of( - st.lists(metadata_strategy(), min_size=count, max_size=count), - st.none(), - ) +def metadatas_strategy(count: int) -> st.SearchStrategy[Optional[List[types.Metadata]]]: + return st.one_of(st.none(), st.lists(metadata_strategy(), min_size=count, max_size=count)) @st.composite @@ -136,6 +133,10 @@ def embedding_set( count_st: st.SearchStrategy[int] = st.integers(min_value=1, max_value=512), dtype_st: st.SearchStrategy[np.dtype] = st.sampled_from(float_types), id_st: st.SearchStrategy[str] = st.text(alphabet=legal_id_characters, min_size=1, max_size=64), + documents_st_fn: Callable[[int], st.SearchStrategy[Optional[List[str]]]] = documents_strategy, + metadatas_st_fn: Callable[ + [int], st.SearchStrategy[Optional[List[types.Metadata]]] + ] = metadatas_strategy, dimension: Optional[int] = None, count: Optional[int] = None, dtype: Optional[np.dtype] = None, @@ -157,8 +158,8 @@ def embedding_set( # TODO: Test documents only # TODO: Generative embedding function to guarantee unique embeddings for unique documents - documents = draw(documents_strategy(count)) - metadatas = draw(metadatas_strategy(count)) + documents = draw(documents_st_fn(count)) + metadatas = draw(metadatas_st_fn(count)) embeddings = create_embeddings(dimension, count, dtype) diff --git a/chromadb/test/property/test_embeddings.py b/chromadb/test/property/test_embeddings.py index 1649f9e49b6..4c4c813487f 100644 --- a/chromadb/test/property/test_embeddings.py +++ b/chromadb/test/property/test_embeddings.py @@ -67,7 +67,7 @@ class EmbeddingStateMachine(RuleBasedStateMachine): def __init__(self, api): super().__init__() - self.api = chromadb.Client(configurations()[0]) + self.api = api @initialize( collection=strategies.collections(), @@ -121,6 +121,9 @@ def delete_by_ids(self, ids): dimension_st=dimension_st, id_st=embedding_ids, count_st=st.integers(min_value=1, max_value=5), + documents_st_fn=lambda c: st.lists( + st.text(min_size=1), min_size=c, max_size=c, unique=True + ), ) ) def update_embeddings(self, embedding_set): From e31d240a2acf63f3d1b3773f0013a56383bdcc02 Mon Sep 17 00:00:00 2001 From: Luke VanderHart Date: Mon, 17 Apr 2023 15:54:03 -0700 Subject: [PATCH 06/10] Revert "fix js test to handle local validation" This reverts commit c5b096e714b18d2c78b0bd02503999d8d051bd54. We will handle TypeScript changes in a separate PR. --- clients/js/test/client.test.ts | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/clients/js/test/client.test.ts b/clients/js/test/client.test.ts index 722a4c5c654..00e9c402dba 100644 --- a/clients/js/test/client.test.ts +++ b/clients/js/test/client.test.ts @@ -214,7 +214,7 @@ test('it should return an error when inserting duplicate IDs', async () => { expect(results.error).toContain("ValueError") }) -test('validation errors when inserting duplicate IDs in the same batch', async () => { +test('it should return an error when inserting duplicate IDs in the same batch', async () => { await chroma.reset() const collection = await chroma.createCollection('test') const ids = ['test1', 'test2', 'test3', 'test1'] @@ -225,9 +225,7 @@ test('validation errors when inserting duplicate IDs in the same batch', async ( [10, 9, 8, 7, 6, 5, 4, 3, 2, 1] ] const metadatas = [{ test: 'test1' }, { test: 'test2' }, { test: 'test3' }, { test: 'test4' }] - try { - await collection.add(ids, embeddings, metadatas); - } catch (e: any) { - expect(e.message).toMatch('duplicates') - } + const results = await collection.add(ids, embeddings, metadatas); + expect(results.error).toBeDefined() + expect(results.error).toContain("duplicate") }) \ No newline at end of file From b13693382f5fe25f8a5d9869e20d3f6926683683 Mon Sep 17 00:00:00 2001 From: Luke VanderHart Date: Mon, 17 Apr 2023 15:54:28 -0700 Subject: [PATCH 07/10] Revert "add JS validation & tests" This reverts commit 1a460403ecf2551161681638ef2663b13abb5de0. We will handle TypeScript changes in a separate PR --- clients/js/src/index.ts | 8 -------- clients/js/test/client.test.ts | 32 -------------------------------- 2 files changed, 40 deletions(-) diff --git a/clients/js/src/index.ts b/clients/js/src/index.ts index fdaf35bd725..4ec2a0dd973 100644 --- a/clients/js/src/index.ts +++ b/clients/js/src/index.ts @@ -154,14 +154,6 @@ export class Collection { ); } - const uniqueIds = new Set(idsArray); - if (uniqueIds.size !== idsArray.length) { - const duplicateIds = idsArray.filter((item, index) => idsArray.indexOf(item) !== index); - throw new Error( - `Expected IDs to be unique, found duplicates for: ${duplicateIds}`, - ); - } - const response = await this.api.add({ collectionName: this.name, addEmbedding: { diff --git a/clients/js/test/client.test.ts b/clients/js/test/client.test.ts index 00e9c402dba..b631eef3c35 100644 --- a/clients/js/test/client.test.ts +++ b/clients/js/test/client.test.ts @@ -196,36 +196,4 @@ test('wrong code returns an error', async () => { const results = await collection.get(undefined, { "test": { "$contains": "hello" } }); expect(results.error).toBeDefined() expect(results.error).toBe("ValueError('Expected one of $gt, $lt, $gte, $lte, $ne, $eq, got $contains')") -}) - -test('it should return an error when inserting duplicate IDs', async () => { - await chroma.reset() - const collection = await chroma.createCollection('test') - const ids = ['test1', 'test2', 'test3'] - const embeddings = [ - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - [10, 9, 8, 7, 6, 5, 4, 3, 2, 1] - ] - const metadatas = [{ test: 'test1' }, { test: 'test2' }, { test: 'test3' }] - await collection.add(ids, embeddings, metadatas) - const results = await collection.add(ids, embeddings, metadatas); - expect(results.error).toBeDefined() - expect(results.error).toContain("ValueError") -}) - -test('it should return an error when inserting duplicate IDs in the same batch', async () => { - await chroma.reset() - const collection = await chroma.createCollection('test') - const ids = ['test1', 'test2', 'test3', 'test1'] - const embeddings = [ - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], - [10, 9, 8, 7, 6, 5, 4, 3, 2, 1], - [10, 9, 8, 7, 6, 5, 4, 3, 2, 1] - ] - const metadatas = [{ test: 'test1' }, { test: 'test2' }, { test: 'test3' }, { test: 'test4' }] - const results = await collection.add(ids, embeddings, metadatas); - expect(results.error).toBeDefined() - expect(results.error).toContain("duplicate") }) \ No newline at end of file From c43051a3b13fb8af4ac453af7102705895de48a9 Mon Sep 17 00:00:00 2001 From: Luke VanderHart Date: Mon, 17 Apr 2023 16:01:16 -0700 Subject: [PATCH 08/10] don't convert existing IDs to set --- chromadb/api/local.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chromadb/api/local.py b/chromadb/api/local.py index f01b15e5dc8..b00e773a740 100644 --- a/chromadb/api/local.py +++ b/chromadb/api/local.py @@ -126,7 +126,7 @@ def _add( increment_index: bool = True, ): - existing_ids = set(self._get(collection_name, ids=ids, include=[])["ids"]) + existing_ids = self._get(collection_name, ids=ids, include=[])["ids"] if len(existing_ids) > 0: raise ValueError(f"IDs {existing_ids} already exist in collection {collection_name}") From d6fd0c63eed1f7d66ffee7dd565bb4c734e7e6d0 Mon Sep 17 00:00:00 2001 From: Luke VanderHart Date: Mon, 17 Apr 2023 16:06:44 -0700 Subject: [PATCH 09/10] use and check for specific error types --- chromadb/api/local.py | 6 ++++-- chromadb/api/types.py | 3 ++- chromadb/errors.py | 8 ++++++++ chromadb/test/property/test_embeddings.py | 7 ++++--- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/chromadb/api/local.py b/chromadb/api/local.py index b00e773a740..7699b42ed1b 100644 --- a/chromadb/api/local.py +++ b/chromadb/api/local.py @@ -3,7 +3,7 @@ from typing import Dict, List, Optional, Sequence, Callable, cast from chromadb import __version__ - +import chromadb.errors as errors from chromadb.api import API from chromadb.db import DB from chromadb.api.types import ( @@ -128,7 +128,9 @@ def _add( existing_ids = self._get(collection_name, ids=ids, include=[])["ids"] if len(existing_ids) > 0: - raise ValueError(f"IDs {existing_ids} already exist in collection {collection_name}") + raise errors.IDAlreadyExistsError( + f"IDs {existing_ids} already exist in collection {collection_name}" + ) collection_uuid = self._db.get_collection_uuid_from_name(collection_name) added_uuids = self._db.add( diff --git a/chromadb/api/types.py b/chromadb/api/types.py index 8b4f5a477e9..6c0ea632766 100644 --- a/chromadb/api/types.py +++ b/chromadb/api/types.py @@ -1,4 +1,5 @@ from typing import Literal, Optional, Union, Dict, Sequence, TypedDict, Protocol, TypeVar, List +import chromadb.errors as errors ID = str IDs = List[ID] @@ -86,7 +87,7 @@ def validate_ids(ids: IDs) -> IDs: raise ValueError(f"Expected ID to be a str, got {id}") if len(ids) != len(set(ids)): dups = set([x for x in ids if ids.count(x) > 1]) - raise ValueError(f"Expected IDs to be unique, found duplicates for: {dups}") + raise errors.DuplicateIDError(f"Expected IDs to be unique, found duplicates for: {dups}") return ids diff --git a/chromadb/errors.py b/chromadb/errors.py index 0a75c43f5b2..e12e6ff4541 100644 --- a/chromadb/errors.py +++ b/chromadb/errors.py @@ -12,3 +12,11 @@ class InvalidDimensionException(Exception): class NotEnoughElementsException(Exception): pass + + +class IDAlreadyExistsError(ValueError): + pass + + +class DuplicateIDError(ValueError): + pass diff --git a/chromadb/test/property/test_embeddings.py b/chromadb/test/property/test_embeddings.py index 4c4c813487f..13cfdf520ab 100644 --- a/chromadb/test/property/test_embeddings.py +++ b/chromadb/test/property/test_embeddings.py @@ -4,6 +4,7 @@ import hypothesis.strategies as st from typing import List, Set, TypedDict, Sequence import chromadb +import chromadb.errors as errors from chromadb.api import API from chromadb.api.models.Collection import Collection from chromadb.test.configurations import configurations @@ -93,7 +94,7 @@ def add_embeddings(self, embedding_set): trace("add_more_embeddings") if set(embedding_set["ids"]).intersection(set(self.embeddings["ids"])): - with pytest.raises(ValueError): + with pytest.raises(errors.IDAlreadyExistsError): self.collection.add(**embedding_set) return multiple() else: @@ -198,7 +199,7 @@ def test_multi_add(api): coll.add(ids=["a"], embeddings=[[0.0]]) assert coll.count() == 1 - with pytest.raises(ValueError): + with pytest.raises(errors.IDAlreadyExistsError): coll.add(ids=["a"], embeddings=[[0.0]]) assert coll.count() == 1 @@ -213,7 +214,7 @@ def test_multi_add(api): def test_dup_add(api): api.reset() coll = api.create_collection(name="foo") - with pytest.raises(ValueError): + with pytest.raises(errors.DuplicateIDError): coll.add(ids=["a", "a"], embeddings=[[0.0], [1.1]]) From 693a53a06b88a79eabe627039a859df53f294a9e Mon Sep 17 00:00:00 2001 From: Luke VanderHart Date: Mon, 17 Apr 2023 16:17:55 -0700 Subject: [PATCH 10/10] avoid operator overloading --- chromadb/test/property/test_embeddings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/chromadb/test/property/test_embeddings.py b/chromadb/test/property/test_embeddings.py index 13cfdf520ab..e2d3676c44b 100644 --- a/chromadb/test/property/test_embeddings.py +++ b/chromadb/test/property/test_embeddings.py @@ -161,8 +161,8 @@ def _add_embeddings(self, embeddings: strategies.EmbeddingSet): else: documents = [None] * len(embeddings["ids"]) - self.embeddings["metadatas"] += metadatas # type: ignore - self.embeddings["documents"] += documents # type: ignore + self.embeddings["metadatas"].extend(metadatas) # type: ignore + self.embeddings["documents"].extend(documents) # type: ignore def _remove_embeddings(self, indices_to_remove: Set[int]):