From 4043b24410279d39a2f413a7bc3fa7944088b326 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Thu, 13 Oct 2022 16:18:23 +0200 Subject: [PATCH 01/23] add new marker --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 6681b0586c..2f883c507b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -347,6 +347,7 @@ markers = [ "milvus: requires a Milvus 2 setup", "milvus1: requires a Milvus 1 container", "opensearch", + "document_store", ] log_cli = true From e5bf12b9cfe23cca93dbf36e2e7714b4ed433a1a Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Thu, 13 Oct 2022 16:18:47 +0200 Subject: [PATCH 02/23] start using test hierarchies --- test/document_stores/test_base.py | 72 ++++++++++++++++++++ test/document_stores/test_elasticsearch.py | 20 ++++++ test/document_stores/test_opensearch.py | 77 +++------------------- 3 files changed, 100 insertions(+), 69 deletions(-) create mode 100644 test/document_stores/test_base.py create mode 100644 test/document_stores/test_elasticsearch.py diff --git a/test/document_stores/test_base.py b/test/document_stores/test_base.py new file mode 100644 index 0000000000..1eeda12d13 --- /dev/null +++ b/test/document_stores/test_base.py @@ -0,0 +1,72 @@ +import pytest +import numpy as np + +from haystack.schema import Document, Label, Answer + + +@pytest.mark.document_store +class DocumentStoreTest: + """ + This is the base class for any Document Store testsuite, it doesn't have the `Test` prefix in the name + because we want to run its methods only in subclasses. 
+ """ + + @pytest.fixture + def documents(self): + documents = [] + for i in range(3): + documents.append( + Document( + content=f"A Foo Document {i}", + meta={"name": f"name_{i}", "year": "2020", "month": "01"}, + embedding=np.random.rand(768).astype(np.float32), + ) + ) + + documents.append( + Document( + content=f"A Bar Document {i}", + meta={"name": f"name_{i}", "year": "2021", "month": "02"}, + embedding=np.random.rand(768).astype(np.float32), + ) + ) + + documents.append( + Document( + content=f"Document {i} without embeddings", + meta={"name": f"name_{i}", "no_embedding": True, "month": "03"}, + ) + ) + + return documents + + @pytest.fixture + def labels(self, documents): + labels = [] + for i, d in enumerate(documents): + labels.append( + Label( + query="query", + document=d, + is_correct_document=True, + is_correct_answer=False, + # create a mix set of labels + origin="user-feedback" if i % 2 else "gold-label", + answer=None if not i else Answer(f"the answer is {i}"), + ) + ) + return labels + + @pytest.mark.integration + def test_write_documents(self, ds, documents): + ds.write_documents(documents) + docs = ds.get_all_documents() + assert len(docs) == len(documents) + for i, doc in enumerate(docs): + expected = documents[i] + assert doc.id == expected.id + + @pytest.mark.integration + def test_write_labels(self, ds, labels): + ds.write_labels(labels) + assert ds.get_all_labels() == labels diff --git a/test/document_stores/test_elasticsearch.py b/test/document_stores/test_elasticsearch.py new file mode 100644 index 0000000000..300da733c8 --- /dev/null +++ b/test/document_stores/test_elasticsearch.py @@ -0,0 +1,20 @@ +import pytest +from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore +from .test_base import DocumentStoreTest + + +class TestElasticsearchDocumentStore(DocumentStoreTest): + # Constants + + index_name = __name__ + + @pytest.fixture + def ds(self): + """ + This fixture provides a working document store and takes care of 
removing the indices when done + """ + labels_index_name = f"{self.index_name}_labels" + ds = ElasticsearchDocumentStore(index=self.index_name, label_index=labels_index_name, create_index=True) + yield ds + ds.delete_index(self.index_name) + ds.delete_index(labels_index_name) diff --git a/test/document_stores/test_opensearch.py b/test/document_stores/test_opensearch.py index 45841d10c5..4fdf8d53ca 100644 --- a/test/document_stores/test_opensearch.py +++ b/test/document_stores/test_opensearch.py @@ -19,15 +19,15 @@ from haystack.schema import Document, Label, Answer from haystack.errors import DocumentStoreError -# Being all the tests in this module, ideally we wouldn't need a marker here, -# but this is to allow this test suite to be skipped when running (e.g.) -# `pytest test/document_stores --document-store-type=faiss` -class TestOpenSearchDocumentStore: +from .test_base import DocumentStoreTest + + +class TestOpenSearchDocumentStore(DocumentStoreTest): # Constants query_emb = np.random.random_sample(size=(2, 2)) - index_name = "myindex" + index_name = __name__ # Fixtures @@ -36,11 +36,10 @@ def ds(self): """ This fixture provides a working document store and takes care of removing the indices when done """ - index_name = __name__ - labels_index_name = f"{index_name}_labels" - ds = OpenSearchDocumentStore(index=index_name, label_index=labels_index_name, port=9201, create_index=True) + labels_index_name = f"{self.index_name}_labels" + ds = OpenSearchDocumentStore(index=self.index_name, label_index=labels_index_name, port=9201, create_index=True) yield ds - ds.delete_index(index_name) + ds.delete_index(self.index_name) ds.delete_index(labels_index_name) @pytest.fixture @@ -82,35 +81,6 @@ def _init_client_params(self): "use_system_proxy": True, } - @pytest.fixture - def documents(self): - documents = [] - for i in range(3): - documents.append( - Document( - content=f"A Foo Document {i}", - meta={"name": f"name_{i}", "year": "2020", "month": "01"}, - 
embedding=np.random.rand(768).astype(np.float32), - ) - ) - - documents.append( - Document( - content=f"A Bar Document {i}", - meta={"name": f"name_{i}", "year": "2021", "month": "02"}, - embedding=np.random.rand(768).astype(np.float32), - ) - ) - - documents.append( - Document( - content=f"Document {i} without embeddings", - meta={"name": f"name_{i}", "no_embedding": True, "month": "03"}, - ) - ) - - return documents - @pytest.fixture def index(self): return { @@ -143,23 +113,6 @@ def index(self): }, } - @pytest.fixture - def labels(self, documents): - labels = [] - for i, d in enumerate(documents): - labels.append( - Label( - query="query", - document=d, - is_correct_document=True, - is_correct_answer=False, - # create a mix set of labels - origin="user-feedback" if i % 2 else "gold-label", - answer=None if not i else Answer(f"the answer is {i}"), - ) - ) - return labels - # Integration tests @pytest.mark.integration @@ -170,20 +123,6 @@ def test___init__(self): def test___init___faiss(self): OpenSearchDocumentStore(index="faiss_index", port=9201, create_index=True, knn_engine="faiss") - @pytest.mark.integration - def test_write_documents(self, ds, documents): - ds.write_documents(documents) - docs = ds.get_all_documents() - assert len(docs) == len(documents) - for i, doc in enumerate(docs): - expected = documents[i] - assert doc.id == expected.id - - @pytest.mark.integration - def test_write_labels(self, ds, labels): - ds.write_labels(labels) - assert ds.get_all_labels() == labels - @pytest.mark.integration def test_recreate_index(self, ds, documents, labels): ds.write_documents(documents) From 9a04f4264394888a811f99b9494bc7c3f57e8d70 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Mon, 17 Oct 2022 16:09:47 +0200 Subject: [PATCH 03/23] move ES tests into their own class --- test/document_stores/test_base.py | 23 +- test/document_stores/test_document_store.py | 335 -------------------- test/document_stores/test_elasticsearch.py | 203 +++++++++++- 
test/document_stores/test_opensearch.py | 5 +- test/document_stores/test_search_engine.py | 43 +++ 5 files changed, 266 insertions(+), 343 deletions(-) create mode 100644 test/document_stores/test_search_engine.py diff --git a/test/document_stores/test_base.py b/test/document_stores/test_base.py index 1eeda12d13..98a89668a5 100644 --- a/test/document_stores/test_base.py +++ b/test/document_stores/test_base.py @@ -5,10 +5,11 @@ @pytest.mark.document_store -class DocumentStoreTest: +class DocumentStoreBaseTestAbstract: """ - This is the base class for any Document Store testsuite, it doesn't have the `Test` prefix in the name - because we want to run its methods only in subclasses. + This is a base class to test abstract methods from DocumentStoreBase to be inherited by any Document Store + testsuite. It doesn't have the `Test` prefix in the name so that its methods won't be collected for this + class but only for its subclasses. """ @pytest.fixture @@ -18,7 +19,7 @@ def documents(self): documents.append( Document( content=f"A Foo Document {i}", - meta={"name": f"name_{i}", "year": "2020", "month": "01"}, + meta={"name": f"name_{i}", "year": "2020", "month": "01", "numbers": [2, 4]}, embedding=np.random.rand(768).astype(np.float32), ) ) @@ -70,3 +71,17 @@ def test_write_documents(self, ds, documents): def test_write_labels(self, ds, labels): ds.write_labels(labels) assert ds.get_all_labels() == labels + + # get_all_documents_generator + # get_all_labels + # get_document_by_id + # get_document_count + # query_by_embedding + # get_label_count + # write_labels + # delete_documents + # delete_labels + # delete_index + # _create_document_field_map + # get_documents_by_id + # update_document_meta diff --git a/test/document_stores/test_document_store.py b/test/document_stores/test_document_store.py index 8734f9dcd8..f37d5ff0ae 100644 --- a/test/document_stores/test_document_store.py +++ b/test/document_stores/test_document_store.py @@ -77,84 +77,6 @@ ] 
-@pytest.mark.elasticsearch -def test_init_elastic_client(): - # defaults - _ = ElasticsearchDocumentStore() - - # list of hosts + single port - _ = ElasticsearchDocumentStore(host=["localhost", "127.0.0.1"], port=9200) - - # list of hosts + list of ports (wrong) - with pytest.raises(Exception): - _ = ElasticsearchDocumentStore(host=["localhost", "127.0.0.1"], port=[9200]) - - # list of hosts + list - _ = ElasticsearchDocumentStore(host=["localhost", "127.0.0.1"], port=[9200, 9200]) - - # only api_key - with pytest.raises(Exception): - _ = ElasticsearchDocumentStore(host=["localhost"], port=[9200], api_key="test") - - # api_key + id - _ = ElasticsearchDocumentStore(host=["localhost"], port=[9200], api_key="test", api_key_id="test") - - -@pytest.mark.elasticsearch -def test_init_elastic_doc_store_with_index_recreation(): - index_name = "test_index_recreation" - label_index_name = "test_index_recreation_labels" - - document_store = ElasticsearchDocumentStore(index=index_name, label_index=label_index_name) - documents = [Document(content="Doc1")] - labels = [ - Label( - query="query", - document=documents[0], - is_correct_document=True, - is_correct_answer=False, - origin="user-feedback", - answer=None, - ) - ] - document_store.write_documents(documents, index=index_name) - document_store.write_labels(labels, index=label_index_name) - - document_store = ElasticsearchDocumentStore(index=index_name, label_index=label_index_name, recreate_index=True) - docs = document_store.get_all_documents(index=index_name) - labels = document_store.get_all_labels(index=label_index_name) - - assert len(docs) == 0 - assert len(labels) == 0 - - -@pytest.mark.elasticsearch -def test_elasticsearch_eq_filter(): - documents = [ - {"content": "some text", "id": "1", "keyword_field": ["x", "y", "z"], "number_field": [1, 2, 3, 4]}, - {"content": "some text", "id": "2", "keyword_field": ["x", "y", "w"], "number_field": [1, 2, 3]}, - {"content": "some text", "id": "3", "keyword_field": ["x", 
"z"], "number_field": [2, 4]}, - {"content": "some text", "id": "4", "keyword_field": ["z", "x"], "number_field": [5, 6]}, - {"content": "some text", "id": "5", "keyword_field": ["x", "y"], "number_field": [2, 3]}, - ] - - index = "test_elasticsearch_eq_filter" - document_store = ElasticsearchDocumentStore(index=index, recreate_index=True) - document_store.write_documents(documents) - - filter = {"keyword_field": {"$eq": ["z", "x"]}} - filtered_docs = document_store.get_all_documents(index=index, filters=filter) - assert len(filtered_docs) == 2 - for doc in filtered_docs: - assert set(doc.meta["keyword_field"]) == {"x", "z"} - - filter = {"number_field": {"$eq": [2, 3]}} - filtered_docs = document_store.query(query=None, index=index, filters=filter) - assert len(filtered_docs) == 1 - assert filtered_docs[0].meta["number_field"] == [2, 3] - assert filtered_docs[0].id == "5" - - def test_write_with_duplicate_doc_ids(document_store: BaseDocumentStore): duplicate_documents = [ Document(content="Doc1", id_hash_keys=["content"]), @@ -1295,164 +1217,6 @@ def test_get_meta_values_by_key(document_store: BaseDocumentStore): assert bucket["count"] == 1 -@pytest.mark.elasticsearch -def test_elasticsearch_custom_fields(): - document_store = ElasticsearchDocumentStore( - index="haystack_test_custom", - content_field="custom_text_field", - embedding_field="custom_embedding_field", - recreate_index=True, - ) - - doc_to_write = {"custom_text_field": "test", "custom_embedding_field": np.random.rand(768).astype(np.float32)} - document_store.write_documents([doc_to_write]) - documents = document_store.get_all_documents(return_embedding=True) - assert len(documents) == 1 - assert documents[0].content == "test" - np.testing.assert_array_equal(doc_to_write["custom_embedding_field"], documents[0].embedding) - - -@pytest.mark.elasticsearch -def test_elasticsearch_delete_index(): - client = Elasticsearch() - index_name = "haystack_test_deletion" - - document_store = 
ElasticsearchDocumentStore(index=index_name) - - # the index should exist - index_exists = client.indices.exists(index=index_name) - assert index_exists - - document_store.delete_index(index_name) - - # the index was deleted and should not exist - index_exists = client.indices.exists(index=index_name) - assert not index_exists - - -@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True) -def test_elasticsearch_query_with_filters_and_missing_embeddings(document_store: ElasticsearchDocumentStore): - document_store.write_documents(DOCUMENTS) - document_without_embedding = Document( - content="Doc without embedding", meta={"name": "name_7", "year": "2021", "month": "04"} - ) - document_store.write_documents([document_without_embedding]) - filters = {"year": "2021"} - document_store.skip_missing_embeddings = False - with pytest.raises(RequestError): - document_store.query_by_embedding(np.random.rand(768), filters=filters) - - document_store.skip_missing_embeddings = True - documents = document_store.query_by_embedding(np.random.rand(768), filters=filters) - assert len(documents) == 3 - - -@pytest.mark.elasticsearch -def test_get_document_count_only_documents_without_embedding_arg(): - documents = [ - { - "content": "text1", - "id": "1", - "embedding": np.random.rand(768).astype(np.float32), - "meta_field_for_count": "a", - }, - { - "content": "text2", - "id": "2", - "embedding": np.random.rand(768).astype(np.float64), - "meta_field_for_count": "b", - }, - {"content": "text3", "id": "3", "embedding": np.random.rand(768).astype(np.float32).tolist()}, - {"content": "text4", "id": "4", "meta_field_for_count": "b"}, - {"content": "text5", "id": "5", "meta_field_for_count": "b"}, - {"content": "text6", "id": "6", "meta_field_for_count": "c"}, - { - "content": "text7", - "id": "7", - "embedding": np.random.rand(768).astype(np.float64), - "meta_field_for_count": "c", - }, - ] - - _index: str = "haystack_test_count" - document_store = 
ElasticsearchDocumentStore(index=_index, recreate_index=True) - - document_store.write_documents(documents) - - assert document_store.get_document_count() == 7 - assert document_store.get_document_count(only_documents_without_embedding=True) == 3 - assert ( - document_store.get_document_count( - only_documents_without_embedding=True, filters={"meta_field_for_count": ["c"]} - ) - == 1 - ) - assert ( - document_store.get_document_count( - only_documents_without_embedding=True, filters={"meta_field_for_count": ["b"]} - ) - == 2 - ) - - -@pytest.mark.elasticsearch -def test_skip_missing_embeddings(caplog): - documents = [ - {"content": "text1", "id": "1"}, # a document without embeddings - {"content": "text2", "id": "2", "embedding": np.random.rand(768).astype(np.float64)}, - {"content": "text3", "id": "3", "embedding": np.random.rand(768).astype(np.float32).tolist()}, - {"content": "text4", "id": "4", "embedding": np.random.rand(768).astype(np.float32)}, - ] - document_store = ElasticsearchDocumentStore(index="skip_missing_embedding_index", recreate_index=True) - document_store.write_documents(documents) - - document_store.skip_missing_embeddings = True - retrieved_docs = document_store.query_by_embedding(np.random.rand(768).astype(np.float32)) - assert len(retrieved_docs) == 3 - - document_store.skip_missing_embeddings = False - with pytest.raises(RequestError): - document_store.query_by_embedding(np.random.rand(768).astype(np.float32)) - - # Test scenario with no embeddings for the entire index - documents = [ - {"content": "text1", "id": "1"}, - {"content": "text2", "id": "2"}, - {"content": "text3", "id": "3"}, - {"content": "text4", "id": "4"}, - ] - - document_store.delete_documents() - document_store.write_documents(documents) - - document_store.skip_missing_embeddings = True - with caplog.at_level(logging.WARNING): - document_store.query_by_embedding(np.random.rand(768).astype(np.float32)) - assert "No documents with embeddings. 
Run the document store's update_embeddings() method." in caplog.text - - -@pytest.mark.elasticsearch -def test_elasticsearch_synonyms(): - synonyms = ["i-pod, i pod, ipod", "sea biscuit, sea biscit, seabiscuit", "foo, foo bar, baz"] - synonym_type = "synonym_graph" - - client = Elasticsearch() - client.indices.delete(index="haystack_synonym_arg", ignore=[404]) - document_store = ElasticsearchDocumentStore( - index="haystack_synonym_arg", synonyms=synonyms, synonym_type=synonym_type - ) - indexed_settings = client.indices.get_settings(index="haystack_synonym_arg") - - assert ( - synonym_type - == indexed_settings["haystack_synonym_arg"]["settings"]["index"]["analysis"]["filter"]["synonym"]["type"] - ) - assert ( - synonyms - == indexed_settings["haystack_synonym_arg"]["settings"]["index"]["analysis"]["filter"]["synonym"]["synonyms"] - ) - - @pytest.mark.parametrize( "document_store_with_docs", ["memory", "faiss", "milvus1", "weaviate", "elasticsearch"], indirect=True ) @@ -1995,105 +1759,6 @@ def test_DeepsetCloudDocumentStore_query_without_index(): assert document_store.query(query="some query") == [] -@pytest.mark.elasticsearch -def test_elasticsearch_search_field_mapping(): - - client = Elasticsearch() - client.indices.delete(index="haystack_search_field_mapping", ignore=[404]) - - index_data = [ - { - "title": "Green tea components", - "meta": { - "content": "The green tea plant contains a range of healthy compounds that make it into the final drink", - "sub_content": "Drink tip", - }, - "id": "1", - }, - { - "title": "Green tea catechin", - "meta": { - "content": "Green tea contains a catechin called epigallocatechin-3-gallate (EGCG).", - "sub_content": "Ingredients tip", - }, - "id": "2", - }, - { - "title": "Minerals in Green tea", - "meta": { - "content": "Green tea also has small amounts of minerals that can benefit your health.", - "sub_content": "Minerals tip", - }, - "id": "3", - }, - { - "title": "Green tea Benefits", - "meta": { - "content": "Green tea 
does more than just keep you alert, it may also help boost brain function.", - "sub_content": "Health tip", - }, - "id": "4", - }, - ] - - document_store = ElasticsearchDocumentStore( - index="haystack_search_field_mapping", search_fields=["content", "sub_content"], content_field="title" - ) - document_store.write_documents(index_data) - - indexed_settings = client.indices.get_mapping(index="haystack_search_field_mapping") - - assert indexed_settings["haystack_search_field_mapping"]["mappings"]["properties"]["content"]["type"] == "text" - assert indexed_settings["haystack_search_field_mapping"]["mappings"]["properties"]["sub_content"]["type"] == "text" - - -@pytest.mark.elasticsearch -def test_elasticsearch_existing_alias(): - - client = Elasticsearch() - client.indices.delete(index="haystack_existing_alias_1", ignore=[404]) - client.indices.delete(index="haystack_existing_alias_2", ignore=[404]) - client.indices.delete_alias(index="_all", name="haystack_existing_alias", ignore=[404]) - - settings = {"mappings": {"properties": {"content": {"type": "text"}}}} - - client.indices.create(index="haystack_existing_alias_1", body=settings) - client.indices.create(index="haystack_existing_alias_2", body=settings) - - client.indices.put_alias( - index="haystack_existing_alias_1,haystack_existing_alias_2", name="haystack_existing_alias" - ) - - # To be valid, all indices related to the alias must have content field of type text - _ = ElasticsearchDocumentStore(index="haystack_existing_alias", search_fields=["content"]) - - -@pytest.mark.elasticsearch -def test_elasticsearch_existing_alias_missing_fields(): - - client = Elasticsearch() - client.indices.delete(index="haystack_existing_alias_1", ignore=[404]) - client.indices.delete(index="haystack_existing_alias_2", ignore=[404]) - client.indices.delete_alias(index="_all", name="haystack_existing_alias", ignore=[404]) - - right_settings = {"mappings": {"properties": {"content": {"type": "text"}}}} - - wrong_settings = 
{"mappings": {"properties": {"content": {"type": "histogram"}}}} - - client.indices.create(index="haystack_existing_alias_1", body=right_settings) - client.indices.create(index="haystack_existing_alias_2", body=wrong_settings) - - client.indices.put_alias( - index="haystack_existing_alias_1,haystack_existing_alias_2", name="haystack_existing_alias" - ) - - with pytest.raises(Exception): - # wrong field type for "content" in index "haystack_existing_alias_2" - _ = ElasticsearchDocumentStore( - index="haystack_existing_alias", search_fields=["content"], content_field="title" - ) - - @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) def test_elasticsearch_brownfield_support(document_store_with_docs): new_document_store = InMemoryDocumentStore() diff --git a/test/document_stores/test_elasticsearch.py b/test/document_stores/test_elasticsearch.py index 300da733c8..d384754cef 100644 --- a/test/document_stores/test_elasticsearch.py +++ b/test/document_stores/test_elasticsearch.py @@ -1,9 +1,15 @@ import pytest + +import numpy as np + +from haystack.schema import Document from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore -from .test_base import DocumentStoreTest + +from .test_base import DocumentStoreBaseTestAbstract +from .test_search_engine import SearchEngineDocumentStoreTestAbstract -class TestElasticsearchDocumentStore(DocumentStoreTest): +class TestElasticsearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDocumentStoreTestAbstract): # Constants index_name = __name__ @@ -18,3 +24,196 @@ def ds(self): yield ds ds.delete_index(self.index_name) ds.delete_index(labels_index_name) + + @pytest.mark.integration + def test___init__(self): + # defaults + _ = ElasticsearchDocumentStore() + + # list of hosts + single port + _ = ElasticsearchDocumentStore(host=["localhost", "127.0.0.1"], port=9200) + + # list of hosts + list of ports (wrong) + with pytest.raises(Exception): + _ = 
ElasticsearchDocumentStore(host=["localhost", "127.0.0.1"], port=[9200]) + + # list of hosts + list + _ = ElasticsearchDocumentStore(host=["localhost", "127.0.0.1"], port=[9200, 9200]) + + # only api_key + with pytest.raises(Exception): + _ = ElasticsearchDocumentStore(host=["localhost"], port=[9200], api_key="test") + + # api_key + id + _ = ElasticsearchDocumentStore(host=["localhost"], port=[9200], api_key="test", api_key_id="test") + + @pytest.mark.integration + def test_recreate_index(self, ds, documents, labels): + ds.write_documents(documents) + ds.write_labels(labels) + + # Create another document store on top of the previous one + ds = ElasticsearchDocumentStore(index=ds.index, label_index=ds.label_index, recreate_index=True) + assert len(ds.get_all_documents(index=ds.index)) == 0 + assert len(ds.get_all_labels(index=ds.label_index)) == 0 + + @pytest.mark.integration + def test_eq_filter(self, ds, documents): + ds.write_documents(documents) + + filter = {"name": {"$eq": ["name_0"]}} + filtered_docs = ds.get_all_documents(filters=filter) + assert len(filtered_docs) == 3 + for doc in filtered_docs: + assert doc.meta["name"] == "name_0" + + filter = {"numbers": {"$eq": [2, 4]}} + filtered_docs = ds.query(query=None, filters=filter) + assert len(filtered_docs) == 3 + for doc in filtered_docs: + assert doc.meta["month"] == "01" + assert doc.meta["numbers"] == [2, 4] + + @pytest.mark.integration + def test_custom_fields(self, ds): + index = "haystack_test_custom" + document_store = ElasticsearchDocumentStore( + index=index, + content_field="custom_text_field", + embedding_field="custom_embedding_field", + recreate_index=True, + ) + doc_to_write = {"custom_text_field": "test", "custom_embedding_field": np.random.rand(768).astype(np.float32)} + document_store.write_documents([doc_to_write]) + documents = document_store.get_all_documents(return_embedding=True) + assert len(documents) == 1 + assert documents[0].content == "test" + 
np.testing.assert_array_equal(doc_to_write["custom_embedding_field"], documents[0].embedding) + document_store.delete_index(index) + + @pytest.mark.integration + def test_query_with_filters_and_missing_embeddings(self, ds, documents): + ds.write_documents(documents) + filters = {"month": {"$in": ["01", "03"]}} + ds.skip_missing_embeddings = False + with pytest.raises(ds._RequestError): + ds.query_by_embedding(np.random.rand(768), filters=filters) + + ds.skip_missing_embeddings = True + documents = ds.query_by_embedding(np.random.rand(768), filters=filters) + assert len(documents) == 3 + + @pytest.mark.integration + def test_synonyms(self, ds): + synonyms = ["i-pod, i pod, ipod", "sea biscuit, sea biscit, seabiscuit", "foo, foo bar, baz"] + synonym_type = "synonym_graph" + + client = ds.client + index = "haystack_synonym_arg" + client.indices.delete(index=index, ignore=[404]) + ElasticsearchDocumentStore(index=index, synonyms=synonyms, synonym_type=synonym_type) + indexed_settings = client.indices.get_settings(index=index) + + assert synonym_type == indexed_settings[index]["settings"]["index"]["analysis"]["filter"]["synonym"]["type"] + assert synonyms == indexed_settings[index]["settings"]["index"]["analysis"]["filter"]["synonym"]["synonyms"] + + @pytest.mark.integration + def test_search_field_mapping(self): + index = "haystack_search_field_mapping" + document_store = ElasticsearchDocumentStore( + index=index, search_fields=["content", "sub_content"], content_field="title" + ) + + document_store.write_documents( + [ + { + "title": "Green tea components", + "meta": { + "content": "The green tea plant contains a range of healthy compounds that make it into the final drink", + "sub_content": "Drink tip", + }, + "id": "1", + }, + { + "title": "Green tea catechin", + "meta": { + "content": "Green tea contains a catechin called epigallocatechin-3-gallate (EGCG).", + "sub_content": "Ingredients tip", + }, + "id": "2", + }, + { + "title": "Minerals in Green tea", + "meta": 
{ + "content": "Green tea also has small amounts of minerals that can benefit your health.", + "sub_content": "Minerals tip", + }, + "id": "3", + }, + { + "title": "Green tea Benefits", + "meta": { + "content": "Green tea does more than just keep you alert, it may also help boost brain function.", + "sub_content": "Health tip", + }, + "id": "4", + }, + ] + ) + + indexed_settings = document_store.client.indices.get_mapping(index=index) + + assert indexed_settings[index]["mappings"]["properties"]["content"]["type"] == "text" + assert indexed_settings[index]["mappings"]["properties"]["sub_content"]["type"] == "text" + document_store.delete_index(index) + + @pytest.mark.integration + def test_existing_alias(self, ds): + client = ds.client + client.indices.delete(index="haystack_existing_alias_1", ignore=[404]) + client.indices.delete(index="haystack_existing_alias_2", ignore=[404]) + client.indices.delete_alias(index="_all", name="haystack_existing_alias", ignore=[404]) + + settings = {"mappings": {"properties": {"content": {"type": "text"}}}} + + client.indices.create(index="haystack_existing_alias_1", body=settings) + client.indices.create(index="haystack_existing_alias_2", body=settings) + + client.indices.put_alias( + index="haystack_existing_alias_1,haystack_existing_alias_2", name="haystack_existing_alias" + ) + + # To be valid, all indices related to the alias must have content field of type text + ElasticsearchDocumentStore(index="haystack_existing_alias", search_fields=["content"]) + + @pytest.mark.integration + def test_existing_alias_missing_fields(self, ds): + + client = ds.client + client.indices.delete(index="haystack_existing_alias_1", ignore=[404]) + client.indices.delete(index="haystack_existing_alias_2", ignore=[404]) + client.indices.delete_alias(index="_all", name="haystack_existing_alias", ignore=[404]) + + right_settings = {"mappings": {"properties": {"content": {"type": "text"}}}} + wrong_settings = {"mappings": {"properties": {"content": 
{"type": "histogram"}}}} + + client.indices.create(index="haystack_existing_alias_1", body=right_settings) + client.indices.create(index="haystack_existing_alias_2", body=wrong_settings) + client.indices.put_alias( + index="haystack_existing_alias_1,haystack_existing_alias_2", name="haystack_existing_alias" + ) + + with pytest.raises(Exception): + # wrong field type for "content" in index "haystack_existing_alias_2" + ElasticsearchDocumentStore( + index="haystack_existing_alias", search_fields=["content"], content_field="title" + ) + + @pytest.mark.integration + def test_get_document_count_only_documents_without_embedding_arg(self, ds, documents): + ds.write_documents(documents) + + assert ds.get_document_count() == 9 + assert ds.get_document_count(only_documents_without_embedding=True) == 3 + assert ds.get_document_count(only_documents_without_embedding=True, filters={"month": ["01"]}) == 0 + assert ds.get_document_count(only_documents_without_embedding=True, filters={"month": ["03"]}) == 3 diff --git a/test/document_stores/test_opensearch.py b/test/document_stores/test_opensearch.py index 4fdf8d53ca..1da662ba4f 100644 --- a/test/document_stores/test_opensearch.py +++ b/test/document_stores/test_opensearch.py @@ -19,10 +19,11 @@ from haystack.schema import Document, Label, Answer from haystack.errors import DocumentStoreError -from .test_base import DocumentStoreTest +from .test_base import DocumentStoreBaseTestAbstract +from .test_search_engine import SearchEngineDocumentStoreTestAbstract -class TestOpenSearchDocumentStore(DocumentStoreTest): +class TestOpenSearchDocumentStore(DocumentStoreBaseTestAbstract, SearchEngineDocumentStoreTestAbstract): # Constants diff --git a/test/document_stores/test_search_engine.py b/test/document_stores/test_search_engine.py new file mode 100644 index 0000000000..0bd4736e41 --- /dev/null +++ b/test/document_stores/test_search_engine.py @@ -0,0 +1,43 @@ +import pytest +from haystack.document_stores.search_engine import 
SearchEngineDocumentStore, prepare_hosts
+
+
+@pytest.mark.unit
+def test_prepare_hosts():
+    pass
+
+
+@pytest.mark.document_store
+class SearchEngineDocumentStoreTestAbstract:
+    """
+    This is the base class for any Document Store testsuite, it doesn't have the `Test` prefix in the name
+    because we want to run its methods only in subclasses.
+    """
+
+    @pytest.mark.integration
+    def test___do_bulk(self):
+        pass
+
+    @pytest.mark.integration
+    def test___do_scan(self):
+        pass
+
+    @pytest.mark.integration
+    def test_query_by_embedding(self):
+        pass
+
+    @pytest.mark.integration
+    def test_delete_index(self, ds):
+        client = ds.client
+        # the index should exist
+        assert client.indices.exists(index=ds.index)
+        ds.delete_index(ds.index)
+        # the index was deleted and should not exist
+        assert not client.indices.exists(index=ds.index)
+
+
+@pytest.mark.document_store
+class TestSearchEngineDocumentStore:
+    @pytest.mark.integration
+    def test__split_document_list(self):
+        pass

From b1dc79968e07547839552eec22bb25a9998ebc82 Mon Sep 17 00:00:00 2001
From: Massimiliano Pippi
Date: Mon, 17 Oct 2022 19:52:49 +0200
Subject: [PATCH 04/23] refactor test workflow

---
 .github/workflows/tests.yml | 176 +++++++++------------
 test/document_stores/test_elasticsearch.py | 7 +-
 test/document_stores/test_opensearch.py | 9 +-
 3 files changed, 89 insertions(+), 103 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 70805b9d5d..2e6485761d 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -91,17 +91,22 @@ jobs:
     if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main'

   unit-tests:
    name: Unit / ${{ matrix.topic }} / ${{ matrix.os }}
    needs:
      - mypy
      - pylint
    strategy:
      fail-fast: false
      matrix:
        os:
          - ubuntu-latest
          - windows-latest
          - macos-latest
        topic:
          - document_stores
    runs-on: ${{ matrix.os }}
    steps:
      -
- uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Setup Python uses: ./.github/actions/python_cache/ @@ -110,7 +115,7 @@ jobs: run: pip install .[all] - name: Run - run: pytest -m "unit" test/ + run: pytest -m "unit" test/${{ matrix.topic }} - uses: act10ns/slack@v1 with: @@ -215,117 +220,86 @@ jobs: channel: '#haystack' if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main' - elasticsearch-tests-linux: + integration-tests-elasticsearch: + name: Integration / Elasticsearch / ${{ matrix.os }} needs: - - mypy - - pylint - runs-on: ubuntu-latest + - unit-tests + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest + - windows-latest + - macos-latest] + runs-on: ${{ matrix.os }} + services: + elasticsearch: + image: elasticsearch:7.17 + env: + discovery.type: "single-node" + ES_JAVA_OPTS: "-Xms128m -Xmx256m" + env: + ELASTICSEARCH_HOST: "elasticsearch" steps: - - uses: actions/checkout@v2 - - - name: Setup Elasticsearch - run: | - docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2 - - # TODO Let's try to remove this one from the unit tests - - name: Install pdftotext - run: wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz && tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin + - uses: actions/checkout@v3 - - name: Setup Python - uses: ./.github/actions/python_cache/ - - - name: Install Haystack - run: pip install . 
- - - name: Run tests - env: - TOKENIZERS_PARALLELISM: 'false' - run: | - pytest ${{ env.PYTEST_PARAMS }} -m "elasticsearch and not integration" test/document_stores/ --document_store_type=elasticsearch - - - name: Dump docker logs on failure - if: failure() - uses: jwalton/gh-docker-logs@v1 - - - uses: act10ns/slack@v1 - with: - status: ${{ job.status }} - channel: '#haystack' - if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main' - - elasticsearch-tests-windows: - needs: - - mypy - - pylint - runs-on: windows-latest - if: contains(github.event.pull_request.labels.*.name, 'topic:windows') || !github.event.pull_request.draft + - name: Setup Python + uses: ./.github/actions/python_cache/ - steps: - - uses: actions/checkout@v2 + - name: Install Haystack + run: pip install -U .[docstores] - - name: Install dependencies - run: | - choco install --no-progress xpdf-utils - choco install --no-progress openjdk --version=11.0.2.01 - refreshenv - choco install --no-progress elasticsearch --version=7.9.2 - refreshenv - Get-Service elasticsearch-service-x64 | Start-Service + - name: Run tests + run: | + pytest -m "document_store and integration" test/document_stores/test_elasticsearch.py - - name: Setup Python - uses: ./.github/actions/python_cache/ - with: - prefix: windows - - - name: Run tests - env: - TOKENIZERS_PARALLELISM: 'false' - run: | - pytest ${{ env.PYTEST_PARAMS }} -m "elasticsearch and not integration" test/document_stores/ ${{ env.SUITES_EXCLUDED_FROM_WINDOWS }} --document_store_type=elasticsearch + - uses: act10ns/slack@v1 + with: + status: ${{ job.status }} + channel: '#haystack' + if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main' - - uses: act10ns/slack@v1 - with: - status: ${{ job.status }} - channel: '#haystack' - if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main' - opensearch-tests-linux: + integration-tests-opensearch: + name: 
Integration / Elasticsearch / ${{ matrix.os }} needs: - - mypy - - pylint - runs-on: ubuntu-latest + - unit-tests + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest + - windows-latest + - macos-latest] + runs-on: ${{ matrix.os }} + services: + opensearch: + image: opensearchproject/opensearch:1.3.5 + env: + discovery.type: "single-node" + ES_JAVA_OPTS: "-Xms128m -Xmx256m" + ports: + - 9201:9200 + env: + OPENSEARCH_HOST: "opensearch" steps: - - uses: actions/checkout@v2 - - - name: Setup Opensearch - run: | - docker run -d -p 9201:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:1.3.5 - - # TODO Let's try to remove this one from the unit tests - - name: Install pdftotext - run: wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz && tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin + - uses: actions/checkout@v3 - - name: Setup Python - uses: ./.github/actions/python_cache/ - - - name: Install Haystack - run: pip install . 
+ - name: Setup Python + uses: ./.github/actions/python_cache/ - - name: Run tests - env: - TOKENIZERS_PARALLELISM: 'false' - run: | - pytest ${{ env.PYTEST_PARAMS }} -m "opensearch and not integration" test/document_stores/test_document_store.py --document_store_type=opensearch + - name: Install Haystack + run: pip install -U .[docstores] - - name: Dump docker logs on failure - if: failure() - uses: jwalton/gh-docker-logs@v1 + - name: Run tests + run: | + pytest -m "document_store and integration" test/document_stores/test_opensearch.py - - uses: act10ns/slack@v1 - with: - status: ${{ job.status }} - channel: '#haystack' - if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main' + - uses: act10ns/slack@v1 + with: + status: ${{ job.status }} + channel: '#haystack' + if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main' faiss-tests-linux: needs: diff --git a/test/document_stores/test_elasticsearch.py b/test/document_stores/test_elasticsearch.py index d384754cef..9f7b8040cf 100644 --- a/test/document_stores/test_elasticsearch.py +++ b/test/document_stores/test_elasticsearch.py @@ -20,7 +20,12 @@ def ds(self): This fixture provides a working document store and takes care of removing the indices when done """ labels_index_name = f"{self.index_name}_labels" - ds = ElasticsearchDocumentStore(index=self.index_name, label_index=labels_index_name, create_index=True) + ds = ElasticsearchDocumentStore( + index=self.index_name, + label_index=labels_index_name, + host=os.environ.get("ELASTICSEARCH_HOST", "localhost"), + create_index=True, + ) yield ds ds.delete_index(self.index_name) ds.delete_index(labels_index_name) diff --git a/test/document_stores/test_opensearch.py b/test/document_stores/test_opensearch.py index 1da662ba4f..793d508ea0 100644 --- a/test/document_stores/test_opensearch.py +++ b/test/document_stores/test_opensearch.py @@ -1,3 +1,4 @@ +import os import logging from unittest.mock import 
MagicMock, patch @@ -38,7 +39,13 @@ def ds(self): This fixture provides a working document store and takes care of removing the indices when done """ labels_index_name = f"{self.index_name}_labels" - ds = OpenSearchDocumentStore(index=self.index_name, label_index=labels_index_name, port=9201, create_index=True) + ds = OpenSearchDocumentStore( + index=self.index_name, + label_index=labels_index_name, + port=9201, + host=os.environ.get("OPENSEARCH_HOST", "localhost"), + create_index=True, + ) yield ds ds.delete_index(self.index_name) ds.delete_index(labels_index_name) From a86fca8f7524cd85697b60f444603d10ea69da3b Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Wed, 19 Oct 2022 10:23:30 +0200 Subject: [PATCH 05/23] job steps --- .github/workflows/tests.yml | 164 +++++++++++---------- test/document_stores/test_elasticsearch.py | 1 + 2 files changed, 85 insertions(+), 80 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 2e6485761d..3e20ead7f2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -123,6 +123,90 @@ jobs: channel: '#haystack' if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main' + integration-tests-elasticsearch: + name: Integration / Elasticsearch / ${{ matrix.os }} + needs: + - unit-tests + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest + - windows-latest + - macos-latest] + runs-on: ${{ matrix.os }} + services: + elasticsearch: + image: elasticsearch:7.17 + env: + discovery.type: "single-node" + ES_JAVA_OPTS: "-Xms128m -Xmx256m" + env: + ELASTICSEARCH_HOST: "elasticsearch" + steps: + - uses: actions/checkout@v3 + + - name: Setup Python + uses: ./.github/actions/python_cache/ + + - name: Install Haystack + run: pip install -U .[docstores] + + - name: Run tests + run: | + pytest -m "document_store and integration" test/document_stores/test_elasticsearch.py + + - uses: act10ns/slack@v1 + with: + status: ${{ job.status }} + channel: 
'#haystack' + if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main' + + integration-tests-opensearch: + name: Integration / Elasticsearch / ${{ matrix.os }} + needs: + - unit-tests + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest + - windows-latest + - macos-latest] + runs-on: ${{ matrix.os }} + services: + opensearch: + image: opensearchproject/opensearch:1.3.5 + env: + discovery.type: "single-node" + ES_JAVA_OPTS: "-Xms128m -Xmx256m" + ports: + - 9201:9200 + env: + OPENSEARCH_HOST: "opensearch" + steps: + - uses: actions/checkout@v3 + + - name: Setup Python + uses: ./.github/actions/python_cache/ + + - name: Install Haystack + run: pip install -U .[docstores] + + - name: Run tests + run: | + pytest -m "document_store and integration" test/document_stores/test_opensearch.py + + - uses: act10ns/slack@v1 + with: + status: ${{ job.status }} + channel: '#haystack' + if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main' + +# +# TODO: the following steps need to be revisited +# + unit-tests-linux: needs: - mypy @@ -220,86 +304,6 @@ jobs: channel: '#haystack' if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main' - integration-tests-elasticsearch: - name: Integration / Elasticsearch / ${{ matrix.os }} - needs: - - unit-tests - strategy: - fail-fast: false - matrix: - os: - - ubuntu-latest - - windows-latest - - macos-latest] - runs-on: ${{ matrix.os }} - services: - elasticsearch: - image: elasticsearch:7.17 - env: - discovery.type: "single-node" - ES_JAVA_OPTS: "-Xms128m -Xmx256m" - env: - ELASTICSEARCH_HOST: "elasticsearch" - steps: - - uses: actions/checkout@v3 - - - name: Setup Python - uses: ./.github/actions/python_cache/ - - - name: Install Haystack - run: pip install -U .[docstores] - - - name: Run tests - run: | - pytest -m "document_store and integration" test/document_stores/test_elasticsearch.py - - - uses: act10ns/slack@v1 - with: 
- status: ${{ job.status }} - channel: '#haystack' - if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main' - - - integration-tests-opensearch: - name: Integration / Elasticsearch / ${{ matrix.os }} - needs: - - unit-tests - strategy: - fail-fast: false - matrix: - os: - - ubuntu-latest - - windows-latest - - macos-latest] - runs-on: ${{ matrix.os }} - services: - opensearch: - image: opensearchproject/opensearch:1.3.5 - env: - discovery.type: "single-node" - ES_JAVA_OPTS: "-Xms128m -Xmx256m" - ports: - - 9201:9200 - env: - OPENSEARCH_HOST: "opensearch" - steps: - - uses: actions/checkout@v3 - - - name: Setup Python - uses: ./.github/actions/python_cache/ - - - name: Install Haystack - run: pip install -U .[docstores] - - - name: Run tests - run: | - pytest -m "document_store and integration" test/document_stores/test_opensearch.py - - - uses: act10ns/slack@v1 - with: - status: ${{ job.status }} - channel: '#haystack' - if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main' faiss-tests-linux: needs: diff --git a/test/document_stores/test_elasticsearch.py b/test/document_stores/test_elasticsearch.py index 9f7b8040cf..3fa7d29b07 100644 --- a/test/document_stores/test_elasticsearch.py +++ b/test/document_stores/test_elasticsearch.py @@ -1,3 +1,4 @@ +import os import pytest import numpy as np From 04b8a7c97e5db5a1797f2c0a49fa7b16572ca4f5 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Wed, 19 Oct 2022 16:04:27 +0200 Subject: [PATCH 06/23] add more tests --- test/document_stores/test_base.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/test/document_stores/test_base.py b/test/document_stores/test_base.py index 98a89668a5..f571130819 100644 --- a/test/document_stores/test_base.py +++ b/test/document_stores/test_base.py @@ -72,6 +72,23 @@ def test_write_labels(self, ds, labels): ds.write_labels(labels) assert ds.get_all_labels() == labels + @pytest.mark.integration + 
def test_write_with_duplicate_doc_ids(self, ds): + duplicate_documents = [ + Document(content="Doc1", id_hash_keys=["content"]), + Document(content="Doc1", id_hash_keys=["content"]), + ] + ds.write_documents(duplicate_documents, duplicate_documents="skip") + assert len(ds.get_all_documents()) == 1 + with pytest.raises(Exception): + ds.write_documents(duplicate_documents, duplicate_documents="fail") + + @pytest.mark.integration + def test_get_all_documents_without_filters(self, ds, documents): + ds.write_documents(documents) + out = ds.get_all_documents() + assert out == documents + # get_all_documents_generator # get_all_labels # get_document_by_id From 0275f29d187a6f46bbe66dbd11f06e46c812552f Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Wed, 19 Oct 2022 18:28:00 +0200 Subject: [PATCH 07/23] move more tests --- test/document_stores/test_base.py | 158 +++++++++++++++++++++++++++++- 1 file changed, 157 insertions(+), 1 deletion(-) diff --git a/test/document_stores/test_base.py b/test/document_stores/test_base.py index f571130819..54bf16cde8 100644 --- a/test/document_stores/test_base.py +++ b/test/document_stores/test_base.py @@ -27,7 +27,7 @@ def documents(self): documents.append( Document( content=f"A Bar Document {i}", - meta={"name": f"name_{i}", "year": "2021", "month": "02"}, + meta={"name": f"name_{i}", "year": "2021", "month": "02", "numbers": [-2, -4]}, embedding=np.random.rand(768).astype(np.float32), ) ) @@ -83,12 +83,168 @@ def test_write_with_duplicate_doc_ids(self, ds): with pytest.raises(Exception): ds.write_documents(duplicate_documents, duplicate_documents="fail") + @pytest.mark.skip @pytest.mark.integration def test_get_all_documents_without_filters(self, ds, documents): ds.write_documents(documents) out = ds.get_all_documents() assert out == documents + @pytest.mark.integration + def test_get_all_document_filter_duplicate_text_value(self, ds): + documents = [ + Document(content="duplicated", meta={"meta_field": "0"}, 
id_hash_keys=["meta"]), + Document(content="duplicated", meta={"meta_field": "1", "name": "file.txt"}, id_hash_keys=["meta"]), + Document(content="Doc2", meta={"name": "file_2.txt"}, id_hash_keys=["meta"]), + ] + ds.write_documents(documents) + documents = ds.get_all_documents(filters={"meta_field": ["1"]}) + assert len(documents) == 1 + assert documents[0].content == "duplicated" + assert documents[0].meta["name"] == "file.txt" + + documents = ds.get_all_documents(filters={"meta_field": ["0"]}) + assert len(documents) == 1 + assert documents[0].content == "duplicated" + assert documents[0].meta.get("name") is None + + documents = ds.get_all_documents(filters={"name": ["file_2.txt"]}) + assert len(documents) == 1 + assert documents[0].content == "Doc2" + assert documents[0].meta.get("meta_field") is None + + @pytest.mark.integration + def test_get_all_documents_with_correct_filters(self, ds, documents): + ds.write_documents(documents) + result = ds.get_all_documents(filters={"year": ["2020"]}) + assert len(result) == 3 + + documents = ds.get_all_documents(filters={"year": ["2020", "2021"]}) + assert len(documents) == 6 + + @pytest.mark.integration + def test_get_all_documents_with_incorrect_filter_name(self, ds, documents): + ds.write_documents(documents) + result = ds.get_all_documents(filters={"non_existing_meta_field": ["whatever"]}) + assert len(result) == 0 + + @pytest.mark.integration + def test_get_all_documents_with_incorrect_filter_value(self, ds, documents): + ds.write_documents(documents) + result = ds.get_all_documents(filters={"year": ["nope"]}) + assert len(result) == 0 + + @pytest.mark.integration + def test_extended_filter(self, ds, documents): + ds.write_documents(documents) + + # Test comparison operators individually + + result = ds.get_all_documents(filters={"year": {"$eq": "2020"}}) + assert len(result) == 3 + result = ds.get_all_documents(filters={"year": "2020"}) + assert len(result) == 3 + + result = ds.get_all_documents(filters={"year": 
{"$in": ["2020", "2021", "n.a."]}}) + assert len(result) == 6 + result = ds.get_all_documents(filters={"year": ["2020", "2021", "n.a."]}) + assert len(result) == 6 + + result = ds.get_all_documents(filters={"year": {"$ne": "2020"}}) + assert len(result) == 6 + + result = ds.get_all_documents(filters={"year": {"$nin": ["2020", "2021", "n.a."]}}) + assert len(result) == 3 + + result = ds.get_all_documents(filters={"numbers": {"$gt": 0}}) + assert len(result) == 3 + + result = ds.get_all_documents(filters={"numbers": {"$gte": -2}}) + assert len(result) == 6 + + result = ds.get_all_documents(filters={"numbers": {"$lt": 0}}) + assert len(result) == 3 + + result = ds.get_all_documents(filters={"numbers": {"$lte": 2.0}}) + assert len(result) == 6 + + # Test compound filters + + result = ds.get_all_documents(filters={"year": {"$lte": "2021", "$gte": "2020"}}) + assert len(result) == 6 + + filters = {"$and": {"year": {"$lte": "2021", "$gte": "2020"}, "name": {"$in": ["name_0", "name_1"]}}} + result = ds.get_all_documents(filters=filters) + assert len(result) == 4 + + filters_simplified = {"year": {"$lte": "2021", "$gte": "2020"}, "name": ["name_0", "name_1"]} + result = ds.get_all_documents(filters=filters_simplified) + assert len(result) == 4 + + filters = { + "$and": { + "year": {"$lte": "2021", "$gte": "2020"}, + "$or": {"name": {"$in": ["name_0", "name_1"]}, "numbers": {"$lt": 5.0}}, + } + } + result = ds.get_all_documents(filters=filters) + assert len(result) == 6 + + filters_simplified = { + "year": {"$lte": "2021", "$gte": "2020"}, + "$or": {"name": {"$in": ["name_0", "name_2"]}, "numbers": {"$lt": 5.0}}, + } + result = ds.get_all_documents(filters=filters_simplified) + assert len(result) == 6 + + filters = { + "$and": { + "year": {"$lte": "2021", "$gte": "2020"}, + "$or": { + "name": {"$in": ["name_0", "name_1"]}, + "$and": {"numbers": {"$lt": 5.0}, "$not": {"month": {"$eq": "01"}}}, + }, + } + } + result = ds.get_all_documents(filters=filters) + assert len(result) 
== 5 + + filters_simplified = { + "year": {"$lte": "2021", "$gte": "2020"}, + "$or": {"name": ["name_0", "name_1"], "$and": {"numbers": {"$lt": 5.0}, "$not": {"month": {"$eq": "01"}}}}, + } + result = ds.get_all_documents(filters=filters_simplified) + assert len(result) == 5 + + # Test nested logical operations within "$not", important as we apply De Morgan's laws in WeaviateDocumentstore + + filters = { + "$not": { + "$or": { + "$and": {"numbers": {"$lt": 5.0}, "month": {"$ne": "01"}}, + "$not": {"year": {"$lte": "2021", "$gte": "2020"}}, + } + } + } + result = ds.get_all_documents(filters=filters) + docs_meta = result[0].meta["numbers"] + assert len(result) == 3 + assert [2, 4] == docs_meta + + # Test same logical operator twice on same level + + filters = { + "$or": [ + {"$and": {"name": {"$in": ["name_0", "name_1"]}, "year": {"$gte": "2020"}}}, + {"$and": {"name": {"$in": ["name_0", "name_1"]}, "year": {"$lt": "2021"}}}, + ] + } + result = ds.get_all_documents(filters=filters) + docs_meta = [doc.meta["name"] for doc in result] + assert len(result) == 4 + assert "name_0" in docs_meta + assert "name_2" not in docs_meta + # get_all_documents_generator # get_all_labels # get_document_by_id From 6628ca493c35337bfd328a38ddeb271b9f49129b Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Fri, 21 Oct 2022 11:57:15 +0200 Subject: [PATCH 08/23] more tests --- test/document_stores/test_base.py | 107 ++++++++++++++++++++- test/document_stores/test_search_engine.py | 6 +- 2 files changed, 108 insertions(+), 5 deletions(-) diff --git a/test/document_stores/test_base.py b/test/document_stores/test_base.py index 54bf16cde8..eb4ac5b895 100644 --- a/test/document_stores/test_base.py +++ b/test/document_stores/test_base.py @@ -2,6 +2,7 @@ import numpy as np from haystack.schema import Document, Label, Answer +from haystack.errors import DuplicateDocumentError @pytest.mark.document_store @@ -245,10 +246,109 @@ def test_extended_filter(self, ds, documents): assert "name_0" in 
docs_meta assert "name_2" not in docs_meta - # get_all_documents_generator + @pytest.mark.integration + def test_get_document_by_id(self, ds, documents): + ds.write_documents(documents) + doc = ds.get_document_by_id(documents[0].id) + assert doc.id == documents[0].id + assert doc.content == documents[0].content + + @pytest.mark.integration + def test_get_documents_by_id(self, ds, documents): + ds.write_documents(documents) + ids = [doc.id for doc in documents] + result = {doc.id for doc in ds.get_documents_by_id(ids, batch_size=2)} + assert set(ids) == result + + @pytest.mark.integration + def test_get_document_count(self, ds, documents): + ds.write_documents(documents) + assert ds.get_document_count() == 9 + assert ds.get_document_count(filters={"year": ["2020"]}) == 3 + assert ds.get_document_count(filters={"month": ["02"]}) == 3 + + @pytest.mark.integration + def test_get_all_documents_generator(self, ds, documents): + ds.write_documents(documents) + assert len(list(ds.get_all_documents_generator(batch_size=2))) == 9 + + @pytest.mark.integration + def test_duplicate_documents_skip(self, ds, documents): + ds.write_documents(documents) + + updated_docs = [] + for d in documents: + updated_d = Document.from_dict(d.to_dict()) + updated_d.meta["name"] = "Updated" + updated_docs.append(updated_d) + + ds.write_documents(updated_docs, duplicate_documents="skip") + result = ds.get_all_documents() + assert result[0].meta["name"] == "name_0" + + @pytest.mark.integration + def test_duplicate_documents_overwrite(self, ds, documents): + ds.write_documents(documents) + + updated_docs = [] + for d in documents: + updated_d = Document.from_dict(d.to_dict()) + updated_d.meta["name"] = "Updated" + updated_docs.append(updated_d) + + ds.write_documents(updated_docs, duplicate_documents="overwrite") + for doc in ds.get_all_documents(): + assert doc.meta["name"] == "Updated" + + @pytest.mark.integration + def test_duplicate_documents_fail(self, ds, documents): + 
ds.write_documents(documents) + + updated_docs = [] + for d in documents: + updated_d = Document.from_dict(d.to_dict()) + updated_d.meta["name"] = "Updated" + updated_docs.append(updated_d) + + with pytest.raises(DuplicateDocumentError): + ds.write_documents(updated_docs, duplicate_documents="fail") + + @pytest.mark.integration + def test_write_document_meta(self, ds): + ds.write_documents( + [ + {"content": "dict_without_meta", "id": "1"}, + {"content": "dict_with_meta", "meta_field": "test2", "id": "2"}, + Document(content="document_object_without_meta", id="3"), + Document(content="document_object_with_meta", meta={"meta_field": "test4"}, id="4"), + ] + ) + assert not ds.get_document_by_id("1").meta + assert ds.get_document_by_id("2").meta["meta_field"] == "test2" + assert not ds.get_document_by_id("3").meta + assert ds.get_document_by_id("4").meta["meta_field"] == "test4" + + @pytest.mark.integration + def test_delete_documents(self, ds, documents): + ds.write_documents(documents) + ds.delete_documents() + assert ds.get_document_count() == 0 + + @pytest.mark.integration + def test_delete_documents_with_filters(self, ds, documents): + ds.write_documents(documents) + ds.delete_documents(filters={"year": ["2020", "2021"]}) + documents = ds.get_all_documents() + assert ds.get_document_count() == 3 + + @pytest.mark.integration + def test_delete_documents_by_id(self, ds, documents): + ds.write_documents(documents) + docs_to_delete = ds.get_all_documents(filters={"year": ["2020"]}) + ds.delete_documents(ids=[doc.id for doc in docs_to_delete]) + assert ds.get_document_count() == 6 + # get_all_labels - # get_document_by_id - # get_document_count # query_by_embedding # get_label_count # write_labels @@ -256,5 +356,4 @@ def test_extended_filter(self, ds, documents): # delete_labels # delete_index # _create_document_field_map - # get_documents_by_id # update_document_meta diff --git a/test/document_stores/test_search_engine.py b/test/document_stores/test_search_engine.py 
index 0bd4736e41..6230d0cf61 100644 --- a/test/document_stores/test_search_engine.py +++ b/test/document_stores/test_search_engine.py @@ -10,7 +10,7 @@ def test_prepare_hosts(self): @pytest.mark.document_store class SearchEngineDocumentStoreTestAbstract: """ - This is the base class for any Document Store testsuite, it doesn't have the `Test` prefix in the name + This is the base class for any Searchengine Document Store testsuite, it doesn't have the `Test` prefix in the name because we want to run its methods only in subclasses. """ @@ -38,6 +38,10 @@ def test_delete_index(self, ds): @pytest.mark.document_store class TestSearchEngineDocumentStore: + """ + This class tests the concrete methods in SearchEngineDocumentStore + """ + @pytest.mark.integration def test__split_document_list(self): pass From f39d76eca000a5187963ea7e19ada2df9f845a4f Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Fri, 21 Oct 2022 13:11:23 +0200 Subject: [PATCH 09/23] test labels --- test/document_stores/test_base.py | 67 ++++++++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 6 deletions(-) diff --git a/test/document_stores/test_base.py b/test/document_stores/test_base.py index eb4ac5b895..5962020b7c 100644 --- a/test/document_stores/test_base.py +++ b/test/document_stores/test_base.py @@ -48,13 +48,14 @@ def labels(self, documents): for i, d in enumerate(documents): labels.append( Label( - query="query", + query=f"query_{i}", document=d, is_correct_document=True, is_correct_answer=False, # create a mix set of labels origin="user-feedback" if i % 2 else "gold-label", answer=None if not i else Answer(f"the answer is {i}"), + meta={"name": f"label_{i}", "year": f"{2020 + i}"}, ) ) return labels @@ -348,12 +349,66 @@ def test_delete_documents_by_id(self, ds, documents): ds.delete_documents(ids=[doc.id for doc in docs_to_delete]) assert ds.get_document_count() == 6 - # get_all_labels + @pytest.mark.integration + def test_write_get_all_labels(self, ds, labels): + 
ds.write_labels(labels) + ds.write_labels(labels[:3], index="custom_index") + assert len(ds.get_all_labels()) == 9 + assert len(ds.get_all_labels(index="custom_index")) == 3 + # remove the index we created in this test + ds.delete_index("custom_index") + + @pytest.mark.integration + def test_delete_labels(self, ds, labels): + ds.write_labels(labels) + ds.write_labels(labels[:3], index="custom_index") + ds.delete_labels() + ds.delete_labels(index="custom_index") + assert len(ds.get_all_labels()) == 0 + assert len(ds.get_all_labels(index="custom_index")) == 0 + # remove the index we created in this test + ds.delete_index("custom_index") + + @pytest.mark.integration + def test_write_labels_duplicate(self, ds, labels): + # create a duplicate + dupe = Label.from_dict(labels[0].to_dict()) + + ds.write_labels(labels + [dupe]) + + # ensure the duplicate was discarded + assert len(ds.get_all_labels()) == len(labels) + + @pytest.mark.integration + def test_delete_labels_by_id(self, ds, labels): + ds.write_labels(labels) + ds.delete_labels(ids=[labels[0].id]) + assert len(ds.get_all_labels()) == len(labels) - 1 + + @pytest.mark.integration + def test_delete_labels_by_filter(self, ds, labels): + ds.write_labels(labels) + ds.delete_labels(filters={"query": "query_1"}) + assert len(ds.get_all_labels()) == len(labels) - 1 + + @pytest.mark.integration + def test_delete_labels_by_filter_id(self, ds, labels): + ds.write_labels(labels) + + # ids and filters are ANDed, the following should have no effect + ds.delete_labels(ids=[labels[0].id], filters={"query": "query_9"}) + assert len(ds.get_all_labels()) == len(labels) + + # + ds.delete_labels(ids=[labels[0].id], filters={"query": "query_0"}) + assert len(ds.get_all_labels()) == len(labels) - 1 + + @pytest.mark.integration + def test_get_label_count(self, ds, labels): + ds.write_labels(labels) + assert ds.get_label_count() == len(labels) + # query_by_embedding - # get_label_count - # write_labels - # delete_documents - # 
delete_labels # delete_index # _create_document_field_map # update_document_meta From d7946ace1fdd1d004c7526c3d579d9a13cd9f4fd Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Fri, 21 Oct 2022 15:50:59 +0200 Subject: [PATCH 10/23] add more tests --- test/document_stores/test_base.py | 39 +++++++++++++++++++--- test/document_stores/test_search_engine.py | 25 ++++++++++---- 2 files changed, 53 insertions(+), 11 deletions(-) diff --git a/test/document_stores/test_base.py b/test/document_stores/test_base.py index 5962020b7c..e227353c91 100644 --- a/test/document_stores/test_base.py +++ b/test/document_stores/test_base.py @@ -3,6 +3,7 @@ from haystack.schema import Document, Label, Answer from haystack.errors import DuplicateDocumentError +from haystack.document_stores import BaseDocumentStore @pytest.mark.document_store @@ -60,6 +61,10 @@ def labels(self, documents): ) return labels + # + # Integration tests + # + @pytest.mark.integration def test_write_documents(self, ds, documents): ds.write_documents(documents) @@ -408,7 +413,33 @@ def test_get_label_count(self, ds, labels): ds.write_labels(labels) assert ds.get_label_count() == len(labels) - # query_by_embedding - # delete_index - # _create_document_field_map - # update_document_meta + @pytest.mark.integration + def test_delete_index(self, ds, documents): + ds.write_documents(documents, index="custom_index") + assert ds.get_document_count(index="custom_index") == len(documents) + ds.delete_index(index="custom_index") + with pytest.raises(Exception): + ds.get_document_count(index="custom_index") + + @pytest.mark.integration + def test_update_meta(self, ds, documents): + ds.write_documents(documents) + doc = documents[0] + ds.update_document_meta(doc.id, meta={"year": "2099", "month": "12"}) + doc = ds.get_document_by_id(doc.id) + assert doc.meta["year"] == "2099" + assert doc.meta["month"] == "12" + + # + # Unit tests + # + + @pytest.mark.unit + def test_normalize_embeddings_diff_shapes(self): + VEC_1 = 
np.array([0.1, 0.2, 0.3], dtype="float32") + BaseDocumentStore.normalize_embedding(VEC_1) + assert np.linalg.norm(VEC_1) - 1 < 0.01 + + VEC_1 = np.array([0.1, 0.2, 0.3], dtype="float32").reshape(1, -1) + BaseDocumentStore.normalize_embedding(VEC_1) + assert np.linalg.norm(VEC_1) - 1 < 0.01 diff --git a/test/document_stores/test_search_engine.py b/test/document_stores/test_search_engine.py index 6230d0cf61..691b6ada99 100644 --- a/test/document_stores/test_search_engine.py +++ b/test/document_stores/test_search_engine.py @@ -27,13 +27,24 @@ def test_query_by_embedding(self): pass @pytest.mark.integration - def test_delete_index(self, ds): - client = ds.client - # the index should exist - assert client.indices.exists(index=ds.index) - ds.delete_index(ds.index) - # the index was deleted and should not exist - assert not client.indices.exists(index=ds.index) + def test_get_meta_values_by_key(self, ds, documents): + ds.write_documents(documents) + + # test without filters or query + result = ds.get_metadata_values_by_key(key="name") + assert result == [ + {"count": 3, "value": "name_0"}, + {"count": 3, "value": "name_1"}, + {"count": 3, "value": "name_2"}, + ] + + # test with filters but no query + result = ds.get_metadata_values_by_key(key="year", filters={"month": ["01"]}) + assert result == [{"count": 3, "value": "2020"}] + + # test with filters & query + result = ds.get_metadata_values_by_key(key="year", query="Bar") + assert result == [{"count": 3, "value": "2021"}] @pytest.mark.document_store From c2e5c6e84badaf0f634a28da2d53008640fc1ae6 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Fri, 21 Oct 2022 15:55:14 +0200 Subject: [PATCH 11/23] Update tests.yml --- .github/workflows/tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3e20ead7f2..68f5bdd77a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -163,7 +163,7 @@ jobs: if: failure() && 
github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main' integration-tests-opensearch: - name: Integration / Elasticsearch / ${{ matrix.os }} + name: Integration / Opensearch / ${{ matrix.os }} needs: - unit-tests strategy: @@ -776,4 +776,4 @@ jobs: with: status: ${{ job.status }} channel: '#haystack' - if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main' \ No newline at end of file + if: failure() && github.repository_owner == 'deepset-ai' && github.ref == 'refs/heads/main' From e042a8862335b1fe516901667853757371f3795d Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Fri, 21 Oct 2022 15:57:33 +0200 Subject: [PATCH 12/23] Update tests.yml --- .github/workflows/tests.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 68f5bdd77a..d51e566e6e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -632,7 +632,6 @@ jobs: integration-tests-linux: needs: - unit-tests-linux - - elasticsearch-tests-linux timeout-minutes: 60 strategy: @@ -730,7 +729,6 @@ jobs: integration-tests-windows: needs: - unit-tests-windows - - elasticsearch-tests-windows runs-on: windows-latest if: contains(github.event.pull_request.labels.*.name, 'topic:windows') || !github.event.pull_request.draft From 3b35aaa541c4313d91e9e0f9cf490436dd914b31 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Fri, 21 Oct 2022 19:14:36 +0200 Subject: [PATCH 13/23] fix --- test/document_stores/test_search_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/document_stores/test_search_engine.py b/test/document_stores/test_search_engine.py index 691b6ada99..e8a89e684a 100644 --- a/test/document_stores/test_search_engine.py +++ b/test/document_stores/test_search_engine.py @@ -3,7 +3,7 @@ @pytest.mark.unit -def test_prepare_hosts(self): +def test_prepare_hosts(): pass From f6ed55c63f09516f3747a0801bdd2a5a1330a78a Mon Sep 17 00:00:00 
2001 From: Massimiliano Pippi Date: Mon, 24 Oct 2022 09:21:10 +0200 Subject: [PATCH 14/23] typo --- .github/workflows/tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d51e566e6e..457d6556fb 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -101,7 +101,7 @@ jobs: os: - ubuntu-latest - windows-latest - - macos-latest] + - macos-latest topic: - document_stores runs-on: ${{ matrix.os }} @@ -133,7 +133,7 @@ jobs: os: - ubuntu-latest - windows-latest - - macos-latest] + - macos-latest runs-on: ${{ matrix.os }} services: elasticsearch: @@ -172,7 +172,7 @@ jobs: os: - ubuntu-latest - windows-latest - - macos-latest] + - macos-latest runs-on: ${{ matrix.os }} services: opensearch: From 5efd316aa260c8327a4f170ea8c81998ef67f62d Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Mon, 24 Oct 2022 09:53:00 +0200 Subject: [PATCH 15/23] fix es image tag --- .github/workflows/tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 457d6556fb..01d037ce22 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -137,7 +137,7 @@ jobs: runs-on: ${{ matrix.os }} services: elasticsearch: - image: elasticsearch:7.17 + image: elasticsearch:7.17.6 env: discovery.type: "single-node" ES_JAVA_OPTS: "-Xms128m -Xmx256m" @@ -154,7 +154,7 @@ jobs: - name: Run tests run: | - pytest -m "document_store and integration" test/document_stores/test_elasticsearch.py + pytest -x -m "document_store and integration" test/document_stores/test_elasticsearch.py - uses: act10ns/slack@v1 with: @@ -195,7 +195,7 @@ jobs: - name: Run tests run: | - pytest -m "document_store and integration" test/document_stores/test_opensearch.py + pytest -x -m "document_store and integration" test/document_stores/test_opensearch.py - uses: act10ns/slack@v1 with: From 
3875abbec78267354555942bda73c0b4f7ed31f0 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Mon, 24 Oct 2022 10:42:58 +0200 Subject: [PATCH 16/23] map es ports --- .github/workflows/tests.yml | 4 +++- test/document_stores/test_opensearch.py | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 01d037ce22..8a45213dc0 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -141,6 +141,8 @@ jobs: env: discovery.type: "single-node" ES_JAVA_OPTS: "-Xms128m -Xmx256m" + ports: + - 9200:9200 env: ELASTICSEARCH_HOST: "elasticsearch" steps: @@ -181,7 +183,7 @@ jobs: discovery.type: "single-node" ES_JAVA_OPTS: "-Xms128m -Xmx256m" ports: - - 9201:9200 + - 9200:9200 env: OPENSEARCH_HOST: "opensearch" steps: diff --git a/test/document_stores/test_opensearch.py b/test/document_stores/test_opensearch.py index 793d508ea0..a94d3e2f16 100644 --- a/test/document_stores/test_opensearch.py +++ b/test/document_stores/test_opensearch.py @@ -42,7 +42,6 @@ def ds(self): ds = OpenSearchDocumentStore( index=self.index_name, label_index=labels_index_name, - port=9201, host=os.environ.get("OPENSEARCH_HOST", "localhost"), create_index=True, ) From 092ae114f29f62f7de8d7fcd7e391cc18cc54153 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Mon, 24 Oct 2022 11:16:09 +0200 Subject: [PATCH 17/23] try --- .github/workflows/tests.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 8a45213dc0..e899b47323 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -143,8 +143,8 @@ jobs: ES_JAVA_OPTS: "-Xms128m -Xmx256m" ports: - 9200:9200 - env: - ELASTICSEARCH_HOST: "elasticsearch" + # env: + # ELASTICSEARCH_HOST: "elasticsearch" steps: - uses: actions/checkout@v3 @@ -184,8 +184,8 @@ jobs: ES_JAVA_OPTS: "-Xms128m -Xmx256m" ports: - 9200:9200 - env: - OPENSEARCH_HOST: "opensearch" + # env: 
+ # OPENSEARCH_HOST: "opensearch" steps: - uses: actions/checkout@v3 From f012fc4216c5a0ee5d4486c90e8769c1d9968a0c Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Mon, 24 Oct 2022 12:13:04 +0200 Subject: [PATCH 18/23] fix --- .github/workflows/tests.yml | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index e899b47323..2174c2c980 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -130,10 +130,7 @@ jobs: strategy: fail-fast: false matrix: - os: - - ubuntu-latest - - windows-latest - - macos-latest + os: [ubuntu-latest] runs-on: ${{ matrix.os }} services: elasticsearch: @@ -171,10 +168,7 @@ jobs: strategy: fail-fast: false matrix: - os: - - ubuntu-latest - - windows-latest - - macos-latest + os: [ubuntu-latest] runs-on: ${{ matrix.os }} services: opensearch: From 3655c8d9f79267773ef38a921c98b4e32c87d1bc Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Mon, 24 Oct 2022 12:17:12 +0200 Subject: [PATCH 19/23] default port --- test/document_stores/test_opensearch.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/document_stores/test_opensearch.py b/test/document_stores/test_opensearch.py index a94d3e2f16..bdf225ae9e 100644 --- a/test/document_stores/test_opensearch.py +++ b/test/document_stores/test_opensearch.py @@ -124,11 +124,11 @@ def index(self): @pytest.mark.integration def test___init__(self): - OpenSearchDocumentStore(index="default_index", port=9201, create_index=True) + OpenSearchDocumentStore(index="default_index", create_index=True) @pytest.mark.integration def test___init___faiss(self): - OpenSearchDocumentStore(index="faiss_index", port=9201, create_index=True, knn_engine="faiss") + OpenSearchDocumentStore(index="faiss_index", create_index=True, knn_engine="faiss") @pytest.mark.integration def test_recreate_index(self, ds, documents, labels): @@ -136,7 +136,7 @@ def test_recreate_index(self, ds, 
documents, labels): ds.write_labels(labels) # Create another document store on top of the previous one - ds = OpenSearchDocumentStore(index=ds.index, label_index=ds.label_index, recreate_index=True, port=9201) + ds = OpenSearchDocumentStore(index=ds.index, label_index=ds.label_index, recreate_index=True) assert len(ds.get_all_documents(index=ds.index)) == 0 assert len(ds.get_all_labels(index=ds.label_index)) == 0 @@ -159,7 +159,7 @@ def test_change_knn_engine(self, ds, caplog): assert ds.embeddings_field_supports_similarity == True index_name = ds.index with caplog.at_level(logging.WARNING): - ds = OpenSearchDocumentStore(port=9201, knn_engine="faiss", index=index_name) + ds = OpenSearchDocumentStore(knn_engine="faiss", index=index_name) warning = ( "Embedding field 'embedding' was initially created with knn_engine 'nmslib', but knn_engine was " "set to 'faiss' when initializing OpenSearchDocumentStore. Falling back to slow exact vector " From 71374be847107166b6051e8924d26be703fe8799 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Mon, 24 Oct 2022 15:43:15 +0200 Subject: [PATCH 20/23] remove opensearch from the markers sorcery --- .github/workflows/tests.yml | 5 ----- conftest.py | 2 +- test/conftest.py | 25 +------------------------ 3 files changed, 2 insertions(+), 30 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 2174c2c980..b4a901dbe2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -662,15 +662,10 @@ jobs: run: | python -c "from transformers import AutoModel;[AutoModel.from_pretrained(model_name) for model_name in ['vblagoje/bart_lfqa','yjernite/bart_eli5', 'vblagoje/dpr-ctx_encoder-single-lfqa-wiki', 'vblagoje/dpr-question_encoder-single-lfqa-wiki', 'facebook/dpr-question_encoder-single-nq-base', 'facebook/dpr-ctx_encoder-single-nq-base', 'elastic/distilbert-base-cased-finetuned-conll03-english']]" - - name: Run Elasticsearch run: | docker run -d -p 9200:9200 -e 
"discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2 - - name: Run Opensearch - run: | - docker run -d -p 9201:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:1.3.5 - - name: Run Milvus run: | cd ../../ # Avoid causing permission issues on hashFiles later by creating unreadable folders like "volumes" diff --git a/conftest.py b/conftest.py index b0ea11b4d1..a381d802f8 100644 --- a/conftest.py +++ b/conftest.py @@ -2,7 +2,7 @@ def pytest_addoption(parser): parser.addoption( "--document_store_type", action="store", - default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate, pinecone, opensearch", + default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate, pinecone", ) diff --git a/test/conftest.py b/test/conftest.py index 29d89324f5..dfce287b7f 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -152,7 +152,6 @@ def pytest_collection_modifyitems(config, items): "pinecone": [pytest.mark.pinecone], # FIXME GraphDB can't be treated as a regular docstore, it fails most of their tests "graphdb": [pytest.mark.integration], - "opensearch": [pytest.mark.opensearch], } for item in items: for name, markers in name_to_markers.items(): @@ -196,17 +195,7 @@ def infer_required_doc_store(item, keywords): # 2. if the test name contains the docstore name, we use that # 3. 
use an arbitrary one by calling set.pop() required_doc_store = None - all_doc_stores = { - "elasticsearch", - "faiss", - "sql", - "memory", - "milvus1", - "milvus", - "weaviate", - "pinecone", - "opensearch", - } + all_doc_stores = {"elasticsearch", "faiss", "sql", "memory", "milvus1", "milvus", "weaviate", "pinecone"} docstore_markers = set(keywords).intersection(all_doc_stores) if len(docstore_markers) > 1: # if parameterized infer the docstore from the parameter @@ -1099,18 +1088,6 @@ def get_document_store( knn_engine="faiss", ) - elif document_store_type == "opensearch": - document_store = OpenSearchDocumentStore( - index=index, - return_embedding=True, - embedding_dim=embedding_dim, - embedding_field=embedding_field, - similarity=similarity, - recreate_index=recreate_index, - port=9201, - knn_engine="nmslib", - ) - else: raise Exception(f"No document store fixture for '{document_store_type}'") From 1f96f3794bad3ab7f9e2d6d67b43f5bdf8203faf Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Thu, 27 Oct 2022 12:10:18 +0200 Subject: [PATCH 21/23] revert --- .github/workflows/tests.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b4a901dbe2..a4c70da90a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -666,6 +666,10 @@ jobs: run: | docker run -d -p 9200:9200 -e "discovery.type=single-node" -e "ES_JAVA_OPTS=-Xms128m -Xmx256m" elasticsearch:7.9.2 + - name: Run Opensearch + run: | + docker run -d -p 9201:9200 -p 9600:9600 -e "discovery.type=single-node" opensearchproject/opensearch:1.3.5 + - name: Run Milvus run: | cd ../../ # Avoid causing permission issues on hashFiles later by creating unreadable folders like "volumes" From 8f4b6aa8f921611918068490c55acc8fb5b19f06 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Thu, 27 Oct 2022 16:53:11 +0200 Subject: [PATCH 22/23] skip new tests in old jobs --- .github/workflows/tests.yml | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a4c70da90a..ad194d1b83 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -708,8 +708,9 @@ jobs: - name: Run tests env: TOKENIZERS_PARALLELISM: 'false' # Avoid logspam by tokenizers + # we add "and not document_store" to exclude the tests that were ported to the new strategy run: | - pytest ${{ env.PYTEST_PARAMS }} -m "integration" test/${{ matrix.folder }} + pytest ${{ env.PYTEST_PARAMS }} -m "integration and not document_store" test/${{ matrix.folder }} - name: Dump docker logs on failure if: failure() From 00aa0200a78573ae0f28f27aaa1d88d5ab9cf922 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Thu, 27 Oct 2022 17:32:27 +0200 Subject: [PATCH 23/23] skip opensearch_faiss --- test/document_stores/test_document_store.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/test/document_stores/test_document_store.py b/test/document_stores/test_document_store.py index f37d5ff0ae..88bdc8dc1f 100644 --- a/test/document_stores/test_document_store.py +++ b/test/document_stores/test_document_store.py @@ -1802,9 +1802,7 @@ def test_elasticsearch_brownfield_support(document_store_with_docs): @pytest.mark.parametrize( - "document_store", - ["faiss", "milvus1", "milvus", "weaviate", "opensearch_faiss", "opensearch", "elasticsearch", "memory"], - indirect=True, + "document_store", ["faiss", "milvus1", "milvus", "weaviate", "opensearch", "elasticsearch", "memory"], indirect=True ) def test_cosine_similarity(document_store: BaseDocumentStore): # below we will write documents to the store and then query it to see if vectors were normalized or not @@ -1846,9 +1844,7 @@ def test_cosine_similarity(document_store: BaseDocumentStore): @pytest.mark.parametrize( - "document_store", - ["faiss", "milvus1", "milvus", "weaviate", "opensearch_faiss", "opensearch", "elasticsearch", "memory"], - indirect=True, + "document_store", 
["faiss", "milvus1", "milvus", "weaviate", "opensearch", "elasticsearch", "memory"], indirect=True ) def test_update_embeddings_cosine_similarity(document_store: BaseDocumentStore): # below we will write documents to the store and then query it to see if vectors were normalized @@ -1908,7 +1904,7 @@ def embed_documents(self, docs): @pytest.mark.parametrize( "document_store_small", - ["faiss", "milvus1", "milvus", "weaviate", "memory", "elasticsearch", "opensearch", "opensearch_faiss"], + ["faiss", "milvus1", "milvus", "weaviate", "memory", "elasticsearch", "opensearch"], indirect=True, ) def test_cosine_sanity_check(document_store_small):