From 945a6bbf8f98395f8f3eb14d93767ccf00c2d6f0 Mon Sep 17 00:00:00 2001 From: Adrien Wald Date: Wed, 9 Feb 2022 14:44:38 +0000 Subject: [PATCH 1/7] use get_all_documents in ElasticsearchFilterOnlyRetriever.retrieve --- haystack/nodes/retriever/sparse.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/haystack/nodes/retriever/sparse.py b/haystack/nodes/retriever/sparse.py index dcd138f8b9..84dc16140c 100644 --- a/haystack/nodes/retriever/sparse.py +++ b/haystack/nodes/retriever/sparse.py @@ -139,20 +139,16 @@ def retrieve( Scan through documents in DocumentStore and return a small number documents that are most relevant to the query. - :param query: The query + :param query: Has no effect, can pass in empty string :param filters: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field - :param top_k: How many documents to return per query. + :param top_k: Has no effect, pass in any int or None :param index: The name of the index in the DocumentStore from which to retrieve documents :param headers: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. """ - if top_k is None: - top_k = self.top_k if index is None: index = self.document_store.index - documents = self.document_store.query( - query=None, filters=filters, top_k=top_k, custom_query=self.custom_query, index=index, headers=headers - ) + documents = self.document_store.get_all_documents(filters=filters, index=index, headers=headers) return documents From 0d67d840183e24d759d28766a771264d5c948315 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 19 Apr 2022 15:20:57 +0000 Subject: [PATCH 2/7] Update Documentation & Code Style --- docs/_src/api/api/retriever.md | 4 ++-- docs/_src/api/openapi/openapi-1.3.1rc0.json | 9 ++++++++- docs/_src/api/openapi/openapi.json | 9 ++++++++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md index bb75023bb9..dc46aec859 100644 --- a/docs/_src/api/api/retriever.md +++ b/docs/_src/api/api/retriever.md @@ -229,9 +229,9 @@ that are most relevant to the query. **Arguments**: -- `query`: The query +- `query`: Has no effect, can pass in empty string - `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field -- `top_k`: How many documents to return per query. +- `top_k`: Has no effect, pass in any int or None - `index`: The name of the index in the DocumentStore from which to retrieve documents - `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. diff --git a/docs/_src/api/openapi/openapi-1.3.1rc0.json b/docs/_src/api/openapi/openapi-1.3.1rc0.json index f9961cf3f4..8fff7c9626 100644 --- a/docs/_src/api/openapi/openapi-1.3.1rc0.json +++ b/docs/_src/api/openapi/openapi-1.3.1rc0.json @@ -867,7 +867,14 @@ "title": "Location", "type": "array", "items": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] } }, "msg": { diff --git a/docs/_src/api/openapi/openapi.json b/docs/_src/api/openapi/openapi.json index f9961cf3f4..8fff7c9626 100644 --- a/docs/_src/api/openapi/openapi.json +++ b/docs/_src/api/openapi/openapi.json @@ -867,7 +867,14 @@ "title": "Location", "type": "array", "items": { - "type": "string" + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] } }, "msg": { From fcfa14c7225d08132808f634d852f299b4a7aff3 Mon Sep 17 00:00:00 2001 From: Adrien Wald Date: Tue, 19 Apr 2022 08:31:41 -0700 Subject: [PATCH 3/7] add test case for es_filter_only retriever --- test/test_retriever.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/test/test_retriever.py b/test/test_retriever.py index c31faa5353..6c55b23d34 100644 --- a/test/test_retriever.py +++ b/test/test_retriever.py @@ -563,3 +563,24 @@ def test_embeddings_encoder_of_embedding_retriever_should_warn_about_model_forma "You may need to set 'model_format='sentence_transformers' to ensure correct loading of model." in caplog.text ) + +@pytest.mark.parametrize("retriever", ["es_filter_only"], indirect=True) +@pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True) +def test_es_filter_only(document_store, retriever): + docs = [ + Document(content="Doc1", meta={"f1": "0"}), + Document(content="Doc2", meta={"f1": "0"}), + Document(content="Doc3", meta={"f1": "0"}), + Document(content="Doc4", meta={"f1": "0"}), + Document(content="Doc5", meta={"f1": "0"}), + Document(content="Doc6", meta={"f1": "0"}), + Document(content="Doc7", meta={"f1": "1"}), + Document(content="Doc8", meta={"f1": "0"}), + Document(content="Doc9", meta={"f1": "0"}), + Document(content="Doc10", meta={"f1": "0"}), + Document(content="Doc11", meta={"f1": "0"}), + Document(content="Doc12", meta={"f1": "0"}) + ] + document_store.write_documents(docs) + retrieved_docs = retriever.retrieve(filters={"f1": ["0"]}) + assert len(retrieved_docs) == 11 From 3422497dfa953314da95bd82c1e5e7a8c1888da8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 19 Apr 2022 15:34:07 +0000 Subject: [PATCH 4/7] Update Documentation & Code Style --- test/test_retriever.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_retriever.py b/test/test_retriever.py index 6c55b23d34..02da84eb47 100644 --- a/test/test_retriever.py +++ b/test/test_retriever.py @@ -564,6 +564,7 @@ def test_embeddings_encoder_of_embedding_retriever_should_warn_about_model_forma in caplog.text ) + @pytest.mark.parametrize("retriever", ["es_filter_only"], indirect=True) @pytest.mark.parametrize("document_store", ["elasticsearch"], indirect=True) def test_es_filter_only(document_store, retriever): @@ -579,7 +580,7 @@ def test_es_filter_only(document_store, retriever): Document(content="Doc9", meta={"f1": "0"}), Document(content="Doc10", meta={"f1": "0"}), Document(content="Doc11", meta={"f1": "0"}), - Document(content="Doc12", meta={"f1": "0"}) + Document(content="Doc12", meta={"f1": "0"}), ] document_store.write_documents(docs) retrieved_docs = retriever.retrieve(filters={"f1": ["0"]}) From 826f42fed6da85971f7e0e8489034d9c6f99f3e3 Mon Sep 17 00:00:00 2001 From: Adrien Wald Date: Wed, 20 Apr 2022 13:22:56 -0700 Subject: [PATCH 5/7] fix test by adding empty string for query --- test/test_retriever.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_retriever.py b/test/test_retriever.py index 6c55b23d34..87f4a66de9 100644 --- a/test/test_retriever.py +++ b/test/test_retriever.py @@ -582,5 +582,5 @@ def test_es_filter_only(document_store, retriever): Document(content="Doc12", meta={"f1": "0"}) ] document_store.write_documents(docs) - retrieved_docs = retriever.retrieve(filters={"f1": ["0"]}) + retrieved_docs = retriever.retrieve('', filters={"f1": ["0"]}) assert len(retrieved_docs) == 11 From 47e51d0db913c3329017875e46faab10239714c1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 20 Apr 2022 20:25:09 +0000 Subject: [PATCH 6/7] Update Documentation & Code Style --- test/test_retriever.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_retriever.py b/test/test_retriever.py index 680ed62166..2ac78c670d 100644 --- a/test/test_retriever.py +++ b/test/test_retriever.py @@ -583,5 +583,5 @@ def test_es_filter_only(document_store, retriever): Document(content="Doc12", meta={"f1": "0"}), ] document_store.write_documents(docs) - retrieved_docs = retriever.retrieve('', filters={"f1": ["0"]}) + retrieved_docs = retriever.retrieve("", filters={"f1": ["0"]}) assert len(retrieved_docs) == 11 From a78562539e639df03378ed41b5849edf32f36d7c Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 25 Apr 2022 09:20:44 +0200 Subject: [PATCH 7/7] add explicit name of argument "query" --- test/test_retriever.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_retriever.py b/test/test_retriever.py index 2ac78c670d..d0e8509117 100644 --- a/test/test_retriever.py +++ b/test/test_retriever.py @@ -583,5 +583,5 @@ def test_es_filter_only(document_store, retriever): Document(content="Doc12", meta={"f1": "0"}), ] document_store.write_documents(docs) - retrieved_docs = retriever.retrieve("", filters={"f1": ["0"]}) + retrieved_docs = retriever.retrieve(query="", filters={"f1": ["0"]}) assert len(retrieved_docs) == 11