From d326140f0895c931c4734b7b4baf6490061c5a41 Mon Sep 17 00:00:00 2001 From: Nicola Procopio Date: Tue, 18 Jul 2023 11:49:35 +0200 Subject: [PATCH 01/22] added hybrid search example Added an example about hybrid search for faq pipeline on covid dataset --- examples/hybrid_search__faq_pipeline.py | 81 +++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 examples/hybrid_search__faq_pipeline.py diff --git a/examples/hybrid_search__faq_pipeline.py b/examples/hybrid_search__faq_pipeline.py new file mode 100644 index 0000000000..30dfed7a8b --- /dev/null +++ b/examples/hybrid_search__faq_pipeline.py @@ -0,0 +1,81 @@ +import logging + +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.getLogger("haystack").setLevel(logging.INFO) + +from haystack.document_stores import ElasticsearchDocumentStore + +from haystack.nodes import EmbeddingRetriever, BM25Retriever, JoinDocuments, SentenceTransformersRanker +from haystack.nodes.other.docs2answers import Docs2Answers +from haystack.utils import launch_es, print_answers, fetch_archive_from_http +import pandas as pd +from haystack.pipelines import Pipeline + + +def basic_faq_pipeline(): + document_store=InMemoryDocumentStore(use_bm25=True) + document_store = ElasticsearchDocumentStore( + host="localhost", + username="", + password="", + index="document", + embedding_field="question_emb", + embedding_dim=384, + excluded_meta_data=["question_emb"], + similarity="cosine", + ) + + sparse_retriever = BM25Retriever(document_store=document_store) + dense_retriever = EmbeddingRetriever( + document_store=document_store, + embedding_model="sentence-transformers/all-MiniLM-L6-v2", + use_gpu=True, + scale_score=False, + ) + + join_documents = JoinDocuments(join_mode="reciprocal_rank_fusion") + rerank = SentenceTransformersRanker("cross-encoder/ms-marco-MiniLM-L-6-v2") + + doc_to_answers = Docs2Answers() + + doc_dir = "data/basic_faq_pipeline" + s3_url = "https://core-engineering.s3.eu-central-1.amazonaws.com/public/scripts/small_faq_covid.csv1.zip" + fetch_archive_from_http(url=s3_url, output_dir=doc_dir) + + df = pd.read_csv(f"{doc_dir}/small_faq_covid.csv") + + # Minimal cleaning + df.fillna(value="", inplace=True) + df["question"] = df["question"].apply(lambda x: x.strip()) + print(df.head()) + + # Get embeddings for our questions from the FAQs + questions = list(df["question"].values) + df["question_emb"] = dense_retriever.embed_queries(queries=questions).tolist() + df = df.rename(columns={"question": "content"}) + + # Convert Dataframe to list of dicts and index them in our DocumentStore + docs_to_index = df.to_dict(orient="records") + document_store.write_documents(docs_to_index) + + # Initialize a Pipeline (this time without a reader) and ask questions + pipeline = Pipeline() + pipeline.add_node(component=sparse_retriever, name="SparseRetriever", inputs=["Query"]) + pipeline.add_node(component=dense_retriever, name="DenseRetriever", inputs=["Query"]) + pipeline.add_node(component=join_documents, name="JoinDocuments", inputs=["SparseRetriever", "DenseRetriever"]) + pipeline.add_node(component=rerank, name="ReRanker", inputs=["JoinDocuments"]) + pipeline.add_node(component=doc_to_answers, name="Docs2Answers", inputs=["ReRanker"]) + + # Ask a question + prediction = pipeline.run(query="How is the virus spreading?", params={"SparseRetriever": {"top_k": 10}, + "DenseRetriever": {"top_k": 10}, + "JoinDocuments":{"top_k_join": 15}, + "ReRanker": {"top_k": 5}}) + + print_answers(prediction, details="medium") + return prediction + + +if __name__ == "__main__": + launch_es() + basic_faq_pipeline() \ No newline at end of file From 53b63635660ff044813f5ed447b221d94b351660 Mon Sep 17 00:00:00 2001 From: Nicola Procopio Date: Tue, 18 Jul 2023 11:58:24 +0200 Subject: [PATCH 02/22] formatted with back formatter --- examples/hybrid_search__faq_pipeline.py | 46 ++++++++++++++++++------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/examples/hybrid_search__faq_pipeline.py b/examples/hybrid_search__faq_pipeline.py index 30dfed7a8b..a2b3e3d85d 100644 --- a/examples/hybrid_search__faq_pipeline.py +++ b/examples/hybrid_search__faq_pipeline.py @@ -1,11 +1,18 @@ import logging -logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.basicConfig( + format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING +) logging.getLogger("haystack").setLevel(logging.INFO) from haystack.document_stores import ElasticsearchDocumentStore -from haystack.nodes import EmbeddingRetriever, BM25Retriever, JoinDocuments, SentenceTransformersRanker +from haystack.nodes import ( + EmbeddingRetriever, + BM25Retriever, + JoinDocuments, + SentenceTransformersRanker, +) from haystack.nodes.other.docs2answers import Docs2Answers from haystack.utils import launch_es, print_answers, fetch_archive_from_http import pandas as pd @@ -13,7 +20,7 @@ def basic_faq_pipeline(): - document_store=InMemoryDocumentStore(use_bm25=True) + document_store = InMemoryDocumentStore(use_bm25=True) document_store = ElasticsearchDocumentStore( host="localhost", username="", @@ -60,17 +67,32 @@ def basic_faq_pipeline(): # Initialize a Pipeline (this time without a reader) and ask questions pipeline = Pipeline() - pipeline.add_node(component=sparse_retriever, name="SparseRetriever", inputs=["Query"]) - pipeline.add_node(component=dense_retriever, name="DenseRetriever", inputs=["Query"]) - pipeline.add_node(component=join_documents, name="JoinDocuments", inputs=["SparseRetriever", "DenseRetriever"]) + pipeline.add_node( + component=sparse_retriever, name="SparseRetriever", inputs=["Query"] + ) + pipeline.add_node( + component=dense_retriever, name="DenseRetriever", inputs=["Query"] + ) + pipeline.add_node( + component=join_documents, + name="JoinDocuments", + inputs=["SparseRetriever", "DenseRetriever"], + ) pipeline.add_node(component=rerank, name="ReRanker", inputs=["JoinDocuments"]) - pipeline.add_node(component=doc_to_answers, name="Docs2Answers", inputs=["ReRanker"]) + pipeline.add_node( + component=doc_to_answers, name="Docs2Answers", inputs=["ReRanker"] + ) # Ask a question - prediction = pipeline.run(query="How is the virus spreading?", params={"SparseRetriever": {"top_k": 10}, - "DenseRetriever": {"top_k": 10}, - "JoinDocuments":{"top_k_join": 15}, - "ReRanker": {"top_k": 5}}) + prediction = pipeline.run( + query="How is the virus spreading?", + params={ + "SparseRetriever": {"top_k": 10}, + "DenseRetriever": {"top_k": 10}, + "JoinDocuments": {"top_k_join": 15}, + "ReRanker": {"top_k": 5}, + }, + ) print_answers(prediction, details="medium") return prediction @@ -78,4 +100,4 @@ def basic_faq_pipeline(): if __name__ == "__main__": launch_es() - basic_faq_pipeline() \ No newline at end of file + basic_faq_pipeline() From 812fed5eded5314db8b2bd87eaf578a51a881fa1 Mon Sep 17 00:00:00 2001 From: nickprock Date: Wed, 19 Jul 2023 08:53:27 +0200 Subject: [PATCH 03/22] renamed document --- ...eline.py => hybrid_search_faq_pipeline.py} | 29 ++++--------------- 1 file changed, 6 insertions(+), 23 deletions(-) rename examples/{hybrid_search__faq_pipeline.py => hybrid_search_faq_pipeline.py} (76%) diff --git a/examples/hybrid_search__faq_pipeline.py b/examples/hybrid_search_faq_pipeline.py similarity index 76% rename from examples/hybrid_search__faq_pipeline.py rename to examples/hybrid_search_faq_pipeline.py index a2b3e3d85d..1c145b8b23 100644 --- a/examples/hybrid_search__faq_pipeline.py +++ b/examples/hybrid_search_faq_pipeline.py @@ -1,18 +1,11 @@ import logging -logging.basicConfig( - format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING -) +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) logging.getLogger("haystack").setLevel(logging.INFO) from haystack.document_stores import ElasticsearchDocumentStore -from haystack.nodes import ( - EmbeddingRetriever, - BM25Retriever, - JoinDocuments, - SentenceTransformersRanker, -) +from haystack.nodes import EmbeddingRetriever, BM25Retriever, JoinDocuments, SentenceTransformersRanker from haystack.nodes.other.docs2answers import Docs2Answers from haystack.utils import launch_es, print_answers, fetch_archive_from_http import pandas as pd @@ -67,21 +60,11 @@ def basic_faq_pipeline(): # Initialize a Pipeline (this time without a reader) and ask questions pipeline = Pipeline() - pipeline.add_node( - component=sparse_retriever, name="SparseRetriever", inputs=["Query"] - ) - pipeline.add_node( - component=dense_retriever, name="DenseRetriever", inputs=["Query"] - ) - pipeline.add_node( - component=join_documents, - name="JoinDocuments", - inputs=["SparseRetriever", "DenseRetriever"], - ) + pipeline.add_node(component=sparse_retriever, name="SparseRetriever", inputs=["Query"]) + pipeline.add_node(component=dense_retriever, name="DenseRetriever", inputs=["Query"]) + pipeline.add_node(component=join_documents, name="JoinDocuments", inputs=["SparseRetriever", "DenseRetriever"]) pipeline.add_node(component=rerank, name="ReRanker", inputs=["JoinDocuments"]) - pipeline.add_node( - component=doc_to_answers, name="Docs2Answers", inputs=["ReRanker"] - ) + pipeline.add_node(component=doc_to_answers, name="Docs2Answers", inputs=["ReRanker"]) # Ask a question prediction = pipeline.run( From 8694d735aea102fdc9ab5a94a27f2b749f3f417c Mon Sep 17 00:00:00 2001 From: nickprock Date: Wed, 19 Jul 2023 08:56:57 +0200 Subject: [PATCH 04/22] fixed --- examples/hybrid_search_faq_pipeline.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/hybrid_search_faq_pipeline.py b/examples/hybrid_search_faq_pipeline.py index 1c145b8b23..630bf1d970 100644 --- a/examples/hybrid_search_faq_pipeline.py +++ b/examples/hybrid_search_faq_pipeline.py @@ -13,7 +13,6 @@ def basic_faq_pipeline(): - document_store = InMemoryDocumentStore(use_bm25=True) document_store = ElasticsearchDocumentStore( host="localhost", username="", From 29c5c3ecfbdeb6348897a4ff19985817cfe6b718 Mon Sep 17 00:00:00 2001 From: nickprock Date: Wed, 19 Jul 2023 09:05:03 +0200 Subject: [PATCH 05/22] fixed typos --- examples/hybrid_search_faq_pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/hybrid_search_faq_pipeline.py b/examples/hybrid_search_faq_pipeline.py index 630bf1d970..dec60b6022 100644 --- a/examples/hybrid_search_faq_pipeline.py +++ b/examples/hybrid_search_faq_pipeline.py @@ -12,7 +12,7 @@ from haystack.pipelines import Pipeline -def basic_faq_pipeline(): +def hybrid_search_faq_pipeline(): document_store = ElasticsearchDocumentStore( host="localhost", username="", @@ -82,4 +82,4 @@ def basic_faq_pipeline(): if __name__ == "__main__": launch_es() - basic_faq_pipeline() + hybrid_search_faq_pipeline() From b7be547f1106f4b95e5d9f7a65d78497bd755a8a Mon Sep 17 00:00:00 2001 From: nickprock Date: Wed, 19 Jul 2023 09:09:11 +0200 Subject: [PATCH 06/22] added test added test for hybrid search --- examples/test_hybrid_search_faq_pipeline.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 examples/test_hybrid_search_faq_pipeline.py diff --git a/examples/test_hybrid_search_faq_pipeline.py b/examples/test_hybrid_search_faq_pipeline.py new file mode 100644 index 0000000000..047ea13bb5 --- /dev/null +++ b/examples/test_hybrid_search_faq_pipeline.py @@ -0,0 +1,19 @@ +from examples.hybrid_search_faq_pipeline import hybrid_search_faq_pipeline + +from haystack.schema import Answer, Document + + +def test_basic_faq_pipeline(): + prediction = hybrid_search_faq_pipeline() + + assert prediction is not None + assert prediction["query"] == "How is the virus spreading?" + + assert len(prediction["answers"]) == 5 # top-k of ReRanker + assert type(prediction["answers"][0]) == Answer + assert ( + prediction["answers"][0].answer + == """This virus was first detected in Wuhan City, Hubei Province, China. The first infections were linked to a live animal market, but the virus is now spreading from person-to-person. It’s important to note that person-to-person spread can happen on a continuum. Some viruses are highly contagious (like measles), while other viruses are less so.\n\nThe virus that causes COVID-19 seems to be spreading easily and sustainably in the community (“community spread”) in some affected geographic areas. Community spread means people have been infected with the virus in an area, including some who are not sure how or where they became infected.\n\nLearn what is known about the spread of newly emerged coronaviruses.""" + ) + assert prediction["answers"][0].score <= 1 + assert prediction["answers"][0].score >= 0 From c212d2f8d2d72a297541a4424d49ae4cea28cb31 Mon Sep 17 00:00:00 2001 From: nickprock Date: Wed, 19 Jul 2023 10:13:53 +0200 Subject: [PATCH 07/22] fixed withespaces --- examples/hybrid_search_faq_pipeline.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/hybrid_search_faq_pipeline.py b/examples/hybrid_search_faq_pipeline.py index dec60b6022..97da13e1f9 100644 --- a/examples/hybrid_search_faq_pipeline.py +++ b/examples/hybrid_search_faq_pipeline.py @@ -31,9 +31,8 @@ def hybrid_search_faq_pipeline(): use_gpu=True, scale_score=False, ) - join_documents = JoinDocuments(join_mode="reciprocal_rank_fusion") - rerank = SentenceTransformersRanker("cross-encoder/ms-marco-MiniLM-L-6-v2") + rerank = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-6-v2") doc_to_answers = Docs2Answers() From 90f699c65a5f81a8ad9f52a4c6039ea5e1ffa810 Mon Sep 17 00:00:00 2001 From: nickprock Date: Mon, 24 Jul 2023 10:23:32 +0200 Subject: [PATCH 08/22] removed test for hybrid search --- examples/test_hybrid_search_faq_pipeline.py | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 examples/test_hybrid_search_faq_pipeline.py diff --git a/examples/test_hybrid_search_faq_pipeline.py b/examples/test_hybrid_search_faq_pipeline.py deleted file mode 100644 index 047ea13bb5..0000000000 --- a/examples/test_hybrid_search_faq_pipeline.py +++ /dev/null @@ -1,19 +0,0 @@ -from examples.hybrid_search_faq_pipeline import hybrid_search_faq_pipeline - -from haystack.schema import Answer, Document - - -def test_basic_faq_pipeline(): - prediction = hybrid_search_faq_pipeline() - - assert prediction is not None - assert prediction["query"] == "How is the virus spreading?" - - assert len(prediction["answers"]) == 5 # top-k of ReRanker - assert type(prediction["answers"][0]) == Answer - assert ( - prediction["answers"][0].answer - == """This virus was first detected in Wuhan City, Hubei Province, China. The first infections were linked to a live animal market, but the virus is now spreading from person-to-person. It’s important to note that person-to-person spread can happen on a continuum. Some viruses are highly contagious (like measles), while other viruses are less so.\n\nThe virus that causes COVID-19 seems to be spreading easily and sustainably in the community (“community spread”) in some affected geographic areas. Community spread means people have been infected with the virus in an area, including some who are not sure how or where they became infected.\n\nLearn what is known about the spread of newly emerged coronaviruses.""" - ) - assert prediction["answers"][0].score <= 1 - assert prediction["answers"][0].score >= 0 From 05210fa8b3445dc82170f617189f7ff74573221f Mon Sep 17 00:00:00 2001 From: nickprock Date: Mon, 24 Jul 2023 10:31:21 +0200 Subject: [PATCH 09/22] fixed pylint --- examples/hybrid_search_faq_pipeline.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/hybrid_search_faq_pipeline.py b/examples/hybrid_search_faq_pipeline.py index 97da13e1f9..94d3b3589c 100644 --- a/examples/hybrid_search_faq_pipeline.py +++ b/examples/hybrid_search_faq_pipeline.py @@ -1,16 +1,16 @@ import logging -logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) -logging.getLogger("haystack").setLevel(logging.INFO) +import pandas as pd from haystack.document_stores import ElasticsearchDocumentStore - from haystack.nodes import EmbeddingRetriever, BM25Retriever, JoinDocuments, SentenceTransformersRanker from haystack.nodes.other.docs2answers import Docs2Answers from haystack.utils import launch_es, print_answers, fetch_archive_from_http -import pandas as pd from haystack.pipelines import Pipeline +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.getLogger("haystack").setLevel(logging.INFO) + def hybrid_search_faq_pipeline(): document_store = ElasticsearchDocumentStore( From bfd6c74437e913c876f0f9b20a73546f4a569b07 Mon Sep 17 00:00:00 2001 From: nickprock Date: Mon, 24 Jul 2023 10:56:24 +0200 Subject: [PATCH 10/22] commented logging --- examples/hybrid_search_faq_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/hybrid_search_faq_pipeline.py b/examples/hybrid_search_faq_pipeline.py index 94d3b3589c..2e2a86aca0 100644 --- a/examples/hybrid_search_faq_pipeline.py +++ b/examples/hybrid_search_faq_pipeline.py @@ -1,4 +1,4 @@ -import logging +# import logging import pandas as pd @@ -8,8 +8,8 @@ from haystack.utils import launch_es, print_answers, fetch_archive_from_http from haystack.pipelines import Pipeline -logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) -logging.getLogger("haystack").setLevel(logging.INFO) +# logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +# logging.getLogger("haystack").setLevel(logging.INFO) def hybrid_search_faq_pipeline(): From e4519bd288938b4e04054be435e4f69a7c2d2d90 Mon Sep 17 00:00:00 2001 From: Nicola Procopio Date: Wed, 4 Oct 2023 15:07:41 +0200 Subject: [PATCH 11/22] fixed bug in join_docs.py _concatenate_results --- haystack/nodes/other/join_docs.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/haystack/nodes/other/join_docs.py b/haystack/nodes/other/join_docs.py index 4185873a7c..2a8967e511 100644 --- a/haystack/nodes/other/join_docs.py +++ b/haystack/nodes/other/join_docs.py @@ -64,7 +64,7 @@ def run_accumulated(self, inputs: List[dict], top_k_join: Optional[int] = None): document_map = {doc.id: doc for result in results for doc in result} if self.join_mode == "concatenate": - scores_map = self._concatenate_results(results) + scores_map = self._concatenate_results(results, document_map) elif self.join_mode == "merge": scores_map = self._calculate_comb_sum(results) elif self.join_mode == "reciprocal_rank_fusion": @@ -118,11 +118,21 @@ def run_batch_accumulated(self, inputs: List[dict], top_k_join: Optional[int] = return output, "output_1" - def _concatenate_results(self, results): + def _concatenate_results(self, results, document_map): """ Concatenates multiple document result lists. """ - return {doc.id: doc.score for result in results for doc in result} + list_id = list(document_map.keys()) + scores_map = {} + for idx in list_id: + tmp = [] + for result in results: + for doc in result: + if doc.id==idx: + tmp.append(doc) + item_best_score = max(tmp, key=lambda x:x.score) + scores_map.update({idx: item_best_score.score}) + return scores_map def _calculate_comb_sum(self, results): """ From fddb1693a9647364fdf4c3215a06e2a20eba95a7 Mon Sep 17 00:00:00 2001 From: Nicola Procopio Date: Wed, 4 Oct 2023 17:20:37 +0200 Subject: [PATCH 12/22] Update join_docs.py updated comment --- haystack/nodes/other/join_docs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/haystack/nodes/other/join_docs.py b/haystack/nodes/other/join_docs.py index 2a8967e511..9e52bcf08c 100644 --- a/haystack/nodes/other/join_docs.py +++ b/haystack/nodes/other/join_docs.py @@ -121,6 +121,7 @@ def run_batch_accumulated(self, inputs: List[dict], top_k_join: Optional[int] = def _concatenate_results(self, results, document_map): """ Concatenates multiple document result lists. + Return the documents with the higher score. """ list_id = list(document_map.keys()) scores_map = {} From 05e3837406d98fc7f3ab37d854db81c80a9bb1b1 Mon Sep 17 00:00:00 2001 From: nickprock Date: Wed, 4 Oct 2023 19:55:01 +0200 Subject: [PATCH 13/22] format with black --- haystack/nodes/other/join_docs.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/haystack/nodes/other/join_docs.py b/haystack/nodes/other/join_docs.py index 9e52bcf08c..7ce0de819d 100644 --- a/haystack/nodes/other/join_docs.py +++ b/haystack/nodes/other/join_docs.py @@ -1,11 +1,10 @@ -from collections import defaultdict import logging +from collections import defaultdict from math import inf +from typing import List, Optional -from typing import Optional, List - -from haystack.schema import Document from haystack.nodes.other.join import JoinNode +from haystack.schema import Document logger = logging.getLogger(__name__) @@ -128,10 +127,10 @@ def _concatenate_results(self, results, document_map): for idx in list_id: tmp = [] for result in results: - for doc in result: - if doc.id==idx: - tmp.append(doc) - item_best_score = max(tmp, key=lambda x:x.score) + for doc in result: + if doc.id == idx: + tmp.append(doc) + item_best_score = max(tmp, key=lambda x: x.score) scores_map.update({idx: item_best_score.score}) return scores_map From cf72ea178a1867863396eb1597302ffd1bb16b87 Mon Sep 17 00:00:00 2001 From: nickprock Date: Thu, 5 Oct 2023 11:07:28 +0200 Subject: [PATCH 14/22] added releasenote on PR --- ...ocuments-concatenate-56a7cdba00a7248e.yaml | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 releasenotes/notes/fix-joinDocuments-concatenate-56a7cdba00a7248e.yaml diff --git a/releasenotes/notes/fix-joinDocuments-concatenate-56a7cdba00a7248e.yaml b/releasenotes/notes/fix-joinDocuments-concatenate-56a7cdba00a7248e.yaml new file mode 100644 index 0000000000..15e507e92a --- /dev/null +++ b/releasenotes/notes/fix-joinDocuments-concatenate-56a7cdba00a7248e.yaml @@ -0,0 +1,37 @@ +--- +prelude: > + Replace this text with content to appear at the top of the section for this + release. This is equivalent to the "Highlights" section we used before. + The prelude might repeat some details that are also present in other notes + from the same release, that's ok. Not every release note requires a prelude, + use it only to describe major features or notable changes. +upgrade: + - | + List upgrade notes here, or remove this section. + Upgrade notes should be rare: only list known/potential breaking changes, + or major changes that require user action before the upgrade. + Notes here must include steps that users can follow to 1. know if they're + affected and 2. handle the change gracefully on their end. +features: + - | + List new features here, or remove this section. +enhancements: + - | + List new behavior that is too small to be + considered a new feature, or remove this section. +issues: + - | + List known issues here, or remove this section. For example, if some change is experimental or known to not work in some cases, it should be mentioned here. +deprecations: + - | + List deprecations notes here, or remove this section. Deprecations should not be used for something that is removed in the release, use upgrade section instead. Deprecation should allow time for users to make necessary changes for the removal to happen in a future release. +security: + - | + Add security notes here, or remove this section. +fixes: + - | + Add normal bug fixes here, or remove this section. +preview: + - | + Add changes to Haystack version 2, or remove this section. + Haystack version 2 can be found under haystack/preview. From 7ae098da4adf32e967da6274f98b9fa8fdca38d9 Mon Sep 17 00:00:00 2001 From: nickprock Date: Mon, 9 Oct 2023 17:05:49 +0200 Subject: [PATCH 15/22] updated release notes --- ...ocuments-concatenate-56a7cdba00a7248e.yaml | 35 +------------------ 1 file changed, 1 insertion(+), 34 deletions(-) diff --git a/releasenotes/notes/fix-joinDocuments-concatenate-56a7cdba00a7248e.yaml b/releasenotes/notes/fix-joinDocuments-concatenate-56a7cdba00a7248e.yaml index 15e507e92a..e282f7b9fb 100644 --- a/releasenotes/notes/fix-joinDocuments-concatenate-56a7cdba00a7248e.yaml +++ b/releasenotes/notes/fix-joinDocuments-concatenate-56a7cdba00a7248e.yaml @@ -1,37 +1,4 @@ --- -prelude: > - Replace this text with content to appear at the top of the section for this - release. This is equivalent to the "Highlights" section we used before. - The prelude might repeat some details that are also present in other notes - from the same release, that's ok. Not every release note requires a prelude, - use it only to describe major features or notable changes. -upgrade: - - | - List upgrade notes here, or remove this section. - Upgrade notes should be rare: only list known/potential breaking changes, - or major changes that require user action before the upgrade. - Notes here must include steps that users can follow to 1. know if they're - affected and 2. handle the change gracefully on their end. -features: - - | - List new features here, or remove this section. enhancements: - | - List new behavior that is too small to be - considered a new feature, or remove this section. -issues: - - | - List known issues here, or remove this section. For example, if some change is experimental or known to not work in some cases, it should be mentioned here. -deprecations: - - | - List deprecations notes here, or remove this section. Deprecations should not be used for something that is removed in the release, use upgrade section instead. Deprecation should allow time for users to make necessary changes for the removal to happen in a future release. -security: - - | - Add security notes here, or remove this section. -fixes: - - | - Add normal bug fixes here, or remove this section. -preview: - - | - Add changes to Haystack version 2, or remove this section. - Haystack version 2 can be found under haystack/preview. + Make JoinDocuments return only the document with the highest score if there are duplicate documents in the list. From f15bcbb3f06d813984c047bf721a0f9b11f25c38 Mon Sep 17 00:00:00 2001 From: nickprock Date: Mon, 9 Oct 2023 17:19:45 +0200 Subject: [PATCH 16/22] updated test_join_documents --- test/nodes/test_join_documents.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/test/nodes/test_join_documents.py b/test/nodes/test_join_documents.py index 92cf5c8828..03058aa298 100644 --- a/test/nodes/test_join_documents.py +++ b/test/nodes/test_join_documents.py @@ -54,3 +54,23 @@ def test_joindocuments_preserves_root_node(): join_docs = JoinDocuments() result, _ = join_docs.run(inputs) assert result["root_node"] == "File" + + +@pytest.mark.unit +@pytest.mark.parametrize("join_mode", ["concatenate", "merge", "reciprocal_rank_fusion"]) +def test_joindocuments_keep_only_highest_ranking_duplicate(join_mode): + inputs = [ + {"documents": [Document(content="text document 1", content_type="text", score=0.2)]}, + {"documents": [Document(content="text document 2", content_type="text", score=0.3)]}, + {"documents": [Document(content="text document 2", content_type="text", score=0.7)]}, + ] + expected_outputs = [ + {"documents": [Document(content="text document 1", content_type="text", score=0.2)]}, + {"documents": [Document(content="text document 2", content_type="text", score=0.7)]}, + ] + + join_docs = JoinDocuments(join_mode=join_mode) + result, _ = join_docs.run(inputs) + assert len(result["documents"]) == 2 + if join_mode == "concatenate": + assert result["documents"] == expected_outputs From dbb96d3ced50064c31916632c8f1fb309400cde8 Mon Sep 17 00:00:00 2001 From: nickprock Date: Mon, 9 Oct 2023 17:40:57 +0200 Subject: [PATCH 17/22] updated test --- test/nodes/test_join_documents.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/nodes/test_join_documents.py b/test/nodes/test_join_documents.py index 03058aa298..aee77f005b 100644 --- a/test/nodes/test_join_documents.py +++ b/test/nodes/test_join_documents.py @@ -58,7 +58,7 @@ def test_joindocuments_preserves_root_node(): @pytest.mark.unit @pytest.mark.parametrize("join_mode", ["concatenate", "merge", "reciprocal_rank_fusion"]) -def test_joindocuments_keep_only_highest_ranking_duplicate(join_mode): +def test_joindocuments_concatenate_keep_only_highest_ranking_duplicate(join_mode): inputs = [ {"documents": [Document(content="text document 1", content_type="text", score=0.2)]}, {"documents": [Document(content="text document 2", content_type="text", score=0.3)]}, @@ -74,3 +74,5 @@ def test_joindocuments_keep_only_highest_ranking_duplicate(join_mode): assert len(result["documents"]) == 2 if join_mode == "concatenate": assert result["documents"] == expected_outputs + else: + pass From acd50091d4939945e7fd9db93458e320e8d70f7b Mon Sep 17 00:00:00 2001 From: nickprock Date: Mon, 9 Oct 2023 18:04:50 +0200 Subject: [PATCH 18/22] updated test --- test/nodes/test_join_documents.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/nodes/test_join_documents.py b/test/nodes/test_join_documents.py index aee77f005b..db6f3498b7 100644 --- a/test/nodes/test_join_documents.py +++ b/test/nodes/test_join_documents.py @@ -65,8 +65,8 @@ def test_joindocuments_concatenate_keep_only_highest_ranking_duplicate(join_mode {"documents": [Document(content="text document 2", content_type="text", score=0.7)]}, ] expected_outputs = [ - {"documents": [Document(content="text document 1", content_type="text", score=0.2)]}, {"documents": [Document(content="text document 2", content_type="text", score=0.7)]}, + {"documents": [Document(content="text document 1", content_type="text", score=0.2)]}, ] join_docs = JoinDocuments(join_mode=join_mode) From cc2f5ec6aa2998418182c6ede0193bed27a7ab50 Mon Sep 17 00:00:00 2001 From: Nicola Procopio Date: Mon, 9 Oct 2023 18:37:59 +0200 Subject: [PATCH 19/22] Update test_join_documents.py --- test/nodes/test_join_documents.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/test/nodes/test_join_documents.py b/test/nodes/test_join_documents.py index db6f3498b7..77be3111a4 100644 --- a/test/nodes/test_join_documents.py +++ b/test/nodes/test_join_documents.py @@ -60,19 +60,17 @@ def test_joindocuments_preserves_root_node(): @pytest.mark.parametrize("join_mode", ["concatenate", "merge", "reciprocal_rank_fusion"]) def test_joindocuments_concatenate_keep_only_highest_ranking_duplicate(join_mode): inputs = [ - {"documents": [Document(content="text document 1", content_type="text", score=0.2)]}, - {"documents": [Document(content="text document 2", content_type="text", score=0.3)]}, - {"documents": [Document(content="text document 2", content_type="text", score=0.7)]}, - ] - expected_outputs = [ + {"documents": [Document(content="text document 1", content_type="text", score=0.2), + Document(content="text document 2", content_type="text", score=0.3)]}, {"documents": [Document(content="text document 2", content_type="text", score=0.7)]}, - {"documents": [Document(content="text document 1", content_type="text", score=0.2)]}, ] + expected_outputs = {"documents": [Document(content="text document 2", content_type="text", score=0.7), + Document(content="text document 1", content_type="text", score=0.2)]} join_docs = JoinDocuments(join_mode=join_mode) result, _ = join_docs.run(inputs) assert len(result["documents"]) == 2 if join_mode == "concatenate": - assert result["documents"] == expected_outputs + assert result['documents'] == expected_outputs['documents'] else: pass From dced12f6aea7761a61365045bda11a6dca67f958 Mon Sep 17 00:00:00 2001 From: nickprock Date: Mon, 9 Oct 2023 18:41:26 +0200 Subject: [PATCH 20/22] formatted with black --- test/nodes/test_join_documents.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/test/nodes/test_join_documents.py b/test/nodes/test_join_documents.py index 77be3111a4..f10155fdb0 100644 --- a/test/nodes/test_join_documents.py +++ b/test/nodes/test_join_documents.py @@ -60,17 +60,25 @@ def test_joindocuments_preserves_root_node(): @pytest.mark.parametrize("join_mode", ["concatenate", "merge", "reciprocal_rank_fusion"]) def test_joindocuments_concatenate_keep_only_highest_ranking_duplicate(join_mode): inputs = [ - {"documents": [Document(content="text document 1", content_type="text", score=0.2), - Document(content="text document 2", content_type="text", score=0.3)]}, + { + "documents": [ + Document(content="text document 1", content_type="text", score=0.2), + Document(content="text document 2", content_type="text", score=0.3), + ] + }, {"documents": [Document(content="text document 2", content_type="text", score=0.7)]}, ] - expected_outputs = {"documents": [Document(content="text document 2", content_type="text", score=0.7), - Document(content="text document 1", content_type="text", score=0.2)]} + expected_outputs = { + "documents": [ + Document(content="text document 2", content_type="text", score=0.7), + Document(content="text document 1", content_type="text", score=0.2), + ] + } join_docs = JoinDocuments(join_mode=join_mode) result, _ = join_docs.run(inputs) assert len(result["documents"]) == 2 if join_mode == "concatenate": - assert result['documents'] == expected_outputs['documents'] + assert result["documents"] == expected_outputs["documents"] else: pass From 7735deb97ba0d0468a62950b19eeefba9ac4086d Mon Sep 17 00:00:00 2001 From: nickprock Date: Tue, 10 Oct 2023 11:00:21 +0200 Subject: [PATCH 21/22] fixed test --- test/nodes/test_join_documents.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/test/nodes/test_join_documents.py b/test/nodes/test_join_documents.py index f10155fdb0..5d4bac6b99 100644 --- a/test/nodes/test_join_documents.py +++ b/test/nodes/test_join_documents.py @@ -57,7 +57,6 @@ def test_joindocuments_preserves_root_node(): @pytest.mark.unit -@pytest.mark.parametrize("join_mode", ["concatenate", "merge", "reciprocal_rank_fusion"]) def test_joindocuments_concatenate_keep_only_highest_ranking_duplicate(join_mode): inputs = [ { @@ -75,10 +74,7 @@ def test_joindocuments_concatenate_keep_only_highest_ranking_duplicate(join_mode ] } - join_docs = JoinDocuments(join_mode=join_mode) + join_docs = JoinDocuments(join_mode="concatenate") result, _ = join_docs.run(inputs) assert len(result["documents"]) == 2 - if join_mode == "concatenate": - assert result["documents"] == expected_outputs["documents"] - else: - pass + assert result["documents"] == expected_outputs["documents"] From 3ee1c5bde81f09cdfee6ca497cd4633f5d15d2c7 Mon Sep 17 00:00:00 2001 From: nickprock Date: Tue, 10 Oct 2023 11:22:48 +0200 Subject: [PATCH 22/22] fixed --- test/nodes/test_join_documents.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/nodes/test_join_documents.py b/test/nodes/test_join_documents.py index 5d4bac6b99..aa303e26b2 100644 --- a/test/nodes/test_join_documents.py +++ b/test/nodes/test_join_documents.py @@ -57,7 +57,7 @@ def test_joindocuments_preserves_root_node(): @pytest.mark.unit -def test_joindocuments_concatenate_keep_only_highest_ranking_duplicate(join_mode): +def test_joindocuments_concatenate_keep_only_highest_ranking_duplicate(): inputs = [ { "documents": [