diff --git a/haystack/nodes/other/join_docs.py b/haystack/nodes/other/join_docs.py
index 4185873a7c..7ce0de819d 100644
--- a/haystack/nodes/other/join_docs.py
+++ b/haystack/nodes/other/join_docs.py
@@ -1,11 +1,10 @@
-from collections import defaultdict
 import logging
+from collections import defaultdict
 from math import inf
+from typing import List, Optional
 
-from typing import Optional, List
-
-from haystack.schema import Document
 from haystack.nodes.other.join import JoinNode
+from haystack.schema import Document
 
 logger = logging.getLogger(__name__)
 
@@ -64,7 +63,7 @@ def run_accumulated(self, inputs: List[dict], top_k_join: Optional[int] = None):
         document_map = {doc.id: doc for result in results for doc in result}
 
         if self.join_mode == "concatenate":
-            scores_map = self._concatenate_results(results)
+            scores_map = self._concatenate_results(results, document_map)
         elif self.join_mode == "merge":
             scores_map = self._calculate_comb_sum(results)
         elif self.join_mode == "reciprocal_rank_fusion":
@@ -118,11 +117,29 @@ def run_batch_accumulated(self, inputs: List[dict], top_k_join: Optional[int] =
 
         return output, "output_1"
 
-    def _concatenate_results(self, results):
+    def _concatenate_results(self, results, document_map):
         """
         Concatenates multiple document result lists.
+        If a document appears in more than one list, only its highest score is kept.
+        A missing score (None) ranks below every real score.
+
+        :param results: Lists of documents, one list per joined input.
+        :param document_map: Maps document id to document; only these ids are scored.
+        :return: Dictionary mapping each scored document id to its best score.
         """
-        return {doc.id: doc.score for result in results for doc in result}
+        scores_map = {}
+        for result in results:
+            for doc in result:
+                if doc.id not in document_map:
+                    continue
+                if doc.id not in scores_map:
+                    scores_map[doc.id] = doc.score
+                    continue
+                best_score = scores_map[doc.id]
+                # None never replaces a stored score; any real score replaces None.
+                if doc.score is not None and (best_score is None or doc.score > best_score):
+                    scores_map[doc.id] = doc.score
+        return scores_map
 
     def _calculate_comb_sum(self, results):
         """
diff --git a/releasenotes/notes/fix-joinDocuments-concatenate-56a7cdba00a7248e.yaml b/releasenotes/notes/fix-joinDocuments-concatenate-56a7cdba00a7248e.yaml
new file mode 100644
index 0000000000..e282f7b9fb
--- /dev/null
+++ b/releasenotes/notes/fix-joinDocuments-concatenate-56a7cdba00a7248e.yaml
@@ -0,0 +1,4 @@
+---
+enhancements:
+  - |
+    Make JoinDocuments return only the document with the highest score if there are duplicate documents in the list.
diff --git a/test/nodes/test_join_documents.py b/test/nodes/test_join_documents.py
index 92cf5c8828..aa303e26b2 100644
--- a/test/nodes/test_join_documents.py
+++ b/test/nodes/test_join_documents.py
@@ -54,3 +54,27 @@ def test_joindocuments_preserves_root_node():
     join_docs = JoinDocuments()
     result, _ = join_docs.run(inputs)
     assert result["root_node"] == "File"
+
+
+@pytest.mark.unit
+def test_joindocuments_concatenate_keep_only_highest_ranking_duplicate():
+    inputs = [
+        {
+            "documents": [
+                Document(content="text document 1", content_type="text", score=0.2),
+                Document(content="text document 2", content_type="text", score=0.3),
+            ]
+        },
+        {"documents": [Document(content="text document 2", content_type="text", score=0.7)]},
+    ]
+    expected_outputs = {
+        "documents": [
+            Document(content="text document 2", content_type="text", score=0.7),
+            Document(content="text document 1", content_type="text", score=0.2),
+        ]
+    }
+
+    join_docs = JoinDocuments(join_mode="concatenate")
+    result, _ = join_docs.run(inputs)
+    assert len(result["documents"]) == 2
+    assert result["documents"] == expected_outputs["documents"]