Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fixed join_docs.py concatenate #5970

Merged
merged 47 commits into from
Oct 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
d326140
added hybrid search example
nickprock Jul 18, 2023
53b6363
formatted with back formatter
nickprock Jul 18, 2023
812fed5
renamed document
nickprock Jul 19, 2023
277833b
Merge branch 'dev/example' into main
nickprock Jul 19, 2023
8694d73
fixed
nickprock Jul 19, 2023
499abac
Merge branch 'dev/example' into main
nickprock Jul 19, 2023
29c5c3e
fixed typos
nickprock Jul 19, 2023
e88f310
Merge branch 'dev/example' into main
nickprock Jul 19, 2023
b7be547
added test
nickprock Jul 19, 2023
80be4a1
Merge branch 'dev/example' into main
nickprock Jul 19, 2023
c212d2f
fixed withespaces
nickprock Jul 19, 2023
d725d9d
Merge branch 'dev/example' into main
nickprock Jul 19, 2023
442f73e
Merge branch 'main' into main
nickprock Jul 20, 2023
90f699c
removed test for hybrid search
nickprock Jul 24, 2023
05210fa
fixed pylint
nickprock Jul 24, 2023
66b8f9a
Merge branch 'dev/example' into main
nickprock Jul 24, 2023
e9af129
Merge branch 'main' into main
nickprock Jul 24, 2023
bfd6c74
commented logging
nickprock Jul 24, 2023
9d79f7c
Merge branch 'dev/example' into main
nickprock Jul 24, 2023
d8e1a2a
Merge branch 'deepset-ai:main' into main
nickprock Sep 30, 2023
e4519bd
fixed bug in join_docs.py _concatenate_results
nickprock Oct 4, 2023
1ca525c
Merge branch 'deepset-ai:main' into main
nickprock Oct 4, 2023
fddb169
Update join_docs.py
nickprock Oct 4, 2023
0caa0d7
Merge branch 'main' into main
nickprock Oct 4, 2023
05e3837
format with black
nickprock Oct 4, 2023
cf72ea1
added releasenote on PR
nickprock Oct 5, 2023
d0ba0b1
Merge branch 'main' into dev/join_docs
nickprock Oct 5, 2023
b0314a0
Merge branch 'main' into dev/join_docs
nickprock Oct 6, 2023
7587328
Merge branch 'main' into dev/join_docs
nickprock Oct 6, 2023
dc70569
Merge branch 'main' into dev/join_docs
nickprock Oct 9, 2023
7ae098d
updated release notes
nickprock Oct 9, 2023
f15bcbb
updated test_join_documents
nickprock Oct 9, 2023
dbb96d3
updated test
nickprock Oct 9, 2023
37c3092
Merge branch 'main' into dev/join_docs
nickprock Oct 9, 2023
acd5009
updated test
nickprock Oct 9, 2023
c47e0c7
Merge remote-tracking branch 'origin/dev/join_docs' into dev/join_docs
nickprock Oct 9, 2023
cc2f5ec
Update test_join_documents.py
nickprock Oct 9, 2023
dced12f
formatted with black
nickprock Oct 9, 2023
aa9552d
Merge branch 'main' into dev/join_docs
nickprock Oct 9, 2023
021a3f2
Merge branch 'main' into dev/join_docs
nickprock Oct 10, 2023
7735deb
fixed test
nickprock Oct 10, 2023
3ee1c5b
fixed
nickprock Oct 10, 2023
3846103
Merge branch 'main' into dev/join_docs
nickprock Oct 11, 2023
83fcab0
Merge branch 'main' into dev/join_docs
nickprock Oct 12, 2023
471b201
Merge branch 'main' into dev/join_docs
nickprock Oct 12, 2023
6dfb4c2
Merge branch 'main' into dev/join_docs
anakin87 Oct 13, 2023
02b0c97
Merge branch 'main' into dev/join_docs
nickprock Oct 13, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 17 additions & 7 deletions haystack/nodes/other/join_docs.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
from collections import defaultdict
import logging
from collections import defaultdict
from math import inf
from typing import List, Optional

from typing import Optional, List

from haystack.schema import Document
from haystack.nodes.other.join import JoinNode
from haystack.schema import Document

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -64,7 +63,7 @@ def run_accumulated(self, inputs: List[dict], top_k_join: Optional[int] = None):
document_map = {doc.id: doc for result in results for doc in result}

if self.join_mode == "concatenate":
scores_map = self._concatenate_results(results)
scores_map = self._concatenate_results(results, document_map)
elif self.join_mode == "merge":
scores_map = self._calculate_comb_sum(results)
elif self.join_mode == "reciprocal_rank_fusion":
Expand Down Expand Up @@ -118,11 +117,22 @@ def run_batch_accumulated(self, inputs: List[dict], top_k_join: Optional[int] =

return output, "output_1"

def _concatenate_results(self, results, document_map=None):
    """
    Concatenates multiple document result lists, keeping one score per document id.

    When the same document id occurs more than once (inside one result list or
    across several), only its highest score is kept.

    :param results: Iterable of result lists; every item must expose `id` and `score`.
    :param document_map: Optional id -> document mapping. It is accepted so that callers
        passing it keep working, but it is not needed for the computation: every id is
        discovered while walking `results`.
    :return: Dict mapping each document id to its best score. Ids appear in the order
        in which they are first seen in `results`.
    """
    scores_map = {}
    # Single pass over all documents instead of rescanning every result list per id.
    for result in results:
        for doc in result:
            if doc.id not in scores_map:
                scores_map[doc.id] = doc.score
                continue
            best_so_far = scores_map[doc.id]
            # A missing score (None) never replaces a real one, and a real score
            # always replaces a missing one; this avoids comparing None with a number.
            if doc.score is not None and (best_so_far is None or doc.score > best_so_far):
                scores_map[doc.id] = doc.score
    return scores_map

def _calculate_comb_sum(self, results):
"""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
enhancements:
- |
Make JoinDocuments return only the document with the highest score if there are duplicate documents in the list.
24 changes: 24 additions & 0 deletions test/nodes/test_join_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,27 @@ def test_joindocuments_preserves_root_node():
join_docs = JoinDocuments()
result, _ = join_docs.run(inputs)
assert result["root_node"] == "File"


@pytest.mark.unit
def test_joindocuments_concatenate_keep_only_highest_ranking_duplicate():
    """In "concatenate" mode a duplicated document is returned once, carrying its highest score."""
    # Two incoming branches; "text document 2" appears in both with different scores.
    first_branch = {
        "documents": [
            Document(content="text document 1", content_type="text", score=0.2),
            Document(content="text document 2", content_type="text", score=0.3),
        ]
    }
    second_branch = {"documents": [Document(content="text document 2", content_type="text", score=0.7)]}

    # The duplicate keeps its 0.7 score and is listed ahead of the 0.2 document.
    expected_documents = [
        Document(content="text document 2", content_type="text", score=0.7),
        Document(content="text document 1", content_type="text", score=0.2),
    ]

    joiner = JoinDocuments(join_mode="concatenate")
    output, _ = joiner.run([first_branch, second_branch])
    joined_documents = output["documents"]

    assert len(joined_documents) == 2
    assert joined_documents == expected_documents