Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update document scores based on ranker node #2048

Merged
merged 13 commits into from
Jun 27, 2022
Prev Previous commit
Next Next commit
Merge branch 'master' into add_score_to_ranker
# Conflicts:
#	haystack/nodes/ranker/sentence_transformers.py
mathislucka committed Jun 23, 2022

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
commit 472c4d63c994ee90e504e0cf4087bdeb5f382ac4
136 changes: 136 additions & 0 deletions haystack/nodes/ranker/sentence_transformers.py
Original file line number Diff line number Diff line change
@@ -134,3 +134,139 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] =
sorted_documents.append(doc)

return sorted_documents

def predict_batch(
    self,
    queries: List[str],
    documents: Union[List[Document], List[List[Document]]],
    top_k: Optional[int] = None,
    batch_size: Optional[int] = None,
) -> Union[List[Document], List[List[Document]]]:
    """
    Use loaded ranker model to re-rank the supplied lists of Documents.

    Returns lists of Documents sorted by (desc.) similarity with the corresponding queries.

    - If you provide a list containing a single query...

        - ... and a single list of Documents, the single list of Documents will be re-ranked based on the
          supplied query.
        - ... and a list of lists of Documents, each list of Documents will be re-ranked individually based on the
          supplied query.

    - If you provide a list of multiple queries...

        - ... you need to provide a list of lists of Documents. Each list of Documents will be re-ranked based on
          its corresponding query.

    :param queries: Single query string or list of queries.
    :param documents: Single list of Documents or list of lists of Documents to be reranked.
    :param top_k: The maximum number of documents to return per Document list.
    :param batch_size: Number of Documents to process at a time.
    """
    if top_k is None:
        top_k = self.top_k

    if batch_size is None:
        batch_size = self.batch_size

    number_of_docs, all_queries, all_docs, single_list_of_docs = self._preprocess_batch_queries_and_docs(
        queries=queries, documents=documents
    )

    batches = self._get_batches(all_queries=all_queries, all_docs=all_docs, batch_size=batch_size)
    preds = []
    for cur_queries, cur_docs in batches:
        features = self.transformer_tokenizer(
            cur_queries, [doc.content for doc in cur_docs], padding=True, truncation=True, return_tensors="pt"
        ).to(self.devices[0])

        with torch.no_grad():
            similarity_scores = self.transformer_model(**features).logits
        preds.extend(similarity_scores)

    # BUG FIX: sort/group by the predictions accumulated over *all* batches
    # (`preds`), not by `similarity_scores`, which only holds the scores of
    # the last batch (and is undefined when there were no batches at all).
    # Each entry in `preds` is a 1-D logits tensor of length `logits_dim`.
    logits_dim = preds[0].shape[0] if preds else 0
    if single_list_of_docs:
        sorted_scores_and_documents = sorted(
            zip(preds, documents),
            key=lambda similarity_document_tuple:
            # assume the last element in logits represents the `has_answer` label
            similarity_document_tuple[0][-1] if logits_dim >= 2 else similarity_document_tuple[0],
            reverse=True,
        )

        # rank documents according to scores
        sorted_documents = [doc for _, doc in sorted_scores_and_documents if isinstance(doc, Document)]
        return sorted_documents[:top_k]
    else:
        # Group the flat prediction list back into per-query groups so each
        # list of Documents is sorted by its own scores.
        grouped_predictions = []
        left_idx = 0
        for number in number_of_docs:
            right_idx = left_idx + number
            grouped_predictions.append(preds[left_idx:right_idx])
            left_idx = right_idx

        result = []
        for pred_group, doc_group in zip(grouped_predictions, documents):
            sorted_scores_and_documents = sorted(
                zip(pred_group, doc_group),  # type: ignore
                key=lambda similarity_document_tuple:
                # assume the last element in logits represents the `has_answer` label
                similarity_document_tuple[0][-1] if logits_dim >= 2 else similarity_document_tuple[0],
                reverse=True,
            )

            # rank documents according to scores
            sorted_documents = [doc for _, doc in sorted_scores_and_documents if isinstance(doc, Document)][:top_k]
            result.append(sorted_documents)

        return result

def _preprocess_batch_queries_and_docs(
self, queries: List[str], documents: Union[List[Document], List[List[Document]]]
) -> Tuple[List[int], List[str], List[Document], bool]:
number_of_docs = []
all_queries = []
all_docs: List[Document] = []
single_list_of_docs = False

# Docs case 1: single list of Documents -> rerank single list of Documents based on single query
if len(documents) > 0 and isinstance(documents[0], Document):
if len(queries) != 1:
raise HaystackError("Number of queries must be 1 if a single list of Documents is provided.")
query = queries[0]
number_of_docs = [len(documents)]
all_queries = [query] * len(documents)
all_docs = documents # type: ignore
single_list_of_docs = True

# Docs case 2: list of lists of Documents -> rerank each list of Documents based on corresponding query
# If queries contains a single query, apply it to each list of Documents
if len(documents) > 0 and isinstance(documents[0], list):
if len(queries) == 1:
queries = queries * len(documents)
if len(queries) != len(documents):
raise HaystackError("Number of queries must be equal to number of provided Document lists.")
for query, cur_docs in zip(queries, documents):
if not isinstance(cur_docs, list):
raise HaystackError(f"cur_docs was of type {type(cur_docs)}, but expected a list of Documents.")
number_of_docs.append(len(cur_docs))
all_queries.extend([query] * len(cur_docs))
all_docs.extend(cur_docs)

return number_of_docs, all_queries, all_docs, single_list_of_docs

@staticmethod
def _get_batches(
all_queries: List[str], all_docs: List[Document], batch_size: Optional[int]
) -> Iterator[Tuple[List[str], List[Document]]]:
if batch_size is None:
yield all_queries, all_docs
return
else:
for index in range(0, len(all_queries), batch_size):
yield all_queries[index : index + batch_size], all_docs[index : index + batch_size]
You are viewing a condensed version of this merge commit. You can view the full changes here.