Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update document scores based on ranker node #2048

Merged
merged 13 commits into from
Jun 27, 2022
Prev Previous commit
Next Next commit
Merge branch 'master' into add_score_to_ranker
# Conflicts:
#	haystack/nodes/ranker/sentence_transformers.py
mathislucka committed Jun 23, 2022

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
commit 472c4d63c994ee90e504e0cf4087bdeb5f382ac4
136 changes: 136 additions & 0 deletions haystack/nodes/ranker/sentence_transformers.py
Original file line number Diff line number Diff line change
@@ -134,3 +134,139 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] =
sorted_documents.append(doc)

return sorted_documents

def predict_batch(
    self,
    queries: List[str],
    documents: Union[List[Document], List[List[Document]]],
    top_k: Optional[int] = None,
    batch_size: Optional[int] = None,
) -> Union[List[Document], List[List[Document]]]:
    """
    Use loaded ranker model to re-rank the supplied lists of Documents.

    Returns lists of Documents sorted by (desc.) similarity with the corresponding queries.

    - If you provide a list containing a single query...

        - ... and a single list of Documents, the single list of Documents will be re-ranked based on the
          supplied query.
        - ... and a list of lists of Documents, each list of Documents will be re-ranked individually based on the
          supplied query.

    - If you provide a list of multiple queries...

        - ... you need to provide a list of lists of Documents. Each list of Documents will be re-ranked based on
          its corresponding query.

    :param queries: Single query string or list of queries.
    :param documents: Single list of Documents or list of lists of Documents to be reranked.
    :param top_k: The maximum number of documents to return per Document list.
    :param batch_size: Number of Documents to process at a time.
    """
    if top_k is None:
        top_k = self.top_k

    if batch_size is None:
        batch_size = self.batch_size

    number_of_docs, all_queries, all_docs, single_list_of_docs = self._preprocess_batch_queries_and_docs(
        queries=queries, documents=documents
    )

    batches = self._get_batches(all_queries=all_queries, all_docs=all_docs, batch_size=batch_size)
    preds = []
    for cur_queries, cur_docs in batches:
        features = self.transformer_tokenizer(
            cur_queries, [doc.content for doc in cur_docs], padding=True, truncation=True, return_tensors="pt"
        ).to(self.devices[0])

        with torch.no_grad():
            similarity_scores = self.transformer_model(**features).logits
        preds.extend(similarity_scores)

    # BUG FIX: sort/group by the predictions accumulated over *all* batches
    # (`preds`), not by `similarity_scores`, which only holds the scores of
    # the last batch (and is undefined when there were no batches at all).
    # Each entry in `preds` is a 1-D logits tensor of length `logits_dim`.
    logits_dim = preds[0].shape[0] if preds else 0
    if single_list_of_docs:
        sorted_scores_and_documents = sorted(
            zip(preds, documents),
            key=lambda similarity_document_tuple:
            # assume the last element in logits represents the `has_answer` label
            similarity_document_tuple[0][-1] if logits_dim >= 2 else similarity_document_tuple[0],
            reverse=True,
        )

        # rank documents according to scores
        sorted_documents = [doc for _, doc in sorted_scores_and_documents if isinstance(doc, Document)]
        return sorted_documents[:top_k]
    else:
        # Group the flat prediction list back into per-query groups so each
        # list of Documents is sorted by its own scores.
        grouped_predictions = []
        left_idx = 0
        for number in number_of_docs:
            right_idx = left_idx + number
            grouped_predictions.append(preds[left_idx:right_idx])
            left_idx = right_idx

        result = []
        for pred_group, doc_group in zip(grouped_predictions, documents):
            sorted_scores_and_documents = sorted(
                zip(pred_group, doc_group),  # type: ignore
                key=lambda similarity_document_tuple:
                # assume the last element in logits represents the `has_answer` label
                similarity_document_tuple[0][-1] if logits_dim >= 2 else similarity_document_tuple[0],
                reverse=True,
            )

            # rank documents according to scores
            sorted_documents = [doc for _, doc in sorted_scores_and_documents if isinstance(doc, Document)][:top_k]
            result.append(sorted_documents)

        return result

def _preprocess_batch_queries_and_docs(
self, queries: List[str], documents: Union[List[Document], List[List[Document]]]
) -> Tuple[List[int], List[str], List[Document], bool]:
number_of_docs = []
all_queries = []
all_docs: List[Document] = []
single_list_of_docs = False

# Docs case 1: single list of Documents -> rerank single list of Documents based on single query
if len(documents) > 0 and isinstance(documents[0], Document):
if len(queries) != 1:
raise HaystackError("Number of queries must be 1 if a single list of Documents is provided.")
query = queries[0]
number_of_docs = [len(documents)]
all_queries = [query] * len(documents)
all_docs = documents # type: ignore
single_list_of_docs = True

# Docs case 2: list of lists of Documents -> rerank each list of Documents based on corresponding query
# If queries contains a single query, apply it to each list of Documents
if len(documents) > 0 and isinstance(documents[0], list):
if len(queries) == 1:
queries = queries * len(documents)
if len(queries) != len(documents):
raise HaystackError("Number of queries must be equal to number of provided Document lists.")
for query, cur_docs in zip(queries, documents):
if not isinstance(cur_docs, list):
raise HaystackError(f"cur_docs was of type {type(cur_docs)}, but expected a list of Documents.")
number_of_docs.append(len(cur_docs))
all_queries.extend([query] * len(cur_docs))
all_docs.extend(cur_docs)

return number_of_docs, all_queries, all_docs, single_list_of_docs

@staticmethod
def _get_batches(
all_queries: List[str], all_docs: List[Document], batch_size: Optional[int]
) -> Iterator[Tuple[List[str], List[Document]]]:
if batch_size is None:
yield all_queries, all_docs
return
else:
for index in range(0, len(all_queries), batch_size):
yield all_queries[index : index + batch_size], all_docs[index : index + batch_size]
You are viewing a condensed version of this merge commit. You can view the full changes here.