feat: add batch evaluation method for pipelines #2942

Merged

29 commits merged into master from batch-eval on Aug 25, 2022

Changes from 21 commits

Commits (29)
9d1f2dd
add basic pipeline.eval_batch for qa without filters
julian-risch Aug 1, 2022
4a398a3
Merge branch 'master' into batch-eval
julian-risch Aug 1, 2022
59b64c3
black formatting
julian-risch Aug 1, 2022
789cc2d
pydoc-markdown
julian-risch Aug 1, 2022
9a6c681
remove batch eval tests failing due to bugs
julian-risch Aug 3, 2022
671826a
remove comment
julian-risch Aug 8, 2022
ad40a55
explain commented out tests
julian-risch Aug 8, 2022
1bda44b
avoid code duplication
julian-risch Aug 8, 2022
7bacd93
black
julian-risch Aug 8, 2022
8cb7a51
mypy
julian-risch Aug 8, 2022
5cf8f0a
pydoc markdown
julian-risch Aug 8, 2022
b3b57f5
add batch option to execute_eval_run
julian-risch Aug 8, 2022
9aad547
pydoc markdown
julian-risch Aug 8, 2022
313a5e8
Merge branch 'master' into batch-eval
julian-risch Aug 8, 2022
2738994
Apply documentation suggestions from code review
julian-risch Aug 9, 2022
d184d20
Apply documentation suggestion from code review
julian-risch Aug 9, 2022
07682eb
add documentation based on review comments
julian-risch Aug 9, 2022
a2d4d6f
Merge branch 'batch-eval' of github.com:deepset-ai/haystack into batc…
julian-risch Aug 9, 2022
195f8a1
black
julian-risch Aug 9, 2022
17f750d
black
julian-risch Aug 9, 2022
16076ea
schema updates
julian-risch Aug 9, 2022
7339b45
remove duplicate tests
julian-risch Aug 9, 2022
0fa1bcb
add separate method for column reordering
julian-risch Aug 9, 2022
a1ac6b4
merge _build_eval_dataframe methods
julian-risch Aug 17, 2022
afd03a5
pylint ignore in function
julian-risch Aug 17, 2022
4b0b242
change type annotation of queries to list only
julian-risch Aug 24, 2022
ab4dccb
one-liner addressing review comment on params dict
julian-risch Aug 24, 2022
af434ee
black
julian-risch Aug 25, 2022
16cf698
markdown files updated
julian-risch Aug 25, 2022
128 changes: 115 additions & 13 deletions docs/_src/api/api/pipelines.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion docs/_src/api/api/retriever.md
@@ -466,7 +466,7 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used.
#### TfidfRetriever.retrieve\_batch

```python
def retrieve_batch(queries: List[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]]
def retrieve_batch(queries: Union[str, List[str]], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]]
```

Scan through documents in DocumentStore and return a small number of documents
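As a usage sketch of what the widened `queries` annotation means for callers, the snippet below calls `retrieve_batch` with both a list of queries and a single query string. The document store setup is illustrative and not part of this PR.

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import TfidfRetriever
from haystack.schema import Document

# Illustrative corpus, not taken from this PR.
document_store = InMemoryDocumentStore()
document_store.write_documents([Document(content="Berlin is the capital of Germany.")])
retriever = TfidfRetriever(document_store=document_store)

# A list of queries returns one list of documents per query ...
batch_results = retriever.retrieve_batch(queries=["What is the capital of Germany?"], top_k=3)

# ... and, with the Union[str, List[str]] annotation, a single query string is accepted as well.
single_result = retriever.retrieve_batch(queries="What is the capital of Germany?", top_k=3)
```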
35 changes: 33 additions & 2 deletions haystack/nodes/reader/base.py
@@ -105,10 +105,12 @@ def run(self, query: str, documents: List[Document], top_k: Optional[int] = None

def run_batch( # type: ignore
self,
queries: List[str],
queries: Union[str, List[str]],
julian-risch marked this conversation as resolved.
documents: Union[List[Document], List[List[Document]]],
top_k: Optional[int] = None,
batch_size: Optional[int] = None,
labels: Optional[List[MultiLabel]] = None,
add_isolated_node_eval: bool = False,
Reviewer (Member):

Do we need this parameter add_isolated_node_eval? As a user of this API, it wasn't immediately clear to me what it is for or why we need it.

julian-risch (Member, Author):
Yes, we need it. It's the same parameter as in the standard run(). If it is set to True, the evaluation is executed with labels as node inputs in addition to the integrated evaluation, where the node inputs are the outputs of the previous node in the pipeline.

):
self.query_count += len(queries) if isinstance(queries, list) else 1
if not documents:
@@ -129,8 +131,37 @@ def run_batch( # type: ignore
flattened_documents.extend(doc_list)
else:
flattened_documents.append(doc_list)
for answer in answer_iterator:

results["answers_isolated"] = [
BaseReader.add_doc_meta_data_to_answer(documents=flattened_documents, answer=answer)
for answer in answer_iterator
]
julian-risch marked this conversation as resolved.

# run evaluation with labels as node inputs
if add_isolated_node_eval and labels is not None:
relevant_documents = []
for labelx in labels:
relevant_documents.append([label.document for label in labelx.labels])
results_label_input = self.predict_batch(queries=queries, documents=relevant_documents, top_k=top_k)

# Add corresponding document_name and more meta data, if an answer contains the document_id
answer_iterator = itertools.chain.from_iterable(results_label_input["answers"])
if isinstance(documents[0], Document):
if isinstance(queries, list):
answer_iterator = itertools.chain.from_iterable(
itertools.chain.from_iterable(results_label_input["answers"])
)
flattened_documents = []
for doc_list in documents:
if isinstance(doc_list, list):
flattened_documents.extend(doc_list)
else:
flattened_documents.append(doc_list)

results["answers_isolated"] = [
BaseReader.add_doc_meta_data_to_answer(documents=flattened_documents, answer=answer)
for answer in answer_iterator
]
julian-risch marked this conversation as resolved.

return results, "output_1"

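Regarding the `add_isolated_node_eval` discussion above, here is a minimal sketch of how the two evaluation modes are typically consumed afterwards. The `pipeline` and `eval_labels` objects are placeholders, and the `eval_mode` argument of `calculate_metrics` as well as the node name `"Reader"` are assumptions based on Haystack's evaluation API rather than part of this diff.

```python
# Placeholder pipeline and labels; only the call pattern matters here.
eval_result = pipeline.eval_batch(labels=eval_labels, add_isolated_node_eval=True)

# Integrated evaluation: the reader is scored on the documents the retriever actually returned.
integrated_metrics = eval_result.calculate_metrics(eval_mode="integrated")

# Isolated evaluation: the reader is scored on the labeled (gold) documents instead,
# which shows its upper bound independent of retriever mistakes.
isolated_metrics = eval_result.calculate_metrics(eval_mode="isolated")

print(integrated_metrics["Reader"]["f1"], isolated_metrics["Reader"]["f1"])
```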
2 changes: 1 addition & 1 deletion haystack/nodes/retriever/sparse.py
@@ -532,7 +532,7 @@ def retrieve(

def retrieve_batch(
self,
queries: List[str],
queries: Union[str, List[str]],
filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None,
top_k: Optional[int] = None,
index: str = None,
226 changes: 202 additions & 24 deletions haystack/pipelines/base.py

Large diffs are not rendered by default.

59 changes: 59 additions & 0 deletions haystack/pipelines/standard_pipelines.py
@@ -217,6 +217,65 @@ def eval(
)
return output

def eval_batch(
self,
labels: List[MultiLabel],
params: Optional[dict] = None,
sas_model_name_or_path: Optional[str] = None,
sas_batch_size: int = 32,
sas_use_gpu: bool = True,
Reviewer (Member):

I traced the sas_use_gpu parameter being passed to CrossEncoder via the semantic_answer_similarity method. Let's keep in mind that we should soon replace all use_gpu parameters with a devices parameter (as per #3062 and #2826); just noting it as a to-do item.

add_isolated_node_eval: bool = False,
custom_document_id_field: Optional[str] = None,
context_matching_min_length: int = 100,
context_matching_boost_split_overlaps: bool = True,
context_matching_threshold: float = 65.0,
) -> EvaluationResult:

"""
Evaluates the pipeline by running it once per query in debug mode
and putting together all the data needed for evaluation, for example, to calculate metrics.

To calculate SAS (Semantic Answer Similarity) metrics, specify `sas_model_name_or_path`.

You can control the scope within which an Answer or a Document is considered correct afterwards (see `document_scope` and `answer_scope` params in `EvaluationResult.calculate_metrics()`).
For some of these scopes, you need to add the following information during `eval()`:
- `custom_document_id_field` parameter to select a custom document ID from document's metadata for ID matching (only affects 'document_id' scopes).
- `context_matching_...` parameter to fine-tune the fuzzy matching mechanism that determines whether text contexts match each other (only affects 'context' scopes, default values should work most of the time).

:param labels: The labels to evaluate on.
:param params: Parameters for the `retriever` and `reader`. For instance,
params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}.
:param sas_model_name_or_path: Sentence transformers semantic textual similarity model you want to use for the SAS value calculation.
It should be a path or a string pointing to downloadable models.
:param sas_batch_size: Number of prediction label pairs to encode at once by cross encoder or sentence transformer while calculating SAS.
:param sas_use_gpu: Whether to use a GPU or the CPU for calculating semantic answer similarity.
Falls back to CPU if no GPU is available.
:param add_isolated_node_eval: Whether to additionally evaluate the reader based on labels as input, instead of the output of the previous node in the pipeline.
:param custom_document_id_field: Custom field name within `Document`'s `meta` which identifies the document and is used as a criterion for matching documents to labels during evaluation.
This is especially useful if you want to match documents on other criteria (for example, file names) than the default document IDs, as these could be heavily influenced by preprocessing.
If not set, the default `Document`'s `id` is used as the criterion for matching documents to labels.
:param context_matching_min_length: The minimum string length context and candidate need to have to be scored.
Returns 0.0 otherwise.
:param context_matching_boost_split_overlaps: Whether to boost split overlaps (for example, [AB] <-> [BC]) that result from different preprocessing parameters.
If we detect that the score is near a half match and the matching part of the candidate is at its boundaries,
we cut the context on the same side, recalculate the score, and take the mean of both.
Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scoring ~75 in total.
:param context_matching_threshold: Score threshold that candidates must surpass to be included in the result list. Range: [0,100]
"""
output = self.pipeline.eval_batch(
labels=labels,
params=params,
sas_model_name_or_path=sas_model_name_or_path,
sas_batch_size=sas_batch_size,
sas_use_gpu=sas_use_gpu,
add_isolated_node_eval=add_isolated_node_eval,
custom_document_id_field=custom_document_id_field,
context_matching_boost_split_overlaps=context_matching_boost_split_overlaps,
context_matching_min_length=context_matching_min_length,
context_matching_threshold=context_matching_threshold,
)
return output

def print_eval_report(
self,
eval_result: EvaluationResult,
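For reference, a usage sketch of the new `eval_batch` method on a standard extractive QA pipeline. The models, document, and label below are illustrative assumptions, not taken from this PR.

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import FARMReader, TfidfRetriever
from haystack.pipelines import ExtractiveQAPipeline
from haystack.schema import Answer, Document, Label, MultiLabel

# Illustrative corpus and gold label.
document_store = InMemoryDocumentStore()
docs = [Document(content="Berlin is the capital of Germany.")]
document_store.write_documents(docs)

retriever = TfidfRetriever(document_store=document_store)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

labels = [
    MultiLabel(
        labels=[
            Label(
                query="What is the capital of Germany?",
                answer=Answer(answer="Berlin"),
                document=docs[0],
                is_correct_answer=True,
                is_correct_document=True,
                origin="gold-label",
            )
        ]
    )
]

# Evaluate the whole pipeline in batches instead of query by query.
eval_result = pipeline.eval_batch(
    labels=labels,
    params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
    sas_model_name_or_path="cross-encoder/stsb-roberta-large",  # optional, enables SAS
    add_isolated_node_eval=True,
)

# Scopes control what counts as a correct document/answer, see EvaluationResult.calculate_metrics().
metrics = eval_result.calculate_metrics(document_scope="document_id", answer_scope="any")
print(metrics["Retriever"]["recall_single_hit"], metrics["Reader"]["f1"])
```

The standard pipeline wrapper simply forwards to `Pipeline.eval_batch`, as the `standard_pipelines.py` diff above shows.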
3 changes: 1 addition & 2 deletions test/pipelines/test_eval.py
@@ -54,7 +54,7 @@ def test_generativeqa_calculate_metrics(
@pytest.mark.parametrize("retriever_with_docs", ["embedding"], indirect=True)
def test_summarizer_calculate_metrics(document_store_with_docs: ElasticsearchDocumentStore, retriever_with_docs):
document_store_with_docs.update_embeddings(retriever=retriever_with_docs)
summarizer = TransformersSummarizer(model_name_or_path="sshleifer/distill-pegasus-xsum-16-4", use_gpu=-1)
summarizer = TransformersSummarizer(model_name_or_path="sshleifer/distill-pegasus-xsum-16-4", use_gpu=False)
pipeline = SearchSummarizationPipeline(
retriever=retriever_with_docs, summarizer=summarizer, return_in_answer_format=True
)
@@ -1277,7 +1277,6 @@ def test_multi_retriever_pipeline_with_asymmetric_qa_eval(document_store_with_do

assert "ESRetriever" in eval_result
assert "DPRRetriever" in eval_result
assert "DPRRetriever" in eval_result
assert "QAReader" in eval_result
assert len(eval_result) == 3
