Refactor Pipeline peripherals #2253

Merged 9 commits on Mar 3, 2022
80 changes: 51 additions & 29 deletions docs/_src/tutorials/tutorials/5.md
@@ -38,13 +38,6 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial
!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]
```


```python
from haystack.modeling.utils import initialize_device_settings

devices, n_gpu = initialize_device_settings(use_cuda=True)
```

## Start an Elasticsearch server
You can start Elasticsearch on your local machine using Docker. If Docker is not readily available in your environment (e.g., in Colab notebooks), you can manually download and run Elasticsearch from source.

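If Docker is available, Haystack also ships a small convenience helper that starts the official Elasticsearch image for you; a minimal sketch (not part of this diff, assuming Docker is installed and running):

```python
from haystack.utils import launch_es

# Pulls and starts the official Elasticsearch Docker image on localhost:9200,
# then waits briefly for the server to come up.
launch_es()
```
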
@@ -137,6 +130,7 @@ document_store.add_eval_data(
from haystack.nodes import ElasticsearchRetriever

retriever = ElasticsearchRetriever(document_store=document_store)

# Alternative: Evaluate dense retrievers (DensePassageRetriever or EmbeddingRetriever)
# DensePassageRetriever uses two separate transformer based encoders for query and document.
# In contrast, EmbeddingRetriever uses a single encoder for both.
@@ -145,6 +139,7 @@ retriever = ElasticsearchRetriever(document_store=document_store)
# the max_seq_len limitations of Transformers
# The SentenceTransformer model "all-mpnet-base-v2" generally works well with the EmbeddingRetriever on any kind of English text.
# For more information check out the documentation at: https://www.sbert.net/docs/pretrained_models.html

# from haystack.retriever import DensePassageRetriever, EmbeddingRetriever
# retriever = DensePassageRetriever(document_store=document_store,
# query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
@@ -171,6 +166,7 @@ pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

# The evaluation also works with any other pipeline.
# For example you could use a DocumentSearchPipeline as an alternative:

# from haystack.pipelines import DocumentSearchPipeline
# pipeline = DocumentSearchPipeline(retriever=retriever)
```
@@ -188,21 +184,34 @@ The generation of predictions is separated from the calculation of metrics. This
from haystack.schema import EvaluationResult, MultiLabel

# We can load evaluation labels from the document store
# We are also opting to filter out no_answer samples
eval_labels = document_store.get_all_labels_aggregated(drop_negative_labels=True, drop_no_answers=False)
eval_labels = [label for label in eval_labels if not label.no_answer] # filter out no_answer cases

## Alternative: Define queries and labels directly

# Alternative: Define queries and labels directly
# from haystack.schema import Answer, Document, Label, Span
# eval_labels = [
# MultiLabel(labels=[Label(query="who is written in the book of life",
# answer=Answer(answer="every person who is destined for Heaven or the World to Come",
# offsets_in_context=[Span(374, 434)]),
# document=Document(id='1b090aec7dbd1af6739c4c80f8995877-0',
# content_type="text",
# content='Book of Life - wikipedia Book of Life Jump to: navigation, search This article is about the book mentioned in Christian and Jewish religious teachings. For other uses, see The Book of Life. In Christianity and Judaism, the Book of Life (Hebrew: ספר החיים, transliterated Sefer HaChaim; Greek: βιβλίον τῆς ζωῆς Biblíon tēs Zōēs) is the book in which God records the names of every person who is destined for Heaven or the World to Come. According to the Talmud it is open on Rosh Hashanah, as is its analog for the wicked, the Book of the Dead. For this reason extra mention is made for the Book of Life during Amidah recitations during the Days of Awe, the ten days between Rosh Hashanah, the Jewish new year, and Yom Kippur, the day of atonement (the two High Holidays, particularly in the prayer Unetaneh Tokef). Contents (hide) 1 In the Hebrew Bible 2 Book of Jubilees 3 References in the New Testament 4 The eschatological or annual roll-call 5 Fundraising 6 See also 7 Notes 8 References In the Hebrew Bible(edit) In the Hebrew Bible the Book of Life - the book or muster-roll of God - records forever all people considered righteous before God'),
# is_correct_answer=True,
# is_correct_document=True,
# origin="gold-label")])
# ]
# MultiLabel(
# labels=[
# Label(
# query="who is written in the book of life",
# answer=Answer(
# answer="every person who is destined for Heaven or the World to Come",
# offsets_in_context=[Span(374, 434)]
# ),
# document=Document(
# id='1b090aec7dbd1af6739c4c80f8995877-0',
# content_type="text",
# content='Book of Life - wikipedia Book of Life Jump to: navigation, search This article is
# about the book mentioned in Christian and Jewish religious teachings...'
# ),
# is_correct_answer=True,
# is_correct_document=True,
# origin="gold-label"
# )
# ]
# )
# ]

# Similar to pipeline.run() we can execute pipeline.eval()
eval_result = pipeline.eval(labels=eval_labels, params={"Retriever": {"top_k": 5}})
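
# The EvaluationResult groups results per node; indexing by node name is assumed here
# to return a pandas DataFrame (illustrative sketch, mirroring how retriever_result and
# reader_result are used further below):
# retriever_result = eval_result["Retriever"]
# reader_result = eval_result["Reader"]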
@@ -226,13 +235,14 @@ reader_result.head()

```python
# We can filter for all documents retrieved for a given query
retriever_book_of_life = retriever_result[retriever_result["query"] == "who is written in the book of life"]
query = "who is written in the book of life"
retriever_book_of_life = retriever_result[retriever_result["query"] == query]
```


```python
# We can also filter for all answers predicted for a given query
reader_book_of_life = reader_result[reader_result["query"] == "who is written in the book of life"]
reader_book_of_life = reader_result[reader_result["query"] == query]
```


@@ -242,7 +252,9 @@ eval_result.save("../")
```

## Calculating Evaluation Metrics
Load an EvaluationResult to quickly calculate standard evaluation metrics for all predictions, such as F1-score of each individual prediction of the Reader node or recall of the retriever.
Load an EvaluationResult to quickly calculate standard evaluation metrics for all predictions,
such as F1-score of each individual prediction of the Reader node or recall of the retriever.
To learn more about the metrics, see [Evaluation Metrics](https://haystack.deepset.ai/guides/evaluation#metrics-retrieval).


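For orientation, reloading a saved result and computing metrics could look like the minimal sketch below (illustrative only; the load directory and the printed metric keys are assumptions, not taken from this diff):

```python
from haystack.schema import EvaluationResult

# Reload the result saved earlier with eval_result.save("../")
saved_eval_result = EvaluationResult.load("../")

# Compute standard metrics for every node in the pipeline
metrics = saved_eval_result.calculate_metrics()

print("Retriever Recall (multi hit):", metrics["Retriever"]["recall_multi_hit"])
print("Reader F1:", metrics["Reader"]["f1"])
```
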
```python
@@ -281,14 +293,15 @@ metrics = advanced_eval_result.calculate_metrics()
print(metrics["Reader"]["sas"])
```

## Isolated Evaluation Mode to Understand Upper Bounds of the Reader's Performance
The isolated node evaluation uses labels as input to the reader node instead of the output of the preceding retriever node.
Thereby, we can additionally calculate the upper bounds of the evaluation metrics of the reader.
## Isolated Evaluation Mode
The isolated node evaluation uses labels as input to the Reader node instead of the output of the preceding Retriever node.
This way, we can additionally calculate the upper bounds of the Reader's evaluation metrics. Note that even with isolated evaluation enabled, integrated evaluation still runs.



```python
eval_result_with_upper_bounds = pipeline.eval(
labels=eval_labels, params={"Retriever": {"top_k": 1}}, add_isolated_node_eval=True
labels=eval_labels, params={"Retriever": {"top_k": 5}, "Reader": {"top_k": 5}}, add_isolated_node_eval=True
)
```
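
To inspect the isolated (upper-bound) numbers next to the integrated ones, one option is to print a report; a short sketch, assuming `print_eval_report` is available on the pipeline and that `calculate_metrics` accepts an `eval_mode` argument:

```python
# Sketch: summarize integrated metrics together with the isolated upper bounds
pipeline.print_eval_report(eval_result_with_upper_bounds)

# Alternatively (assumed API), compute metrics for the isolated mode explicitly:
# isolated_metrics = eval_result_with_upper_bounds.calculate_metrics(eval_mode="isolated")
# print(isolated_metrics["Reader"]["f1"])
```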

@@ -304,6 +317,7 @@ Here we evaluate only the retriever, based on whether the gold_label document is

```python
## Evaluate Retriever on its own
# Note that no_answer samples are omitted when evaluation is performed with this method
retriever_eval_results = retriever.eval(top_k=5, label_index=label_index, doc_index=doc_index)
# Retriever Recall is the proportion of questions for which the correct document containing the answer is
# among the retrieved documents
@@ -312,6 +326,16 @@ print("Retriever Recall:", retriever_eval_results["recall"])
print("Retriever Mean Avg Precision:", retriever_eval_results["map"])
```

Just as a sanity check, we can compare the recall from `retriever.eval()` with the multi-hit recall from `pipeline.eval(add_isolated_node_eval=True)`.
These two recall metrics are comparable only because we filtered out no_answer samples when generating eval_labels.



```python
metrics = eval_result_with_upper_bounds.calculate_metrics()
print(metrics["Retriever"]["recall_multi_hit"])
```

## Evaluation of Individual Components: Reader
Here we evaluate only the reader in a closed-domain fashion, i.e., the reader is given one query
and its corresponding relevant document, and metrics are calculated on whether the right position in this text is selected by
@@ -320,9 +344,7 @@ the model as the answer span (i.e. SQuAD style)

```python
# Evaluate Reader on its own
reader_eval_results = reader.eval(
document_store=document_store, device=devices[0], label_index=label_index, doc_index=doc_index
)
reader_eval_results = reader.eval(document_store=document_store, label_index=label_index, doc_index=doc_index)
# Evaluation of Reader can also be done directly on a SQuAD-formatted file without passing the data to Elasticsearch
# reader_eval_results = reader.eval_on_file("../data/nq", "nq_dev_subset_v2.json", device=device)
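
# For illustration only: the dict returned by reader.eval() holds aggregate metrics.
# The key names below are assumptions, shown for orientation rather than taken from this diff:
# print("Reader Exact Match:", reader_eval_results["EM"])
# print("Reader F1-Score:", reader_eval_results["f1"])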

```