Replace dpr with embeddingretriever tut14 (#2336)
* add updated graph images for tutorial14

* ipynb: replaced DPR with EmbeddingRetriever, added TODO for further inspection of failing code

* Revert "ipynb: replaced DPR with EmbeddingRetriever, added TODO for further inspection of failing code"

This reverts commit f4b6f3e.

* ipynb: replaced DPR with EmbeddingRetriever, added TODO for further inspection of failing code

* ipynb: quick fix to avoid failure in print_answers

* py: quick fix to avoid failure in print_answers

* Update Documentation & Code Style

* ipynb: remove DPR, remove images

* Revert "ipynb: remove DPR, remove images"

This reverts commit dfa1e75.

* ipynb: remove DPR, remove images

* py: replace DPR with EmbeddingRetriever

* Update Documentation & Code Style

* correcting a typo

* Update Documentation & Code Style

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: TuanaCelik <[email protected]>
3 people authored Mar 28, 2022
1 parent b20a1f8 commit 04b56f0
Showing 5 changed files with 103 additions and 1,189 deletions.
Binary file added docs/_src/img/tutorial14_pipeline_classifier.png
56 changes: 34 additions & 22 deletions docs/_src/tutorials/tutorials/14.md
@@ -91,12 +91,19 @@ be used indexed into our `DocumentStore`


```python
-from haystack.utils import print_answers, fetch_archive_from_http, convert_files_to_dicts, clean_wiki_text, launch_es
+from haystack.utils import (
+    print_answers,
+    print_documents,
+    fetch_archive_from_http,
+    convert_files_to_dicts,
+    clean_wiki_text,
+    launch_es,
+)
from haystack.pipelines import Pipeline, RootNode
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.nodes import (
    ElasticsearchRetriever,
-    DensePassageRetriever,
+    EmbeddingRetriever,
    FARMReader,
    TransformersQueryClassifier,
    SklearnQueryClassifier,
Expand All @@ -120,8 +127,12 @@ document_store.write_documents(got_dicts)
es_retriever = ElasticsearchRetriever(document_store=document_store)

# Initialize dense retriever
-dpr_retriever = DensePassageRetriever(document_store)
-document_store.update_embeddings(dpr_retriever, update_existing_embeddings=False)
+embedding_retriever = EmbeddingRetriever(
+    document_store=document_store,
+    model_format="sentence_transformers",
+    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
+)
+document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False)

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
```
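As context for the swap above: `EmbeddingRetriever` with `model_format="sentence_transformers"` ranks documents by the similarity between a single query embedding and the precomputed document embeddings (the chosen `multi-qa-mpnet-base-dot-v1` model is trained for dot-product scoring). A framework-free toy sketch of that ranking step — the function and vector values below are invented for illustration, not Haystack code:

```python
# Toy illustration of dot-product dense retrieval, as EmbeddingRetriever
# performs with a dot-score sentence-transformers model.
# The embedding vectors here are made up, not real model outputs.
def dot(u, v):
    return sum(a * b for a, b in zip(u, v))

def retrieve(query_emb, doc_embs, top_k=2):
    # Rank document ids by dot-product similarity to the query embedding.
    scored = sorted(doc_embs.items(), key=lambda kv: dot(query_emb, kv[1]), reverse=True)
    return [doc_id for doc_id, _ in scored[:top_k]]

docs = {
    "ned": [0.9, 0.1, 0.0],   # e.g. "Ned Stark is the father of Arya"
    "jon": [0.2, 0.8, 0.1],   # e.g. "Jon Snow scenes were filmed in Iceland"
    "dany": [0.1, 0.2, 0.9],
}
print(retrieve([1.0, 0.0, 0.0], docs, top_k=1))  # -> ['ned']
```

The real retriever additionally batches embedding computation and delegates the similarity search to the document store's index.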
@@ -133,10 +144,10 @@ The keyword vs question/statement query classifier essentially distinguishes bet
* Getting better search results (e.g. by routing only proper questions to DPR / QA branches and not keyword queries)
* Lower GPU costs (e.g. if 50% of your traffic consists of keyword queries, you can serve them with Elasticsearch alone and save the GPU resources for the other 50% of traffic with semantic queries)

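A framework-free toy sketch of the routing this classifier performs — `classify` and its cue-word heuristic are invented stand-ins for the trained sklearn model, not the tutorial's code:

```python
# Toy stand-in for SklearnQueryClassifier: natural-language questions and
# statements go to output_1 (dense branch), bare keyword queries to
# output_2 (Elasticsearch branch). The real classifier is a trained
# gradient-boosting model, not this heuristic.
NATURAL_CUES = {"who", "what", "when", "where", "which", "why", "how",
                "is", "was", "are", "were", "the", "a", "of"}

def classify(query: str) -> str:
    tokens = query.lower().rstrip("?.").split()
    if query.strip().endswith("?") or any(t in NATURAL_CUES for t in tokens):
        return "output_1"
    return "output_2"

print(classify("Who is the father of Arya Stark?"))        # output_1
print(classify("arya stark father"))                       # output_2
print(classify("Arya Stark was the daughter of a Lord."))  # output_1
```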
-![image](https://user-images.githubusercontent.com/6007894/127831511-f55bad86-4b4f-4b54-9889-7bba37e475c6.png)
+![image]()


-Below, we define a `SklQueryClassifier` and show how to use it:
+Below, we define a `SklearnQueryClassifier` and show how to use it:

Read more about the trained model and dataset used [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt)

Expand All @@ -145,17 +156,19 @@ Read more about the trained model and dataset used [here](https://ext-models-hay
# Here we build the pipeline
sklearn_keyword_classifier = Pipeline()
sklearn_keyword_classifier.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"])
-sklearn_keyword_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"])
+sklearn_keyword_classifier.add_node(
+    component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_1"]
+)
sklearn_keyword_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"])
-sklearn_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
+sklearn_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"])
sklearn_keyword_classifier.draw("pipeline_classifier.png")
```


```python
# Run only the dense retriever on the full sentence query
res_1 = sklearn_keyword_classifier.run(query="Who is the father of Arya Stark?")
-print("DPR Results" + "\n" + "=" * 15)
+print("Embedding Retriever Results" + "\n" + "=" * 15)
print_answers(res_1, details="minimum")

# Run only the sparse retriever on a keyword based query
Expand All @@ -168,7 +181,7 @@ print_answers(res_2, details="minimum")
```python
# Run only the dense retriever on the full sentence query
res_3 = sklearn_keyword_classifier.run(query="which country was jon snow filmed ?")
-print("DPR Results" + "\n" + "=" * 15)
+print("Embedding Retriever Results" + "\n" + "=" * 15)
print_answers(res_3, details="minimum")

# Run only the sparse retriever on a keyword based query
Expand All @@ -181,7 +194,7 @@ print_answers(res_4, details="minimum")
```python
# Run only the dense retriever on the full sentence query
res_5 = sklearn_keyword_classifier.run(query="who are the younger brothers of arya stark ?")
-print("DPR Results" + "\n" + "=" * 15)
+print("Embedding Retriever Results" + "\n" + "=" * 15)
print_answers(res_5, details="minimum")

# Run only the sparse retriever on a keyword based query
Expand All @@ -206,18 +219,18 @@ transformer_keyword_classifier.add_node(
    component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"]
)
transformer_keyword_classifier.add_node(
-    component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"]
+    component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_1"]
)
transformer_keyword_classifier.add_node(component=es_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"])
-transformer_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "DPRRetriever"])
+transformer_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"])
transformer_keyword_classifier.draw("pipeline_classifier.png")
```


```python
# Run only the dense retriever on the full sentence query
res_1 = transformer_keyword_classifier.run(query="Who is the father of Arya Stark?")
-print("DPR Results" + "\n" + "=" * 15)
+print("Embedding Retriever Results" + "\n" + "=" * 15)
print_answers(res_1, details="minimum")

# Run only the sparse retriever on a keyword based query
Expand All @@ -230,7 +243,7 @@ print_answers(res_2, details="minimum")
```python
# Run only the dense retriever on the full sentence query
res_3 = transformer_keyword_classifier.run(query="which country was jon snow filmed ?")
-print("DPR Results" + "\n" + "=" * 15)
+print("Embedding Retriever Results" + "\n" + "=" * 15)
print_answers(res_3, details="minimum")

# Run only the sparse retriever on a keyword based query
Expand All @@ -243,7 +256,7 @@ print_answers(res_4, details="minimum")
```python
# Run only the dense retriever on the full sentence query
res_5 = transformer_keyword_classifier.run(query="who are the younger brothers of arya stark ?")
-print("DPR Results" + "\n" + "=" * 15)
+print("Embedding Retriever Results" + "\n" + "=" * 15)
print_answers(res_5, details="minimum")

# Run only the sparse retriever on a keyword based query
Expand All @@ -256,7 +269,7 @@ print_answers(res_6, details="minimum")

One possible use case for this classifier is routing queries after document retrieval: send only real questions on to the QA reader and, for declarative sentences, return the retrieved documents directly, so that users see extracted answers only when they explicitly ask a question.
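
A minimal sketch of that post-retrieval routing idea — all names below (`route_after_retrieval`, the lambdas) are invented for illustration, not Haystack APIs:

```python
def route_after_retrieval(query, documents, is_question, run_reader):
    # Only invoke the (expensive) QA reader for questions; for declarative
    # queries, hand the retrieved documents straight back to the user.
    if is_question(query):
        return {"type": "answers", "payload": run_reader(query, documents)}
    return {"type": "documents", "payload": documents}

# Stand-ins for the classifier and the reader, just to make this runnable:
is_question = lambda q: q.strip().endswith("?")
run_reader = lambda q, docs: [f"answer extracted from {d}" for d in docs]

print(route_after_retrieval("Who is Arya's father?", ["doc1"],
                            is_question, run_reader)["type"])   # answers
print(route_after_retrieval("Arya Stark was the daughter of a Lord.", ["doc1"],
                            is_question, run_reader)["type"])   # documents
```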

-![image](https://user-images.githubusercontent.com/6007894/127864452-f931ea7f-2e62-4f59-85dc-056d56eb9295.png)
+![image]()


Below, we define a `TransformersQueryClassifier` and show how to use it:
Expand All @@ -267,24 +280,23 @@ Read more about the trained model and dataset used [here](https://huggingface.co
```python
# Here we build the pipeline
transformer_question_classifier = Pipeline()
-transformer_question_classifier.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["Query"])
+transformer_question_classifier.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])
transformer_question_classifier.add_node(
    component=TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier"),
    name="QueryClassifier",
-    inputs=["DPRRetriever"],
+    inputs=["EmbeddingRetriever"],
)
transformer_question_classifier.add_node(component=reader, name="QAReader", inputs=["QueryClassifier.output_1"])
transformer_question_classifier.draw("question_classifier.png")

# Run only the QA reader on the question query
res_1 = transformer_question_classifier.run(query="Who is the father of Arya Stark?")
-print("DPR Results" + "\n" + "=" * 15)
+print("Embedding Retriever Results" + "\n" + "=" * 15)
print_answers(res_1, details="minimum")

# Show only DPR results
res_2 = transformer_question_classifier.run(query="Arya Stark was the daughter of a Lord.")
print("ES Results" + "\n" + "=" * 15)
-print_answers(res_2, details="minimum")
+print_documents(res_2)
```
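Why the declarative branch now calls `print_documents`: per the commit message, `print_answers` failed on results that never passed through the reader. A toy mimic of that failure mode — `toy_print_answers` and `toy_print_documents` are invented simplifications, not Haystack's real utilities:

```python
# Toy mimic (not Haystack code) of why print_answers broke on the
# declarative branch: the reader was skipped, so the result dict carries
# retrieved documents but no "answers" entry.
def toy_print_answers(result):
    return [a for a in result["answers"]]      # KeyError if reader was skipped

def toy_print_documents(result):
    return [d for d in result["documents"]]

res = {"documents": ["Arya Stark was the daughter of a Lord."]}  # reader skipped
try:
    toy_print_answers(res)
except KeyError:
    print("print_answers would fail here")
print(toy_print_documents(res))
```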

## Standalone Query Classifier
