Rename question parameter to query #614

Merged · 5 commits · Nov 30, 2020
Changes from all commits
6 changes: 3 additions & 3 deletions haystack/document_store/elasticsearch.py
@@ -429,10 +429,10 @@ def query(
             body["query"]["bool"]["filter"] = filter_clause
 
         # Retrieval via custom query
-        elif custom_query:  # substitute placeholder for question and filters for the custom_query template string
+        elif custom_query:  # substitute placeholder for query and filters for the custom_query template string
             template = Template(custom_query)
-            # replace all "${question}" placeholder(s) with query
-            substitutions = {"question": query}
+            # replace all "${query}" placeholder(s) with query
+            substitutions = {"query": query}
             # For each filter we got passed, we'll try to find & replace the corresponding placeholder in the template
             # Example: filters={"years":[2018]} => replaces {$years} in custom_query with '[2018]'
             if filters:
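Note: the substitution above goes through Python's `string.Template`. A minimal standalone sketch of the renamed behaviour (illustrative template, field names, and values; not Haystack's actual code):

```python
import json
from string import Template

# Hypothetical custom_query template using the new ${query} placeholder
# and a ${years} filter placeholder (the "year" field is illustrative)
custom_query = """{
    "query": {
        "bool": {
            "must": [{"match": {"text": "${query}"}}],
            "filter": [{"terms": {"year": ${years}}}]
        }
    }
}"""

template = Template(custom_query)
substitutions = {"query": "Who is the father of Arya Stark?"}
# filter values are JSON-encoded before substitution: [2018] -> '[2018]'
substitutions["years"] = json.dumps([2018])

body = template.substitute(substitutions)
print(json.loads(body))  # parses cleanly: both placeholders were replaced
```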
23 changes: 19 additions & 4 deletions haystack/finder.py
@@ -28,8 +28,14 @@ def __init__(self, reader: Optional[BaseReader], retriever: Optional[BaseRetriev
         :param reader: Reader instance
         :param retriever: Retriever instance
         """
-        logger.warning("The 'Finder' class will be deprecated in the next Haystack release in favour of the new"
-                       "`Pipeline` class.")
+        logger.warning(
+            """DEPRECATION WARNINGS:
+            1. The 'Finder' class will be deprecated in the next Haystack release in
+            favour of a new `Pipeline` class that supports building custom search pipelines using Haystack components
+            including Retriever, Readers, and Generators.
+            For more details, please refer to the issue: https://github.com/deepset-ai/haystack/issues/544
+            2. The `question` parameter in search requests & results is renamed to `query`."""
+        )
         self.retriever = retriever
         self.reader = reader
         if self.reader is None and self.retriever is None:
@@ -48,6 +54,14 @@ def get_answers(self, question: str, top_k_reader: int = 1, top_k_retriever: int
         :return:
         """
 
+        logger.warning(
+            """DEPRECATION WARNINGS:
+            1. The 'Finder' class will be deprecated in the next Haystack release in
+            favour of a new `Pipeline` class that supports building custom search pipelines using Haystack components
+            including Retriever, Readers, and Generators.
+            For more details, please refer to the issue: https://github.com/deepset-ai/haystack/issues/544
+            2. The `question` parameter in search requests & results is renamed to `query`."""
+        )
         if self.retriever is None or self.reader is None:
             raise AttributeError("Finder.get_answers requires self.retriever AND self.reader")
 
@@ -65,9 +79,10 @@ def get_answers(self, question: str, top_k_reader: int = 1, top_k_retriever: int
         len_chars = sum([len(d.text) for d in documents])
         logger.info(f"Reader is looking for detailed answer in {len_chars} chars ...")
 
-        results = self.reader.predict(question=question,
+        results = self.reader.predict(query=question,
                                       documents=documents,
                                       top_k=top_k_reader)  # type: Dict[str, Any]
+        results["question"] = results["query"]
 
         # Add corresponding document_name and more meta data, if an answer contains the document_id
         for ans in results["answers"]:
@@ -364,7 +379,7 @@ def eval_batch(
         self.reader.return_no_answers = True
         reader_start_time = time.time()
         predictions = self.reader.predict_batch(questions_with_correct_doc,
-                                                top_k_per_question=top_k_reader, batch_size=batch_size)
+                                                top_k=top_k_reader, batch_size=batch_size)
         reader_total_time = time.time() - reader_start_time
 
         for pred in predictions:
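With the compatibility line above (`results["question"] = results["query"]`), existing Finder callers keep working while the rename rolls out. A hedged usage sketch (assumes a running Elasticsearch with indexed documents; the model name is just the usual example):

```python
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever
from haystack.reader.farm import FARMReader
from haystack.finder import Finder

document_store = ElasticsearchDocumentStore(host="localhost")
retriever = ElasticsearchRetriever(document_store=document_store)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

finder = Finder(reader=reader, retriever=retriever)  # logs the deprecation warning above
results = finder.get_answers(question="Who is the father of Arya Stark?", top_k_reader=3)

# during the transition both keys are present: "question" mirrors the new "query"
assert results["question"] == results["query"]
```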
10 changes: 5 additions & 5 deletions haystack/pipeline.py
@@ -127,8 +127,8 @@ def __init__(self, reader: BaseReader, retriever: BaseRetriever):
         self.pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
         self.pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])
 
-    def run(self, question, top_k_retriever=5, top_k_reader=5):
-        output = self.pipeline.run(question=question,
+    def run(self, query, top_k_retriever=5, top_k_reader=5):
+        output = self.pipeline.run(query=query,
                                    top_k_retriever=top_k_retriever,
                                    top_k_reader=top_k_reader)
         return output
@@ -150,8 +150,8 @@ def __init__(self, retriever: BaseRetriever):
         self.pipeline = Pipeline()
         self.pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
 
-    def run(self, question, top_k_retriever=5):
-        output = self.pipeline.run(question=question, top_k_retriever=top_k_retriever)
+    def run(self, query, top_k_retriever=5):
+        output = self.pipeline.run(query=query, top_k_retriever=top_k_retriever)
         document_dicts = [doc.to_dict() for doc in output["documents"]]
         output["documents"] = document_dicts
         return output
@@ -183,7 +183,7 @@ def run(self, **kwargs):
         for i, _ in inputs:
             documents.extend(i["documents"])
         output = {
-            "question": inputs[0][0]["question"],
+            "query": inputs[0][0]["query"],
            "documents": documents
         }
         return output, "output_1"
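The Pipeline-based replacements use `query` from the start. A sketch of calling the renamed `run` methods above (same assumed Elasticsearch setup as in the Finder example):

```python
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever
from haystack.reader.farm import FARMReader
from haystack.pipeline import ExtractiveQAPipeline

document_store = ElasticsearchDocumentStore(host="localhost")  # assumes a running Elasticsearch
retriever = ElasticsearchRetriever(document_store=document_store)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)
# note the renamed parameter: query=..., not question=...
result = pipe.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
print(result["query"])
for answer in result["answers"]:
    print(answer["answer"], "|", answer["context"])
```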
9 changes: 4 additions & 5 deletions haystack/reader/base.py
@@ -12,12 +12,11 @@ class BaseReader(ABC):
     outgoing_edges = 1
 
     @abstractmethod
-    def predict(self, question: str, documents: List[Document], top_k: Optional[int] = None):
+    def predict(self, query: str, documents: List[Document], top_k: Optional[int] = None):
         pass
 
     @abstractmethod
-    def predict_batch(self, question_doc_list: List[dict], top_k_per_question: Optional[int] = None,
-                      batch_size: Optional[int] = None):
+    def predict_batch(self, query_doc_list: List[dict], top_k: Optional[int] = None, batch_size: Optional[int] = None):
         pass
 
     @staticmethod
@@ -47,9 +46,9 @@ def _calc_no_answer(no_ans_gaps: Sequence[float], best_score_answer: float):
                              "meta": None,}
         return no_ans_prediction, max_no_ans_gap
 
-    def run(self, question: str, documents: List[Document], top_k: Optional[int] = None):
+    def run(self, query: str, documents: List[Document], top_k: Optional[int] = None):
         if documents:
-            results = self.predict(question=question, documents=documents, top_k=top_k)
+            results = self.predict(query=query, documents=documents, top_k=top_k)
         else:
             results = {"answers": []}
 
42 changes: 21 additions & 21 deletions haystack/reader/farm.py
@@ -243,16 +243,16 @@ def save(self, directory: Path):
         self.inferencer.model.save(directory)
         self.inferencer.processor.save(directory)
 
-    def predict_batch(self, question_doc_list: List[dict], top_k_per_question: int = None, batch_size: int = None):
+    def predict_batch(self, query_doc_list: List[dict], top_k: int = None, batch_size: int = None):
         """
-        Use loaded QA model to find answers for a list of questions in each question's supplied list of Document.
+        Use loaded QA model to find answers for a list of queries in each query's supplied list of Document.
 
         Returns list of dictionaries containing answers sorted by (desc.) probability
 
-        :param question_doc_list: List of dictionaries containing questions with their retrieved documents
-        :param top_k_per_question: The maximum number of answers to return for each question
+        :param query_doc_list: List of dictionaries containing queries with their retrieved documents
+        :param top_k: The maximum number of answers to return for each query
         :param batch_size: Number of samples the model receives in one batch for inference
-        :return: List of dictionaries containing question and answers
+        :return: List of dictionaries containing query and answers
         """
 
         # convert input to FARM format
@@ -261,20 +261,20 @@ def predict_batch(self, question_doc_list: List[dict], top_k_per_question: int =
         labels = []
 
         # build input objects for inference_from_objects
-        for question_with_docs in question_doc_list:
-            documents = question_with_docs["docs"]
-            question = question_with_docs["question"]
-            labels.append(question)
+        for query_with_docs in query_doc_list:
+            documents = query_with_docs["docs"]
+            query = query_with_docs["question"]
+            labels.append(query)
             number_of_docs.append(len(documents))
 
             for doc in documents:
                 cur = QAInput(doc_text=doc.text,
-                              questions=Question(text=question.question,
+                              questions=Question(text=query.question,
                                                  uid=doc.id))
                 inputs.append(cur)
 
         self.inferencer.batch_size = batch_size
-        # make predictions on all document-question pairs
+        # make predictions on all document-query pairs
         predictions = self.inferencer.inference_from_objects(
             objects=inputs, return_json=False, multiprocessing_chunksize=1
         )
@@ -290,27 +290,27 @@ def predict_batch(self, question_doc_list: List[dict], top_k_per_question: int =
 
         result = []
         for idx, group in enumerate(grouped_predictions):
-            answers, max_no_ans_gap = self._extract_answers_of_predictions(group, top_k_per_question)
-            question = group[0].question
+            answers, max_no_ans_gap = self._extract_answers_of_predictions(group, top_k)
+            query = group[0].question
             cur_label = labels[idx]
             result.append({
-                "question": question,
+                "query": query,
                 "no_ans_gap": max_no_ans_gap,
                 "answers": answers,
                 "label": cur_label
             })
 
         return result
 
-    def predict(self, question: str, documents: List[Document], top_k: Optional[int] = None):
+    def predict(self, query: str, documents: List[Document], top_k: Optional[int] = None):
         """
-        Use loaded QA model to find answers for a question in the supplied list of Document.
+        Use loaded QA model to find answers for a query in the supplied list of Document.
 
         Returns dictionaries containing answers sorted by (desc.) probability.
         Example:
         ```python
         |{
-        |    'question': 'Who is the father of Arya Stark?',
+        |    'query': 'Who is the father of Arya Stark?',
         |    'answers':[
         |                 {'answer': 'Eddard,',
         |                  'context': " She travels with her father, Eddard, to King's Landing when he is ",
@@ -324,17 +324,17 @@ def predict(self, question: str, documents: List[Document], top_k: Optional[int]
         |}
         ```
 
-        :param question: Question string
+        :param query: Query string
         :param documents: List of Document in which to search for the answer
         :param top_k: The maximum number of answers to return
-        :return: Dict containing question and answers
+        :return: Dict containing query and answers
         """
 
         # convert input to FARM format
         inputs = []
         for doc in documents:
             cur = QAInput(doc_text=doc.text,
-                          questions=Question(text=question,
+                          questions=Question(text=query,
                                              uid=doc.id))
             inputs.append(cur)
 
@@ -345,7 +345,7 @@ def predict(self, question: str, documents: List[Document], top_k: Optional[int]
         )
         # assemble answers from all the different documents & format them.
         answers, max_no_ans_gap = self._extract_answers_of_predictions(predictions, top_k)
-        result = {"question": question,
+        result = {"query": query,
                   "no_ans_gap": max_no_ans_gap,
                   "answers": answers}
 
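Direct reader calls change the same way; `predict` now takes `query=` and returns a dict keyed on `"query"`. A minimal sketch (the `Document` import path is assumed and may differ between Haystack versions):

```python
from haystack import Document  # import path assumed; adjust for your Haystack version
from haystack.reader.farm import FARMReader

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
docs = [Document(text="Arya Stark travels with her father, Eddard, to King's Landing.")]

prediction = reader.predict(query="Who is the father of Arya Stark?", documents=docs, top_k=3)
print(prediction["query"])  # the renamed result key
for answer in prediction["answers"]:
    print(answer["answer"], "|", answer["context"])
```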
21 changes: 10 additions & 11 deletions haystack/reader/transformers.py
@@ -65,16 +65,16 @@ def __init__(
 
         # TODO context_window_size behaviour different from behavior in FARMReader
 
-    def predict(self, question: str, documents: List[Document], top_k: Optional[int] = None):
+    def predict(self, query: str, documents: List[Document], top_k: Optional[int] = None):
         """
-        Use loaded QA model to find answers for a question in the supplied list of Document.
+        Use loaded QA model to find answers for a query in the supplied list of Document.
 
         Returns dictionaries containing answers sorted by (desc.) probability.
         Example:
 
         ```python
         |{
-        |    'question': 'Who is the father of Arya Stark?',
+        |    'query': 'Who is the father of Arya Stark?',
         |    'answers':[
         |                 {'answer': 'Eddard,',
         |                  'context': " She travels with her father, Eddard, to King's Landing when he is ",
@@ -88,19 +88,19 @@ def predict(self, question: str, documents: List[Document], top_k: Optional[int]
         |}
         ```
 
-        :param question: Question string
+        :param query: Query string
         :param documents: List of Document in which to search for the answer
         :param top_k: The maximum number of answers to return
-        :return: Dict containing question and answers
+        :return: Dict containing query and answers
 
         """
         # get top-answers for each candidate passage
         answers = []
         no_ans_gaps = []
         best_overall_score = 0
         for doc in documents:
-            query = {"context": doc.text, "question": question}
-            predictions = self.model(query,
+            transformers_query = {"context": doc.text, "question": query}
+            predictions = self.model(transformers_query,
                                      topk=self.top_k_per_candidate,
                                      handle_impossible_answer=self.return_no_answers,
                                      max_seq_len=self.max_seq_len,
@@ -149,12 +149,11 @@ def predict(self, question: str, documents: List[Document], top_k: Optional[int]
         )
         answers = answers[:top_k]
 
-        results = {"question": question,
+        results = {"query": query,
                    "answers": answers}
 
         return results
 
-    def predict_batch(self, question_doc_list: List[dict], top_k_per_question: Optional[int] = None,
-                      batch_size: Optional[int] = None):
+    def predict_batch(self, query_doc_list: List[dict], top_k: Optional[int] = None, batch_size: Optional[int] = None):
 
         raise NotImplementedError("Batch prediction not yet available in TransformersReader.")
12 changes: 6 additions & 6 deletions haystack/retriever/base.py
@@ -51,7 +51,7 @@ def eval(
     ) -> dict:
         """
         Performs evaluation on the Retriever.
-        Retriever is evaluated based on whether it finds the correct document given the question string and at which
+        Retriever is evaluated based on whether it finds the correct document given the query string and at which
         position in the ranking of documents the correct document is.
 
         | Returns a dict containing the following metrics:
@@ -67,7 +67,7 @@ def eval(
 
         :param label_index: Index/Table in DocumentStore where labeled questions are stored
         :param doc_index: Index/Table in DocumentStore where documents that are used for evaluation are stored
-        :param top_k: How many documents to return per question
+        :param top_k: How many documents to return per query
         :param open_domain: If ``True``, retrieval will be evaluated by checking if the answer string to a question is
                             contained in the retrieved docs (common approach in open-domain QA).
                             If ``False``, retrieval uses a stricter evaluation that checks if the retrieved document ids
@@ -169,17 +169,17 @@ def eval(
 
     def run(
         self,
-        question: str,
+        query: str,
         filters: Optional[dict] = None,
         top_k_retriever: Optional[int] = None,
         top_k_reader: Optional[int] = None,
     ):
         if top_k_retriever:
-            documents = self.retrieve(query=question, filters=filters, top_k=top_k_retriever)
+            documents = self.retrieve(query=query, filters=filters, top_k=top_k_retriever)
         else:
-            documents = self.retrieve(query=question, filters=filters)
+            documents = self.retrieve(query=query, filters=filters)
         output = {
-            "question": question,
+            "query": query,
             "documents": documents,
             "top_k": top_k_reader
         }
4 changes: 2 additions & 2 deletions haystack/retriever/sparse.py
@@ -18,7 +18,7 @@ class ElasticsearchRetriever(BaseRetriever):
     def __init__(self, document_store: ElasticsearchDocumentStore, custom_query: str = None):
         """
         :param document_store: an instance of a DocumentStore to retrieve documents from.
-        :param custom_query: query string as per Elasticsearch DSL with a mandatory question placeholder($question).
+        :param custom_query: query string as per Elasticsearch DSL with a mandatory query placeholder($query).
 
         Optionally, ES `filter` clause can be added where the values of `terms` are placeholders
         that get substituted during runtime. The placeholder(${filter_name_1}, ${filter_name_2}..)
@@ -32,7 +32,7 @@ def __init__(self, document_store: ElasticsearchDocumentStore, custom_query: str
         | "query": {
         |    "bool": {
         |        "should": [{"multi_match": {
-        |            "query": "${question}",  // mandatory $question placeholder
+        |            "query": "${query}",  // mandatory query placeholder
         |            "type": "most_fields",
         |            "fields": ["text", "title"]}}],
         |        "filter": [  // optional custom filters
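A hedged sketch of passing a `custom_query` with the renamed placeholder, mirroring the docstring above (the `year` field and filter values are illustrative):

```python
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever

custom_query = """{
    "query": {
        "bool": {
            "should": [{"multi_match": {
                "query": "${query}",
                "type": "most_fields",
                "fields": ["text", "title"]}}],
            "filter": [{"terms": {"year": ${years}}}]
        }
    }
}"""

document_store = ElasticsearchDocumentStore(host="localhost")  # assumes a running Elasticsearch
retriever = ElasticsearchRetriever(document_store=document_store, custom_query=custom_query)

# filters={"years": [2018]} fills the ${years} placeholder at query time
docs = retriever.retrieve(query="Who is the father of Arya Stark?", filters={"years": [2018]}, top_k=10)
```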
6 changes: 3 additions & 3 deletions haystack/utils.py
@@ -40,19 +40,19 @@ def export_answers_to_csv(agg_results: list, output_file):
     if isinstance(agg_results, dict):
         agg_results = [agg_results]
 
-    assert "question" in agg_results[0], f"Wrong format used for {agg_results[0]}"
+    assert "query" in agg_results[0], f"Wrong format used for {agg_results[0]}"
     assert "answers" in agg_results[0], f"Wrong format used for {agg_results[0]}"
 
     data = {}  # type: Dict[str, List[Any]]
-    data["question"] = []
+    data["query"] = []
     data["prediction"] = []
     data["prediction_rank"] = []
     data["prediction_context"] = []
 
     for res in agg_results:
         for i in range(len(res["answers"])):
             temp = res["answers"][i]
-            data["question"].append(res["question"])
+            data["query"].append(res["query"])
             data["prediction"].append(temp["answer"])
             data["prediction_rank"].append(i + 1)
             data["prediction_context"].append(temp["context"])
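Downstream consumers need the same key change; a minimal sketch of the renamed input format for `export_answers_to_csv` (answer values are made up):

```python
from haystack.utils import export_answers_to_csv

# results now carry "query" instead of "question"; "answers" is unchanged
agg_results = [{
    "query": "Who is the father of Arya Stark?",
    "answers": [
        {"answer": "Eddard", "context": "She travels with her father, Eddard, to King's Landing."},
    ],
}]
export_answers_to_csv(agg_results, output_file="answers.csv")
```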
4 changes: 2 additions & 2 deletions test/conftest.py
@@ -231,14 +231,14 @@ def no_answer_reader(request, transformers_roberta, farm_roberta):
 @pytest.fixture()
 def prediction(reader, test_docs_xs):
     docs = [Document.from_dict(d) if isinstance(d, dict) else d for d in test_docs_xs]
-    prediction = reader.predict(question="Who lives in Berlin?", documents=docs, top_k=5)
+    prediction = reader.predict(query="Who lives in Berlin?", documents=docs, top_k=5)
     return prediction
 
 
 @pytest.fixture()
 def no_answer_prediction(no_answer_reader, test_docs_xs):
     docs = [Document.from_dict(d) if isinstance(d, dict) else d for d in test_docs_xs]
-    prediction = no_answer_reader.predict(question="What is the meaning of life?", documents=docs, top_k=5)
+    prediction = no_answer_reader.predict(query="What is the meaning of life?", documents=docs, top_k=5)
     return prediction