From 903ed3930be6ec9b8a471e452a8a8faccf62956e Mon Sep 17 00:00:00 2001 From: Tanay Soni Date: Tue, 24 Nov 2020 12:12:56 +0100 Subject: [PATCH 1/5] Rename question parameter to query --- haystack/document_store/elasticsearch.py | 6 +- haystack/eval.py | 2 +- haystack/finder.py | 4 +- haystack/pipeline.py | 10 +- haystack/reader/base.py | 8 +- haystack/reader/farm.py | 42 ++--- haystack/reader/transformers.py | 20 +-- haystack/retriever/base.py | 12 +- haystack/retriever/sparse.py | 4 +- haystack/utils.py | 6 +- test/conftest.py | 4 +- test/test_finder.py | 2 +- test/test_pipeline.py | 8 +- test/test_reader.py | 10 +- test/test_rest_api.py | 196 +++++++++++------------ 15 files changed, 167 insertions(+), 167 deletions(-) diff --git a/haystack/document_store/elasticsearch.py b/haystack/document_store/elasticsearch.py index 60dfcba295..19eae85a67 100644 --- a/haystack/document_store/elasticsearch.py +++ b/haystack/document_store/elasticsearch.py @@ -429,10 +429,10 @@ def query( body["query"]["bool"]["filter"] = filter_clause # Retrieval via custom query - elif custom_query: # substitute placeholder for question and filters for the custom_query template string + elif custom_query: # substitute placeholder for query and filters for the custom_query template string template = Template(custom_query) - # replace all "${question}" placeholder(s) with query - substitutions = {"question": query} + # replace all "${query}" placeholder(s) with query + substitutions = {"query": query} # For each filter we got passed, we'll try to find & replace the corresponding placeholder in the template # Example: filters={"years":[2018]} => replaces {$years} in custom_query with '[2018]' if filters: diff --git a/haystack/eval.py b/haystack/eval.py index 2ce89bb887..47baf9b05b 100644 --- a/haystack/eval.py +++ b/haystack/eval.py @@ -59,7 +59,7 @@ def calculate_average_precision_and_reciprocal_rank(questions_with_docs: List[di if found_relevant_doc: questions_with_correct_doc.append({ - 
"question": question["question"], + "query": question["question"], "docs": question["docs"] }) diff --git a/haystack/finder.py b/haystack/finder.py index 31a62ca9eb..62246d351e 100644 --- a/haystack/finder.py +++ b/haystack/finder.py @@ -65,7 +65,7 @@ def get_answers(self, question: str, top_k_reader: int = 1, top_k_retriever: int len_chars = sum([len(d.text) for d in documents]) logger.info(f"Reader is looking for detailed answer in {len_chars} chars ...") - results = self.reader.predict(question=question, + results = self.reader.predict(query=question, documents=documents, top_k=top_k_reader) # type: Dict[str, Any] @@ -364,7 +364,7 @@ def eval_batch( self.reader.return_no_answers = True reader_start_time = time.time() predictions = self.reader.predict_batch(questions_with_correct_doc, - top_k_per_question=top_k_reader, batch_size=batch_size) + top_k_per_query=top_k_reader, batch_size=batch_size) reader_total_time = time.time() - reader_start_time for pred in predictions: diff --git a/haystack/pipeline.py b/haystack/pipeline.py index 0e68a2b71c..b45b184bd4 100644 --- a/haystack/pipeline.py +++ b/haystack/pipeline.py @@ -127,8 +127,8 @@ def __init__(self, reader: BaseReader, retriever: BaseRetriever): self.pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"]) self.pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"]) - def run(self, question, top_k_retriever=5, top_k_reader=5): - output = self.pipeline.run(question=question, + def run(self, query, top_k_retriever=5, top_k_reader=5): + output = self.pipeline.run(query=query, top_k_retriever=top_k_retriever, top_k_reader=top_k_reader) return output @@ -150,8 +150,8 @@ def __init__(self, retriever: BaseRetriever): self.pipeline = Pipeline() self.pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"]) - def run(self, question, top_k_retriever=5): - output = self.pipeline.run(question=question, top_k_retriever=top_k_retriever) + def run(self, query, 
top_k_retriever=5): + output = self.pipeline.run(query=query, top_k_retriever=top_k_retriever) document_dicts = [doc.to_dict() for doc in output["documents"]] output["documents"] = document_dicts return output @@ -183,7 +183,7 @@ def run(self, **kwargs): for i, _ in inputs: documents.extend(i["documents"]) output = { - "question": inputs[0][0]["question"], + "query": inputs[0][0]["query"], "documents": documents } return output, "output_1" diff --git a/haystack/reader/base.py b/haystack/reader/base.py index 1b2d54382a..c2db96e098 100644 --- a/haystack/reader/base.py +++ b/haystack/reader/base.py @@ -12,11 +12,11 @@ class BaseReader(ABC): outgoing_edges = 1 @abstractmethod - def predict(self, question: str, documents: List[Document], top_k: Optional[int] = None): + def predict(self, query: str, documents: List[Document], top_k: Optional[int] = None): pass @abstractmethod - def predict_batch(self, question_doc_list: List[dict], top_k_per_question: Optional[int] = None, + def predict_batch(self, query_doc_list: List[dict], top_k_per_query: Optional[int] = None, batch_size: Optional[int] = None): pass @@ -47,9 +47,9 @@ def _calc_no_answer(no_ans_gaps: Sequence[float], best_score_answer: float): "meta": None,} return no_ans_prediction, max_no_ans_gap - def run(self, question: str, documents: List[Document], top_k: Optional[int] = None): + def run(self, query: str, documents: List[Document], top_k: Optional[int] = None): if documents: - results = self.predict(question=question, documents=documents, top_k=top_k) + results = self.predict(query=query, documents=documents, top_k=top_k) else: results = {"answers": []} diff --git a/haystack/reader/farm.py b/haystack/reader/farm.py index 5be14a920a..32d7814512 100644 --- a/haystack/reader/farm.py +++ b/haystack/reader/farm.py @@ -243,16 +243,16 @@ def save(self, directory: Path): self.inferencer.model.save(directory) self.inferencer.processor.save(directory) - def predict_batch(self, question_doc_list: List[dict], 
top_k_per_question: int = None, batch_size: int = None): + def predict_batch(self, query_doc_list: List[dict], top_k_per_query: int = None, batch_size: int = None): """ - Use loaded QA model to find answers for a list of questions in each question's supplied list of Document. + Use loaded QA model to find answers for a list of queries in each query's supplied list of Document. Returns list of dictionaries containing answers sorted by (desc.) probability - :param question_doc_list: List of dictionaries containing questions with their retrieved documents - :param top_k_per_question: The maximum number of answers to return for each question + :param query_doc_list: List of dictionaries containing queries with their retrieved documents + :param top_k_per_query: The maximum number of answers to return for each query :param batch_size: Number of samples the model receives in one batch for inference - :return: List of dictionaries containing question and answers + :return: List of dictionaries containing query and answers """ # convert input to FARM format @@ -261,20 +261,20 @@ def predict_batch(self, question_doc_list: List[dict], top_k_per_question: int = labels = [] # build input objects for inference_from_objects - for question_with_docs in question_doc_list: - documents = question_with_docs["docs"] - question = question_with_docs["question"] - labels.append(question) + for query_with_docs in query_doc_list: + documents = query_with_docs["docs"] + query = query_with_docs["query"] + labels.append(query) number_of_docs.append(len(documents)) for doc in documents: cur = QAInput(doc_text=doc.text, - questions=Question(text=question.question, + questions=Question(text=query.question, uid=doc.id)) inputs.append(cur) self.inferencer.batch_size = batch_size - # make predictions on all document-question pairs + # make predictions on all document-query pairs predictions = self.inferencer.inference_from_objects( objects=inputs, return_json=False, multiprocessing_chunksize=1 ) @@ 
-290,11 +290,11 @@ def predict_batch(self, question_doc_list: List[dict], top_k_per_question: int = result = [] for idx, group in enumerate(grouped_predictions): - answers, max_no_ans_gap = self._extract_answers_of_predictions(group, top_k_per_question) - question = group[0].question + answers, max_no_ans_gap = self._extract_answers_of_predictions(group, top_k_per_query) + query = group[0].question cur_label = labels[idx] result.append({ - "question": question, + "query": query, "no_ans_gap": max_no_ans_gap, "answers": answers, "label": cur_label @@ -302,15 +302,15 @@ def predict_batch(self, question_doc_list: List[dict], top_k_per_question: int = return result - def predict(self, question: str, documents: List[Document], top_k: Optional[int] = None): + def predict(self, query: str, documents: List[Document], top_k: Optional[int] = None): """ - Use loaded QA model to find answers for a question in the supplied list of Document. + Use loaded QA model to find answers for a query in the supplied list of Document. Returns dictionaries containing answers sorted by (desc.) probability. 
Example: ```python |{ - | 'question': 'Who is the father of Arya Stark?', + | 'query': 'Who is the father of Arya Stark?', | 'answers':[ | {'answer': 'Eddard,', | 'context': " She travels with her father, Eddard, to King's Landing when he is ", @@ -324,17 +324,17 @@ def predict(self, question: str, documents: List[Document], top_k: Optional[int] |} ``` - :param question: Question string + :param query: Query string :param documents: List of Document in which to search for the answer :param top_k: The maximum number of answers to return - :return: Dict containing question and answers + :return: Dict containing query and answers """ # convert input to FARM format inputs = [] for doc in documents: cur = QAInput(doc_text=doc.text, - questions=Question(text=question, + questions=Question(text=query, uid=doc.id)) inputs.append(cur) @@ -345,7 +345,7 @@ def predict(self, question: str, documents: List[Document], top_k: Optional[int] ) # assemble answers from all the different documents & format them. answers, max_no_ans_gap = self._extract_answers_of_predictions(predictions, top_k) - result = {"question": question, + result = {"query": query, "no_ans_gap": max_no_ans_gap, "answers": answers} diff --git a/haystack/reader/transformers.py b/haystack/reader/transformers.py index 59432bd8ba..2e6001009a 100644 --- a/haystack/reader/transformers.py +++ b/haystack/reader/transformers.py @@ -65,16 +65,16 @@ def __init__( # TODO context_window_size behaviour different from behavior in FARMReader - def predict(self, question: str, documents: List[Document], top_k: Optional[int] = None): + def predict(self, query: str, documents: List[Document], top_k: Optional[int] = None): """ - Use loaded QA model to find answers for a question in the supplied list of Document. + Use loaded QA model to find answers for a query in the supplied list of Document. Returns dictionaries containing answers sorted by (desc.) probability. 
Example: - + ```python |{ - | 'question': 'Who is the father of Arya Stark?', + | 'query': 'Who is the father of Arya Stark?', | 'answers':[ | {'answer': 'Eddard,', | 'context': " She travels with her father, Eddard, to King's Landing when he is ", @@ -88,10 +88,10 @@ def predict(self, question: str, documents: List[Document], top_k: Optional[int] |} ``` - :param question: Question string + :param query: Query string :param documents: List of Document in which to search for the answer :param top_k: The maximum number of answers to return - :return: Dict containing question and answers + :return: Dict containing query and answers """ # get top-answers for each candidate passage @@ -99,8 +99,8 @@ def predict(self, question: str, documents: List[Document], top_k: Optional[int] no_ans_gaps = [] best_overall_score = 0 for doc in documents: - query = {"context": doc.text, "question": question} - predictions = self.model(query, + transformers_query = {"context": doc.text, "question": query} + predictions = self.model(transformers_query, topk=self.top_k_per_candidate, handle_impossible_answer=self.return_no_answers, max_seq_len=self.max_seq_len, @@ -149,12 +149,12 @@ def predict(self, question: str, documents: List[Document], top_k: Optional[int] ) answers = answers[:top_k] - results = {"question": question, + results = {"query": query, "answers": answers} return results - def predict_batch(self, question_doc_list: List[dict], top_k_per_question: Optional[int] = None, + def predict_batch(self, query_doc_list: List[dict], top_k_per_query: Optional[int] = None, batch_size: Optional[int] = None): raise NotImplementedError("Batch prediction not yet available in TransformersReader.") diff --git a/haystack/retriever/base.py b/haystack/retriever/base.py index 3b87b2795b..1152ed8d4f 100644 --- a/haystack/retriever/base.py +++ b/haystack/retriever/base.py @@ -51,7 +51,7 @@ def eval( ) -> dict: """ Performs evaluation on the Retriever. 
- Retriever is evaluated based on whether it finds the correct document given the question string and at which + Retriever is evaluated based on whether it finds the correct document given the query string and at which position in the ranking of documents the correct document is. | Returns a dict containing the following metrics: @@ -67,7 +67,7 @@ def eval( :param label_index: Index/Table in DocumentStore where labeled questions are stored :param doc_index: Index/Table in DocumentStore where documents that are used for evaluation are stored - :param top_k: How many documents to return per question + :param top_k: How many documents to return per query :param open_domain: If ``True``, retrieval will be evaluated by checking if the answer string to a question is contained in the retrieved docs (common approach in open-domain QA). If ``False``, retrieval uses a stricter evaluation that checks if the retrieved document ids @@ -169,17 +169,17 @@ def eval( def run( self, - question: str, + query: str, filters: Optional[dict] = None, top_k_retriever: Optional[int] = None, top_k_reader: Optional[int] = None, ): if top_k_retriever: - documents = self.retrieve(query=question, filters=filters, top_k=top_k_retriever) + documents = self.retrieve(query=query, filters=filters, top_k=top_k_retriever) else: - documents = self.retrieve(query=question, filters=filters) + documents = self.retrieve(query=query, filters=filters) output = { - "question": question, + "query": query, "documents": documents, "top_k": top_k_reader } diff --git a/haystack/retriever/sparse.py b/haystack/retriever/sparse.py index cd7f1df89d..45e2d5a3e9 100644 --- a/haystack/retriever/sparse.py +++ b/haystack/retriever/sparse.py @@ -18,7 +18,7 @@ class ElasticsearchRetriever(BaseRetriever): def __init__(self, document_store: ElasticsearchDocumentStore, custom_query: str = None): """ :param document_store: an instance of a DocumentStore to retrieve documents from. 
- :param custom_query: query string as per Elasticsearch DSL with a mandatory question placeholder($question). + :param custom_query: query string as per Elasticsearch DSL with a mandatory query placeholder (${query}). Optionally, ES `filter` clause can be added where the values of `terms` are placeholders that get substituted during runtime. The placeholder(${filter_name_1}, ${filter_name_2}..) @@ -32,7 +32,7 @@ def __init__(self, document_store: ElasticsearchDocumentStore, custom_query: str | "query": { | "bool": { | "should": [{"multi_match": { - | "query": "${question}", // mandatory $question placeholder + | "query": "${query}", // mandatory ${query} placeholder | "type": "most_fields", | "fields": ["text", "title"]}}], | "filter": [ // optional custom filters diff --git a/haystack/utils.py b/haystack/utils.py index 6a7e483b94..9f161a231b 100644 --- a/haystack/utils.py +++ b/haystack/utils.py @@ -40,11 +40,11 @@ def export_answers_to_csv(agg_results: list, output_file): if isinstance(agg_results, dict): agg_results = [agg_results] - assert "question" in agg_results[0], f"Wrong format used for {agg_results[0]}" + assert "query" in agg_results[0], f"Wrong format used for {agg_results[0]}" assert "answers" in agg_results[0], f"Wrong format used for {agg_results[0]}" data = {} # type: Dict[str, List[Any]] - data["question"] = [] + data["query"] = [] data["prediction"] = [] data["prediction_rank"] = [] data["prediction_context"] = [] @@ -52,7 +52,7 @@ def export_answers_to_csv(agg_results: list, output_file): for res in agg_results: for i in range(len(res["answers"])): temp = res["answers"][i] - data["question"].append(res["question"]) + data["query"].append(res["query"]) data["prediction"].append(temp["answer"]) data["prediction_rank"].append(i + 1) data["prediction_context"].append(temp["context"]) diff --git a/test/conftest.py b/test/conftest.py index af6888eb04..e12639a172 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -231,14 +231,14 @@ def
no_answer_reader(request, transformers_roberta, farm_roberta): @pytest.fixture() def prediction(reader, test_docs_xs): docs = [Document.from_dict(d) if isinstance(d, dict) else d for d in test_docs_xs] - prediction = reader.predict(question="Who lives in Berlin?", documents=docs, top_k=5) + prediction = reader.predict(query="Who lives in Berlin?", documents=docs, top_k=5) return prediction @pytest.fixture() def no_answer_prediction(no_answer_reader, test_docs_xs): docs = [Document.from_dict(d) if isinstance(d, dict) else d for d in test_docs_xs] - prediction = no_answer_reader.predict(question="What is the meaning of life?", documents=docs, top_k=5) + prediction = no_answer_reader.predict(query="What is the meaning of life?", documents=docs, top_k=5) return prediction diff --git a/test/test_finder.py b/test/test_finder.py index 2c842f47d2..4dbeb18e78 100644 --- a/test/test_finder.py +++ b/test/test_finder.py @@ -10,7 +10,7 @@ def test_finder_get_answers(reader, retriever_with_docs, document_store_with_doc prediction = finder.get_answers(question="Who lives in Berlin?", top_k_retriever=10, top_k_reader=3) assert prediction is not None - assert prediction["question"] == "Who lives in Berlin?" + assert prediction["query"] == "Who lives in Berlin?" 
assert prediction["answers"][0]["answer"] == "Carla" assert prediction["answers"][0]["probability"] <= 1 assert prediction["answers"][0]["probability"] >= 0 diff --git a/test/test_pipeline.py b/test/test_pipeline.py index 9e0aa1c3d1..0136a21573 100644 --- a/test/test_pipeline.py +++ b/test/test_pipeline.py @@ -25,9 +25,9 @@ def test_graph_creation(reader, retriever_with_docs, document_store_with_docs): @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) def test_extractive_qa_answers(reader, retriever_with_docs, document_store_with_docs): pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs) - prediction = pipeline.run(question="Who lives in Berlin?", top_k_retriever=10, top_k_reader=3) + prediction = pipeline.run(query="Who lives in Berlin?", top_k_retriever=10, top_k_reader=3) assert prediction is not None - assert prediction["question"] == "Who lives in Berlin?" + assert prediction["query"] == "Who lives in Berlin?" assert prediction["answers"][0]["answer"] == "Carla" assert prediction["answers"][0]["probability"] <= 1 assert prediction["answers"][0]["probability"] >= 0 @@ -41,7 +41,7 @@ def test_extractive_qa_answers(reader, retriever_with_docs, document_store_with_ @pytest.mark.parametrize("retriever_with_docs", ["tfidf"], indirect=True) def test_extractive_qa_offsets(reader, retriever_with_docs, document_store_with_docs): pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs) - prediction = pipeline.run(question="Who lives in Berlin?", top_k_retriever=10, top_k_reader=5) + prediction = pipeline.run(query="Who lives in Berlin?", top_k_retriever=10, top_k_reader=5) assert prediction["answers"][0]["offset_start"] == 11 assert prediction["answers"][0]["offset_end"] == 16 @@ -56,7 +56,7 @@ def test_extractive_qa_offsets(reader, retriever_with_docs, document_store_with_ def test_extractive_qa_answers_single_result(reader, retriever_with_docs, document_store_with_docs): pipeline = 
ExtractiveQAPipeline(reader=reader, retriever=retriever_with_docs) query = "testing finder" - prediction = pipeline.run(question=query, top_k_retriever=1, top_k_reader=1) + prediction = pipeline.run(query=query, top_k_retriever=1, top_k_reader=1) assert prediction is not None assert len(prediction["answers"]) == 1 diff --git a/test/test_reader.py b/test/test_reader.py index 8198757734..14e68dd491 100644 --- a/test/test_reader.py +++ b/test/test_reader.py @@ -14,7 +14,7 @@ def test_reader_basic(reader): def test_output(prediction): assert prediction is not None - assert prediction["question"] == "Who lives in Berlin?" + assert prediction["query"] == "Who lives in Berlin?" assert prediction["answers"][0]["answer"] == "Carla" assert prediction["answers"][0]["offset_start"] == 11 assert prediction["answers"][0]["offset_end"] == 16 @@ -27,7 +27,7 @@ def test_output(prediction): @pytest.mark.slow def test_no_answer_output(no_answer_prediction): assert no_answer_prediction is not None - assert no_answer_prediction["question"] == "What is the meaning of life?" + assert no_answer_prediction["query"] == "What is the meaning of life?" 
assert math.isclose(no_answer_prediction["no_ans_gap"], -13.048564434051514, rel_tol=0.0001) assert no_answer_prediction["answers"][0]["answer"] is None assert no_answer_prediction["answers"][0]["offset_start"] == 0 @@ -48,7 +48,7 @@ def test_no_answer_output(no_answer_prediction): @pytest.mark.slow def test_prediction_attributes(prediction): # TODO FARM's prediction also has no_ans_gap - attributes_gold = ["question", "answers"] + attributes_gold = ["query", "answers"] for ag in attributes_gold: assert ag in prediction @@ -73,7 +73,7 @@ def test_context_window_size(reader, test_docs_xs, window_size): old_window_size = reader.inferencer.model.prediction_heads[0].context_window_size reader.inferencer.model.prediction_heads[0].context_window_size = window_size - prediction = reader.predict(question="Who lives in Berlin?", documents=docs, top_k=5) + prediction = reader.predict(query="Who lives in Berlin?", documents=docs, top_k=5) for answer in prediction["answers"]: # If the extracted answer is larger than the context window, the context window is expanded. # If the extracted answer is odd in length, the resulting context window is one less than context_window_size @@ -106,7 +106,7 @@ def test_top_k(reader, test_docs_xs, top_k): except: print("WARNING: Could not set `top_k_per_sample` in FARM. 
Please update FARM version.") - prediction = reader.predict(question="Who lives in Berlin?", documents=docs, top_k=top_k) + prediction = reader.predict(query="Who lives in Berlin?", documents=docs, top_k=top_k) assert len(prediction["answers"]) == top_k reader.top_k_per_candidate = old_top_k_per_candidate diff --git a/test/test_rest_api.py b/test/test_rest_api.py index 35f4433b08..cfb0856a20 100644 --- a/test/test_rest_api.py +++ b/test/test_rest_api.py @@ -1,98 +1,98 @@ -import pytest -from fastapi.testclient import TestClient - -from haystack import Finder -from haystack.retriever.sparse import ElasticsearchRetriever - -# TODO: Add integration tests for other APIs - - -def get_test_client_and_override_dependencies(reader, document_store_with_docs): - from rest_api.application import app - from rest_api.controller import search - - search.document_store = document_store_with_docs - search.retriever = ElasticsearchRetriever(document_store=document_store_with_docs) - search.FINDERS = {1: Finder(reader=reader, retriever=search.retriever)} - - return TestClient(app) - - -@pytest.mark.slow -@pytest.mark.elasticsearch -@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) -@pytest.mark.parametrize("reader", ["farm"], indirect=True) -def test_qa_api_filters(reader, document_store_with_docs): - client = get_test_client_and_override_dependencies(reader, document_store_with_docs) - - query_with_no_filter_value = {"questions": ["Where does Carla lives?"]} - response = client.post(url="/models/1/doc-qa", json=query_with_no_filter_value) - assert 200 == response.status_code - response_json = response.json() - assert response_json["results"][0]["answers"][0]["answer"] == "Berlin" - - query_with_single_filter_value = {"questions": ["Where does Carla lives?"], "filters": {"name": "filename1"}} - response = client.post(url="/models/1/doc-qa", json=query_with_single_filter_value) - assert 200 == response.status_code - response_json = response.json() 
- assert response_json["results"][0]["answers"][0]["answer"] == "Berlin" - - query_with_a_list_of_filter_values = { - "questions": ["Where does Carla lives?"], - "filters": {"name": ["filename1", "filename2"]}, - } - response = client.post(url="/models/1/doc-qa", json=query_with_a_list_of_filter_values) - assert 200 == response.status_code - response_json = response.json() - assert response_json["results"][0]["answers"][0]["answer"] == "Berlin" - - query_with_non_existing_filter_value = { - "questions": ["Where does Carla lives?"], - "filters": {"name": ["invalid-name"]}, - } - response = client.post(url="/models/1/doc-qa", json=query_with_non_existing_filter_value) - assert 200 == response.status_code - response_json = response.json() - assert len(response_json["results"][0]["answers"]) == 0 - - -@pytest.mark.slow -@pytest.mark.elasticsearch -@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) -@pytest.mark.parametrize("reader", ["farm"], indirect=True) -def test_query_api_filters(reader, document_store_with_docs): - client = get_test_client_and_override_dependencies(reader, document_store_with_docs) - - query = { - "size": 1, - "query": { - "bool": { - "should": [ - { - "multi_match": { - "query": "Where Paul lives?" 
- } - } - ], - "filter": [ - { - "terms": { - "name": "filename2" - } - } - ] - } - } - } - - response = client.post(url="/models/1/query?top_k_reader=1", json=query) - assert 200 == response.status_code - response_json = response.json() - assert 1 == response_json['hits']['total']['value'] - assert 1 == len(response_json['hits']['hits']) - assert response_json['hits']['hits'][0]["_score"] is not None - assert response_json['hits']['hits'][0]["_source"]["meta"] is not None - assert response_json['hits']['hits'][0]["_id"] is not None - assert "New York" == response_json['hits']['hits'][0]["_source"]["answer"] - assert "My name is Paul and I live in New York" == response_json['hits']['hits'][0]["_source"]["context"] - +# import pytest +# from fastapi.testclient import TestClient +# +# from haystack.pipeline import ExtractiveQAPipeline +# from haystack.retriever.sparse import ElasticsearchRetriever +# +# # TODO: Add integration tests for other APIs +# +# +# def get_test_client_and_override_dependencies(reader, document_store_with_docs): +# from rest_api.application import app +# from rest_api.controller import search +# +# search.document_store = document_store_with_docs +# search.retriever = ElasticsearchRetriever(document_store=document_store_with_docs) +# search.PIPELINES = {1: ExtractiveQAPipeline(reader=reader, retriever=search.retriever)} +# +# return TestClient(app) +# +# +# @pytest.mark.slow +# @pytest.mark.elasticsearch +# @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) +# @pytest.mark.parametrize("reader", ["farm"], indirect=True) +# def test_qa_api_filters(reader, document_store_with_docs): +# client = get_test_client_and_override_dependencies(reader, document_store_with_docs) +# +# query_with_no_filter_value = {"questions": ["Where does Carla lives?"]} +# response = client.post(url="/models/1/doc-qa", json=query_with_no_filter_value) +# assert 200 == response.status_code +# response_json = response.json() +# assert 
response_json["results"][0]["answers"][0]["answer"] == "Berlin" +# +# query_with_single_filter_value = {"questions": ["Where does Carla lives?"], "filters": {"name": "filename1"}} +# response = client.post(url="/models/1/doc-qa", json=query_with_single_filter_value) +# assert 200 == response.status_code +# response_json = response.json() +# assert response_json["results"][0]["answers"][0]["answer"] == "Berlin" +# +# query_with_a_list_of_filter_values = { +# "questions": ["Where does Carla lives?"], +# "filters": {"name": ["filename1", "filename2"]}, +# } +# response = client.post(url="/models/1/doc-qa", json=query_with_a_list_of_filter_values) +# assert 200 == response.status_code +# response_json = response.json() +# assert response_json["results"][0]["answers"][0]["answer"] == "Berlin" +# +# query_with_non_existing_filter_value = { +# "questions": ["Where does Carla lives?"], +# "filters": {"name": ["invalid-name"]}, +# } +# response = client.post(url="/models/1/doc-qa", json=query_with_non_existing_filter_value) +# assert 200 == response.status_code +# response_json = response.json() +# assert len(response_json["results"][0]["answers"]) == 0 +# +# +# @pytest.mark.slow +# @pytest.mark.elasticsearch +# @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) +# @pytest.mark.parametrize("reader", ["farm"], indirect=True) +# def test_query_api_filters(reader, document_store_with_docs): +# client = get_test_client_and_override_dependencies(reader, document_store_with_docs) +# +# query = { +# "size": 1, +# "query": { +# "bool": { +# "should": [ +# { +# "multi_match": { +# "query": "Where Paul lives?" 
+# } +# } +# ], +# "filter": [ +# { +# "terms": { +# "name": "filename2" +# } +# } +# ] +# } +# } +# } +# +# response = client.post(url="/models/1/query?top_k_reader=1", json=query) +# assert 200 == response.status_code +# response_json = response.json() +# assert 1 == response_json['hits']['total']['value'] +# assert 1 == len(response_json['hits']['hits']) +# assert response_json['hits']['hits'][0]["_score"] is not None +# assert response_json['hits']['hits'][0]["_source"]["meta"] is not None +# assert response_json['hits']['hits'][0]["_id"] is not None +# assert "New York" == response_json['hits']['hits'][0]["_source"]["answer"] +# assert "My name is Paul and I live in New York" == response_json['hits']['hits'][0]["_source"]["context"] +# From 27335134492b37b56c4c2980e3b9e401bf3971e0 Mon Sep 17 00:00:00 2001 From: Tanay Soni Date: Thu, 26 Nov 2020 12:04:37 +0100 Subject: [PATCH 2/5] Revert change for eval --- haystack/eval.py | 2 +- haystack/reader/farm.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/haystack/eval.py b/haystack/eval.py index 47baf9b05b..2ce89bb887 100644 --- a/haystack/eval.py +++ b/haystack/eval.py @@ -59,7 +59,7 @@ def calculate_average_precision_and_reciprocal_rank(questions_with_docs: List[di if found_relevant_doc: questions_with_correct_doc.append({ - "query": question["question"], + "question": question["question"], "docs": question["docs"] }) diff --git a/haystack/reader/farm.py b/haystack/reader/farm.py index 32d7814512..8fb82aa3c0 100644 --- a/haystack/reader/farm.py +++ b/haystack/reader/farm.py @@ -263,7 +263,7 @@ def predict_batch(self, query_doc_list: List[dict], top_k_per_query: int = None, # build input objects for inference_from_objects for query_with_docs in query_doc_list: documents = query_with_docs["docs"] - query = query_with_docs["query"] + query = query_with_docs["question"] labels.append(query) number_of_docs.append(len(documents)) From c872e565ad62d9b77ae24fdd46c7f6e9e83b9cd4 Mon Sep 17 00:00:00 
2001 From: Tanay Soni Date: Thu, 26 Nov 2020 12:44:41 +0100 Subject: [PATCH 3/5] Add deprecation warnings --- haystack/finder.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/haystack/finder.py b/haystack/finder.py index 62246d351e..fab203d7dd 100644 --- a/haystack/finder.py +++ b/haystack/finder.py @@ -28,8 +28,14 @@ def __init__(self, reader: Optional[BaseReader], retriever: Optional[BaseRetriev :param reader: Reader instance :param retriever: Retriever instance """ - logger.warning("The 'Finder' class will be deprecated in the next Haystack release in favour of the new" - "`Pipeline` class.") + logger.warning( + """DEPRECATION WARNINGS: + 1. The 'Finder' class will be deprecated in the next Haystack release in + favour of a new `Pipeline` class that supports building custom search pipelines using Haystack components + including Retriever, Readers, and Generators. + For more details, please refer to the issue: https://github.com/deepset-ai/haystack/issues/544 + 2. The `question` parameter in search requests & results is renamed to `query`.""" + ) self.retriever = retriever self.reader = reader if self.reader is None and self.retriever is None: @@ -48,6 +54,14 @@ def get_answers(self, question: str, top_k_reader: int = 1, top_k_retriever: int :return: """ + logger.warning( + """DEPRECATION WARNINGS: + 1. The 'Finder' class will be deprecated in the next Haystack release in + favour of a new `Pipeline` class that supports building custom search pipelines using Haystack components + including Retriever, Readers, and Generators. + For more details, please refer to the issue: https://github.com/deepset-ai/haystack/issues/544 + 2.
The `question` parameter in search requests & results is renamed to `query`.""" + ) if self.retriever is None or self.reader is None: raise AttributeError("Finder.get_answers requires self.retriever AND self.reader") @@ -68,6 +82,7 @@ def get_answers(self, question: str, top_k_reader: int = 1, top_k_retriever: int results = self.reader.predict(query=question, documents=documents, top_k=top_k_reader) # type: Dict[str, Any] + results["question"] = results["query"] # Add corresponding document_name and more meta data, if an answer contains the document_id for ans in results["answers"]: From ef443b51fc84c9b03b6bd1e4540a6155e4380bbd Mon Sep 17 00:00:00 2001 From: Tanay Soni Date: Thu, 26 Nov 2020 12:45:22 +0100 Subject: [PATCH 4/5] Enable REST API tests --- test/test_rest_api.py | 196 +++++++++++++++++++++--------------------- 1 file changed, 98 insertions(+), 98 deletions(-) diff --git a/test/test_rest_api.py b/test/test_rest_api.py index cfb0856a20..35f4433b08 100644 --- a/test/test_rest_api.py +++ b/test/test_rest_api.py @@ -1,98 +1,98 @@ -# import pytest -# from fastapi.testclient import TestClient -# -# from haystack.pipeline import ExtractiveQAPipeline -# from haystack.retriever.sparse import ElasticsearchRetriever -# -# # TODO: Add integration tests for other APIs -# -# -# def get_test_client_and_override_dependencies(reader, document_store_with_docs): -# from rest_api.application import app -# from rest_api.controller import search -# -# search.document_store = document_store_with_docs -# search.retriever = ElasticsearchRetriever(document_store=document_store_with_docs) -# search.PIPELINES = {1: ExtractiveQAPipeline(reader=reader, retriever=search.retriever)} -# -# return TestClient(app) -# -# -# @pytest.mark.slow -# @pytest.mark.elasticsearch -# @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) -# @pytest.mark.parametrize("reader", ["farm"], indirect=True) -# def test_qa_api_filters(reader, document_store_with_docs): -# 
client = get_test_client_and_override_dependencies(reader, document_store_with_docs) -# -# query_with_no_filter_value = {"questions": ["Where does Carla lives?"]} -# response = client.post(url="/models/1/doc-qa", json=query_with_no_filter_value) -# assert 200 == response.status_code -# response_json = response.json() -# assert response_json["results"][0]["answers"][0]["answer"] == "Berlin" -# -# query_with_single_filter_value = {"questions": ["Where does Carla lives?"], "filters": {"name": "filename1"}} -# response = client.post(url="/models/1/doc-qa", json=query_with_single_filter_value) -# assert 200 == response.status_code -# response_json = response.json() -# assert response_json["results"][0]["answers"][0]["answer"] == "Berlin" -# -# query_with_a_list_of_filter_values = { -# "questions": ["Where does Carla lives?"], -# "filters": {"name": ["filename1", "filename2"]}, -# } -# response = client.post(url="/models/1/doc-qa", json=query_with_a_list_of_filter_values) -# assert 200 == response.status_code -# response_json = response.json() -# assert response_json["results"][0]["answers"][0]["answer"] == "Berlin" -# -# query_with_non_existing_filter_value = { -# "questions": ["Where does Carla lives?"], -# "filters": {"name": ["invalid-name"]}, -# } -# response = client.post(url="/models/1/doc-qa", json=query_with_non_existing_filter_value) -# assert 200 == response.status_code -# response_json = response.json() -# assert len(response_json["results"][0]["answers"]) == 0 -# -# -# @pytest.mark.slow -# @pytest.mark.elasticsearch -# @pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) -# @pytest.mark.parametrize("reader", ["farm"], indirect=True) -# def test_query_api_filters(reader, document_store_with_docs): -# client = get_test_client_and_override_dependencies(reader, document_store_with_docs) -# -# query = { -# "size": 1, -# "query": { -# "bool": { -# "should": [ -# { -# "multi_match": { -# "query": "Where Paul lives?" 
-# } -# } -# ], -# "filter": [ -# { -# "terms": { -# "name": "filename2" -# } -# } -# ] -# } -# } -# } -# -# response = client.post(url="/models/1/query?top_k_reader=1", json=query) -# assert 200 == response.status_code -# response_json = response.json() -# assert 1 == response_json['hits']['total']['value'] -# assert 1 == len(response_json['hits']['hits']) -# assert response_json['hits']['hits'][0]["_score"] is not None -# assert response_json['hits']['hits'][0]["_source"]["meta"] is not None -# assert response_json['hits']['hits'][0]["_id"] is not None -# assert "New York" == response_json['hits']['hits'][0]["_source"]["answer"] -# assert "My name is Paul and I live in New York" == response_json['hits']['hits'][0]["_source"]["context"] -# +import pytest +from fastapi.testclient import TestClient + +from haystack import Finder +from haystack.retriever.sparse import ElasticsearchRetriever + +# TODO: Add integration tests for other APIs + + +def get_test_client_and_override_dependencies(reader, document_store_with_docs): + from rest_api.application import app + from rest_api.controller import search + + search.document_store = document_store_with_docs + search.retriever = ElasticsearchRetriever(document_store=document_store_with_docs) + search.FINDERS = {1: Finder(reader=reader, retriever=search.retriever)} + + return TestClient(app) + + +@pytest.mark.slow +@pytest.mark.elasticsearch +@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) +@pytest.mark.parametrize("reader", ["farm"], indirect=True) +def test_qa_api_filters(reader, document_store_with_docs): + client = get_test_client_and_override_dependencies(reader, document_store_with_docs) + + query_with_no_filter_value = {"questions": ["Where does Carla lives?"]} + response = client.post(url="/models/1/doc-qa", json=query_with_no_filter_value) + assert 200 == response.status_code + response_json = response.json() + assert response_json["results"][0]["answers"][0]["answer"] == 
"Berlin" + + query_with_single_filter_value = {"questions": ["Where does Carla lives?"], "filters": {"name": "filename1"}} + response = client.post(url="/models/1/doc-qa", json=query_with_single_filter_value) + assert 200 == response.status_code + response_json = response.json() + assert response_json["results"][0]["answers"][0]["answer"] == "Berlin" + + query_with_a_list_of_filter_values = { + "questions": ["Where does Carla lives?"], + "filters": {"name": ["filename1", "filename2"]}, + } + response = client.post(url="/models/1/doc-qa", json=query_with_a_list_of_filter_values) + assert 200 == response.status_code + response_json = response.json() + assert response_json["results"][0]["answers"][0]["answer"] == "Berlin" + + query_with_non_existing_filter_value = { + "questions": ["Where does Carla lives?"], + "filters": {"name": ["invalid-name"]}, + } + response = client.post(url="/models/1/doc-qa", json=query_with_non_existing_filter_value) + assert 200 == response.status_code + response_json = response.json() + assert len(response_json["results"][0]["answers"]) == 0 + + +@pytest.mark.slow +@pytest.mark.elasticsearch +@pytest.mark.parametrize("document_store_with_docs", ["elasticsearch"], indirect=True) +@pytest.mark.parametrize("reader", ["farm"], indirect=True) +def test_query_api_filters(reader, document_store_with_docs): + client = get_test_client_and_override_dependencies(reader, document_store_with_docs) + + query = { + "size": 1, + "query": { + "bool": { + "should": [ + { + "multi_match": { + "query": "Where Paul lives?" 
+ } + } + ], + "filter": [ + { + "terms": { + "name": "filename2" + } + } + ] + } + } + } + + response = client.post(url="/models/1/query?top_k_reader=1", json=query) + assert 200 == response.status_code + response_json = response.json() + assert 1 == response_json['hits']['total']['value'] + assert 1 == len(response_json['hits']['hits']) + assert response_json['hits']['hits'][0]["_score"] is not None + assert response_json['hits']['hits'][0]["_source"]["meta"] is not None + assert response_json['hits']['hits'][0]["_id"] is not None + assert "New York" == response_json['hits']['hits'][0]["_source"]["answer"] + assert "My name is Paul and I live in New York" == response_json['hits']['hits'][0]["_source"]["context"] + From f31d1891d375513aacb0a38f49a00b8327a4b616 Mon Sep 17 00:00:00 2001 From: Tanay Soni Date: Fri, 27 Nov 2020 14:00:30 +0100 Subject: [PATCH 5/5] Rename top_k_per_query to top_k --- haystack/finder.py | 2 +- haystack/reader/base.py | 3 +-- haystack/reader/farm.py | 6 +++--- haystack/reader/transformers.py | 3 +-- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/haystack/finder.py b/haystack/finder.py index fab203d7dd..84af6af7b5 100644 --- a/haystack/finder.py +++ b/haystack/finder.py @@ -379,7 +379,7 @@ def eval_batch( self.reader.return_no_answers = True reader_start_time = time.time() predictions = self.reader.predict_batch(questions_with_correct_doc, - top_k_per_query=top_k_reader, batch_size=batch_size) + top_k=top_k_reader, batch_size=batch_size) reader_total_time = time.time() - reader_start_time for pred in predictions: diff --git a/haystack/reader/base.py b/haystack/reader/base.py index c2db96e098..49fdb71321 100644 --- a/haystack/reader/base.py +++ b/haystack/reader/base.py @@ -16,8 +16,7 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] = pass @abstractmethod - def predict_batch(self, query_doc_list: List[dict], top_k_per_query: Optional[int] = None, - batch_size: Optional[int] = None): + def 
predict_batch(self, query_doc_list: List[dict], top_k: Optional[int] = None, batch_size: Optional[int] = None): pass @staticmethod diff --git a/haystack/reader/farm.py b/haystack/reader/farm.py index 8fb82aa3c0..2956fd9f34 100644 --- a/haystack/reader/farm.py +++ b/haystack/reader/farm.py @@ -243,14 +243,14 @@ def save(self, directory: Path): self.inferencer.model.save(directory) self.inferencer.processor.save(directory) - def predict_batch(self, query_doc_list: List[dict], top_k_per_query: int = None, batch_size: int = None): + def predict_batch(self, query_doc_list: List[dict], top_k: int = None, batch_size: int = None): """ Use loaded QA model to find answers for a list of queries in each query's supplied list of Document. Returns list of dictionaries containing answers sorted by (desc.) probability :param query_doc_list: List of dictionaries containing queries with their retrieved documents - :param top_k_per_query: The maximum number of answers to return for each query + :param top_k: The maximum number of answers to return for each query :param batch_size: Number of samples the model receives in one batch for inference :return: List of dictionaries containing query and answers """ @@ -290,7 +290,7 @@ def predict_batch(self, query_doc_list: List[dict], top_k_per_query: int = None, result = [] for idx, group in enumerate(grouped_predictions): - answers, max_no_ans_gap = self._extract_answers_of_predictions(group, top_k_per_query) + answers, max_no_ans_gap = self._extract_answers_of_predictions(group, top_k) query = group[0].question cur_label = labels[idx] result.append({ diff --git a/haystack/reader/transformers.py b/haystack/reader/transformers.py index 2e6001009a..50cc0ad067 100644 --- a/haystack/reader/transformers.py +++ b/haystack/reader/transformers.py @@ -154,7 +154,6 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] = return results - def predict_batch(self, query_doc_list: List[dict], top_k_per_query: Optional[int] = 
None, - batch_size: Optional[int] = None): + def predict_batch(self, query_doc_list: List[dict], top_k: Optional[int] = None, batch_size: Optional[int] = None): raise NotImplementedError("Batch prediction not yet available in TransformersReader.")