Rename question parameter to query #614

Merged · 5 commits · Nov 30, 2020
Changes from all commits
6 changes: 3 additions & 3 deletions haystack/document_store/elasticsearch.py
@@ -429,10 +429,10 @@ def query(
             body["query"]["bool"]["filter"] = filter_clause
 
         # Retrieval via custom query
-        elif custom_query:  # substitute placeholder for question and filters for the custom_query template string
+        elif custom_query:  # substitute placeholder for query and filters for the custom_query template string
             template = Template(custom_query)
-            # replace all "${question}" placeholder(s) with query
-            substitutions = {"question": query}
+            # replace all "${query}" placeholder(s) with query
+            substitutions = {"query": query}
             # For each filter we got passed, we'll try to find & replace the corresponding placeholder in the template
             # Example: filters={"years":[2018]} => replaces {$years} in custom_query with '[2018]'
             if filters:
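Note: the substitution above goes through Python's `string.Template`. A minimal standalone sketch of the renamed behaviour (illustrative template, field names, and values; not Haystack's actual code):

```python
import json
from string import Template

# Hypothetical custom_query template using the new ${query} placeholder
# and a ${years} filter placeholder (the "year" field is illustrative)
custom_query = """{
    "query": {
        "bool": {
            "must": [{"match": {"text": "${query}"}}],
            "filter": [{"terms": {"year": ${years}}}]
        }
    }
}"""

template = Template(custom_query)
substitutions = {"query": "Who is the father of Arya Stark?"}
# filter values are JSON-encoded before substitution: [2018] -> '[2018]'
substitutions["years"] = json.dumps([2018])

body = template.substitute(substitutions)
print(json.loads(body))  # parses cleanly: both placeholders were replaced
```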
23 changes: 19 additions & 4 deletions haystack/finder.py
@@ -28,8 +28,14 @@ def __init__(self, reader: Optional[BaseReader], retriever: Optional[BaseRetriev
         :param reader: Reader instance
         :param retriever: Retriever instance
         """
-        logger.warning("The 'Finder' class will be deprecated in the next Haystack release in favour of the new"
-                       "`Pipeline` class.")
+        logger.warning(
+            """DEPRECATION WARNINGS:
+            1. The 'Finder' class will be deprecated in the next Haystack release in
+            favour of a new `Pipeline` class that supports building custom search pipelines using Haystack components
+            including Retriever, Readers, and Generators.
+            For more details, please refer to the issue: https://github.com/deepset-ai/haystack/issues/544
+            2. The `question` parameter in search requests & results is renamed to `query`."""
+        )
         self.retriever = retriever
         self.reader = reader
         if self.reader is None and self.retriever is None:
@@ -48,6 +54,14 @@ def get_answers(self, question: str, top_k_reader: int = 1, top_k_retriever: int
         :return:
         """
 
+        logger.warning(
+            """DEPRECATION WARNINGS:
+            1. The 'Finder' class will be deprecated in the next Haystack release in
+            favour of a new `Pipeline` class that supports building custom search pipelines using Haystack components
+            including Retriever, Readers, and Generators.
+            For more details, please refer to the issue: https://github.com/deepset-ai/haystack/issues/544
+            2. The `question` parameter in search requests & results is renamed to `query`."""
+        )
         if self.retriever is None or self.reader is None:
             raise AttributeError("Finder.get_answers requires self.retriever AND self.reader")
 
@@ -65,9 +79,10 @@ def get_answers(self, question: str, top_k_reader: int = 1, top_k_retriever: int
         len_chars = sum([len(d.text) for d in documents])
         logger.info(f"Reader is looking for detailed answer in {len_chars} chars ...")
 
-        results = self.reader.predict(question=question,
+        results = self.reader.predict(query=question,
                                       documents=documents,
                                       top_k=top_k_reader)  # type: Dict[str, Any]
+        results["question"] = results["query"]
 
         # Add corresponding document_name and more meta data, if an answer contains the document_id
         for ans in results["answers"]:
@@ -364,7 +379,7 @@ def eval_batch(
         self.reader.return_no_answers = True
         reader_start_time = time.time()
         predictions = self.reader.predict_batch(questions_with_correct_doc,
-                                                top_k_per_question=top_k_reader, batch_size=batch_size)
+                                                top_k=top_k_reader, batch_size=batch_size)
         reader_total_time = time.time() - reader_start_time
 
         for pred in predictions:
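With the compatibility line above (`results["question"] = results["query"]`), existing Finder callers keep working while the rename rolls out. A hedged usage sketch (assumes a running Elasticsearch with indexed documents; the model name is just the usual example):

```python
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever
from haystack.reader.farm import FARMReader
from haystack.finder import Finder

document_store = ElasticsearchDocumentStore(host="localhost")
retriever = ElasticsearchRetriever(document_store=document_store)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

finder = Finder(reader=reader, retriever=retriever)  # logs the deprecation warning above
results = finder.get_answers(question="Who is the father of Arya Stark?", top_k_reader=3)

# during the transition both keys are present: "question" mirrors the new "query"
assert results["question"] == results["query"]
```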
10 changes: 5 additions & 5 deletions haystack/pipeline.py
@@ -127,8 +127,8 @@ def __init__(self, reader: BaseReader, retriever: BaseRetriever):
         self.pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
         self.pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])
 
-    def run(self, question, top_k_retriever=5, top_k_reader=5):
-        output = self.pipeline.run(question=question,
+    def run(self, query, top_k_retriever=5, top_k_reader=5):
+        output = self.pipeline.run(query=query,
                                    top_k_retriever=top_k_retriever,
                                    top_k_reader=top_k_reader)
         return output
@@ -150,8 +150,8 @@ def __init__(self, retriever: BaseRetriever):
         self.pipeline = Pipeline()
         self.pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
 
-    def run(self, question, top_k_retriever=5):
-        output = self.pipeline.run(question=question, top_k_retriever=top_k_retriever)
+    def run(self, query, top_k_retriever=5):
+        output = self.pipeline.run(query=query, top_k_retriever=top_k_retriever)
         document_dicts = [doc.to_dict() for doc in output["documents"]]
         output["documents"] = document_dicts
         return output
@@ -183,7 +183,7 @@ def run(self, **kwargs):
         for i, _ in inputs:
             documents.extend(i["documents"])
         output = {
-            "question": inputs[0][0]["question"],
+            "query": inputs[0][0]["query"],
            "documents": documents
         }
         return output, "output_1"
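The Pipeline-based replacements use `query` from the start. A sketch of calling the renamed `run` methods above (same assumed Elasticsearch setup as in the Finder example):

```python
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever
from haystack.reader.farm import FARMReader
from haystack.pipeline import ExtractiveQAPipeline

document_store = ElasticsearchDocumentStore(host="localhost")  # assumes a running Elasticsearch
retriever = ElasticsearchRetriever(document_store=document_store)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")

pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)
# note the renamed parameter: query=..., not question=...
result = pipe.run(query="Who is the father of Arya Stark?", top_k_retriever=10, top_k_reader=5)
print(result["query"])
for answer in result["answers"]:
    print(answer["answer"], "|", answer["context"])
```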
9 changes: 4 additions & 5 deletions haystack/reader/base.py
@@ -12,12 +12,11 @@ class BaseReader(ABC):
     outgoing_edges = 1
 
     @abstractmethod
-    def predict(self, question: str, documents: List[Document], top_k: Optional[int] = None):
+    def predict(self, query: str, documents: List[Document], top_k: Optional[int] = None):
         pass
 
     @abstractmethod
-    def predict_batch(self, question_doc_list: List[dict], top_k_per_question: Optional[int] = None,
-                      batch_size: Optional[int] = None):
+    def predict_batch(self, query_doc_list: List[dict], top_k: Optional[int] = None, batch_size: Optional[int] = None):
         pass
 
     @staticmethod
@@ -47,9 +46,9 @@ def _calc_no_answer(no_ans_gaps: Sequence[float], best_score_answer: float):
                              "meta": None,}
         return no_ans_prediction, max_no_ans_gap
 
-    def run(self, question: str, documents: List[Document], top_k: Optional[int] = None):
+    def run(self, query: str, documents: List[Document], top_k: Optional[int] = None):
         if documents:
-            results = self.predict(question=question, documents=documents, top_k=top_k)
+            results = self.predict(query=query, documents=documents, top_k=top_k)
         else:
             results = {"answers": []}
 
42 changes: 21 additions & 21 deletions haystack/reader/farm.py
@@ -243,16 +243,16 @@ def save(self, directory: Path):
         self.inferencer.model.save(directory)
         self.inferencer.processor.save(directory)
 
-    def predict_batch(self, question_doc_list: List[dict], top_k_per_question: int = None, batch_size: int = None):
+    def predict_batch(self, query_doc_list: List[dict], top_k: int = None, batch_size: int = None):
         """
-        Use loaded QA model to find answers for a list of questions in each question's supplied list of Document.
+        Use loaded QA model to find answers for a list of queries in each query's supplied list of Document.
 
         Returns list of dictionaries containing answers sorted by (desc.) probability
 
-        :param question_doc_list: List of dictionaries containing questions with their retrieved documents
-        :param top_k_per_question: The maximum number of answers to return for each question
+        :param query_doc_list: List of dictionaries containing queries with their retrieved documents
+        :param top_k: The maximum number of answers to return for each query
         :param batch_size: Number of samples the model receives in one batch for inference
-        :return: List of dictionaries containing question and answers
+        :return: List of dictionaries containing query and answers
         """
 
         # convert input to FARM format
@@ -261,20 +261,20 @@ def predict_batch(self, question_doc_list: List[dict], top_k_per_question: int =
         labels = []
 
         # build input objects for inference_from_objects
-        for question_with_docs in question_doc_list:
-            documents = question_with_docs["docs"]
-            question = question_with_docs["question"]
-            labels.append(question)
+        for query_with_docs in query_doc_list:
+            documents = query_with_docs["docs"]
+            query = query_with_docs["question"]
+            labels.append(query)
             number_of_docs.append(len(documents))
 
             for doc in documents:
                 cur = QAInput(doc_text=doc.text,
-                              questions=Question(text=question.question,
+                              questions=Question(text=query.question,
                                                  uid=doc.id))
                 inputs.append(cur)
 
         self.inferencer.batch_size = batch_size
-        # make predictions on all document-question pairs
+        # make predictions on all document-query pairs
         predictions = self.inferencer.inference_from_objects(
             objects=inputs, return_json=False, multiprocessing_chunksize=1
         )
@@ -290,27 +290,27 @@ def predict_batch(self, question_doc_list: List[dict], top_k_per_question: int =
 
         result = []
         for idx, group in enumerate(grouped_predictions):
-            answers, max_no_ans_gap = self._extract_answers_of_predictions(group, top_k_per_question)
-            question = group[0].question
+            answers, max_no_ans_gap = self._extract_answers_of_predictions(group, top_k)
+            query = group[0].question
             cur_label = labels[idx]
             result.append({
-                "question": question,
+                "query": query,
                 "no_ans_gap": max_no_ans_gap,
                 "answers": answers,
                 "label": cur_label
             })
 
         return result
 
-    def predict(self, question: str, documents: List[Document], top_k: Optional[int] = None):
+    def predict(self, query: str, documents: List[Document], top_k: Optional[int] = None):
         """
-        Use loaded QA model to find answers for a question in the supplied list of Document.
+        Use loaded QA model to find answers for a query in the supplied list of Document.
 
         Returns dictionaries containing answers sorted by (desc.) probability.
         Example:
         ```python
         |{
-        |    'question': 'Who is the father of Arya Stark?',
+        |    'query': 'Who is the father of Arya Stark?',
         |    'answers':[
         |                 {'answer': 'Eddard,',
         |                  'context': " She travels with her father, Eddard, to King's Landing when he is ",
@@ -324,17 +324,17 @@ def predict(self, question: str, documents: List[Document], top_k: Optional[int]
         |}
         ```
 
-        :param question: Question string
+        :param query: Query string
         :param documents: List of Document in which to search for the answer
         :param top_k: The maximum number of answers to return
-        :return: Dict containing question and answers
+        :return: Dict containing query and answers
         """
 
         # convert input to FARM format
         inputs = []
         for doc in documents:
             cur = QAInput(doc_text=doc.text,
-                          questions=Question(text=question,
+                          questions=Question(text=query,
                                              uid=doc.id))
             inputs.append(cur)
 
@@ -345,7 +345,7 @@ def predict(self, question: str, documents: List[Document], top_k: Optional[int]
         )
         # assemble answers from all the different documents & format them.
         answers, max_no_ans_gap = self._extract_answers_of_predictions(predictions, top_k)
-        result = {"question": question,
+        result = {"query": query,
                   "no_ans_gap": max_no_ans_gap,
                   "answers": answers}
 
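Direct reader calls change the same way; `predict` now takes `query=` and returns a dict keyed on `"query"`. A minimal sketch (the `Document` import path is assumed and may differ between Haystack versions):

```python
from haystack import Document  # import path assumed; adjust for your Haystack version
from haystack.reader.farm import FARMReader

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
docs = [Document(text="Arya Stark travels with her father, Eddard, to King's Landing.")]

prediction = reader.predict(query="Who is the father of Arya Stark?", documents=docs, top_k=3)
print(prediction["query"])  # the renamed result key
for answer in prediction["answers"]:
    print(answer["answer"], "|", answer["context"])
```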
21 changes: 10 additions & 11 deletions haystack/reader/transformers.py
@@ -65,16 +65,16 @@ def __init__(
 
         # TODO context_window_size behaviour different from behavior in FARMReader
 
-    def predict(self, question: str, documents: List[Document], top_k: Optional[int] = None):
+    def predict(self, query: str, documents: List[Document], top_k: Optional[int] = None):
         """
-        Use loaded QA model to find answers for a question in the supplied list of Document.
+        Use loaded QA model to find answers for a query in the supplied list of Document.
 
         Returns dictionaries containing answers sorted by (desc.) probability.
         Example:
 
         ```python
         |{
-        |    'question': 'Who is the father of Arya Stark?',
+        |    'query': 'Who is the father of Arya Stark?',
         |    'answers':[
         |                 {'answer': 'Eddard,',
         |                  'context': " She travels with her father, Eddard, to King's Landing when he is ",
@@ -88,19 +88,19 @@ def predict(self, question: str, documents: List[Document], top_k: Optional[int]
         |}
         ```
 
-        :param question: Question string
+        :param query: Query string
         :param documents: List of Document in which to search for the answer
         :param top_k: The maximum number of answers to return
-        :return: Dict containing question and answers
+        :return: Dict containing query and answers
 
         """
         # get top-answers for each candidate passage
         answers = []
         no_ans_gaps = []
         best_overall_score = 0
         for doc in documents:
-            query = {"context": doc.text, "question": question}
-            predictions = self.model(query,
+            transformers_query = {"context": doc.text, "question": query}
+            predictions = self.model(transformers_query,
                                      topk=self.top_k_per_candidate,
                                      handle_impossible_answer=self.return_no_answers,
                                      max_seq_len=self.max_seq_len,
@@ -149,12 +149,11 @@ def predict(self, question: str, documents: List[Document], top_k: Optional[int]
         )
         answers = answers[:top_k]
 
-        results = {"question": question,
+        results = {"query": query,
                    "answers": answers}
 
         return results
 
-    def predict_batch(self, question_doc_list: List[dict], top_k_per_question: Optional[int] = None,
-                      batch_size: Optional[int] = None):
+    def predict_batch(self, query_doc_list: List[dict], top_k: Optional[int] = None, batch_size: Optional[int] = None):
 
         raise NotImplementedError("Batch prediction not yet available in TransformersReader.")
12 changes: 6 additions & 6 deletions haystack/retriever/base.py
@@ -51,7 +51,7 @@ def eval(
     ) -> dict:
         """
         Performs evaluation on the Retriever.
-        Retriever is evaluated based on whether it finds the correct document given the question string and at which
+        Retriever is evaluated based on whether it finds the correct document given the query string and at which
         position in the ranking of documents the correct document is.
 
         | Returns a dict containing the following metrics:
@@ -67,7 +67,7 @@ def eval(
 
         :param label_index: Index/Table in DocumentStore where labeled questions are stored
         :param doc_index: Index/Table in DocumentStore where documents that are used for evaluation are stored
-        :param top_k: How many documents to return per question
+        :param top_k: How many documents to return per query
         :param open_domain: If ``True``, retrieval will be evaluated by checking if the answer string to a question is
                             contained in the retrieved docs (common approach in open-domain QA).
                             If ``False``, retrieval uses a stricter evaluation that checks if the retrieved document ids
@@ -169,17 +169,17 @@ def eval(
 
     def run(
         self,
-        question: str,
+        query: str,
         filters: Optional[dict] = None,
         top_k_retriever: Optional[int] = None,
         top_k_reader: Optional[int] = None,
     ):
         if top_k_retriever:
-            documents = self.retrieve(query=question, filters=filters, top_k=top_k_retriever)
+            documents = self.retrieve(query=query, filters=filters, top_k=top_k_retriever)
         else:
-            documents = self.retrieve(query=question, filters=filters)
+            documents = self.retrieve(query=query, filters=filters)
         output = {
-            "question": question,
+            "query": query,
             "documents": documents,
             "top_k": top_k_reader
         }
4 changes: 2 additions & 2 deletions haystack/retriever/sparse.py
@@ -18,7 +18,7 @@ class ElasticsearchRetriever(BaseRetriever):
     def __init__(self, document_store: ElasticsearchDocumentStore, custom_query: str = None):
         """
         :param document_store: an instance of a DocumentStore to retrieve documents from.
-        :param custom_query: query string as per Elasticsearch DSL with a mandatory question placeholder($question).
+        :param custom_query: query string as per Elasticsearch DSL with a mandatory query placeholder($query).
 
         Optionally, ES `filter` clause can be added where the values of `terms` are placeholders
         that get substituted during runtime. The placeholder(${filter_name_1}, ${filter_name_2}..)
@@ -32,7 +32,7 @@ def __init__(self, document_store: ElasticsearchDocumentStore, custom_query: str
         | "query": {
         |    "bool": {
         |        "should": [{"multi_match": {
-        |            "query": "${question}",  // mandatory $question placeholder
+        |            "query": "${query}",  // mandatory query placeholder
         |            "type": "most_fields",
         |            "fields": ["text", "title"]}}],
         |        "filter": [  // optional custom filters
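A hedged sketch of passing a `custom_query` with the renamed placeholder, mirroring the docstring above (the `year` field and filter values are illustrative):

```python
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever

custom_query = """{
    "query": {
        "bool": {
            "should": [{"multi_match": {
                "query": "${query}",
                "type": "most_fields",
                "fields": ["text", "title"]}}],
            "filter": [{"terms": {"year": ${years}}}]
        }
    }
}"""

document_store = ElasticsearchDocumentStore(host="localhost")  # assumes a running Elasticsearch
retriever = ElasticsearchRetriever(document_store=document_store, custom_query=custom_query)

# filters={"years": [2018]} fills the ${years} placeholder at query time
docs = retriever.retrieve(query="Who is the father of Arya Stark?", filters={"years": [2018]}, top_k=10)
```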
6 changes: 3 additions & 3 deletions haystack/utils.py
@@ -40,19 +40,19 @@ def export_answers_to_csv(agg_results: list, output_file):
     if isinstance(agg_results, dict):
         agg_results = [agg_results]
 
-    assert "question" in agg_results[0], f"Wrong format used for {agg_results[0]}"
+    assert "query" in agg_results[0], f"Wrong format used for {agg_results[0]}"
     assert "answers" in agg_results[0], f"Wrong format used for {agg_results[0]}"
 
     data = {}  # type: Dict[str, List[Any]]
-    data["question"] = []
+    data["query"] = []
     data["prediction"] = []
     data["prediction_rank"] = []
     data["prediction_context"] = []
 
     for res in agg_results:
         for i in range(len(res["answers"])):
             temp = res["answers"][i]
-            data["question"].append(res["question"])
+            data["query"].append(res["query"])
             data["prediction"].append(temp["answer"])
             data["prediction_rank"].append(i + 1)
             data["prediction_context"].append(temp["context"])
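Downstream consumers need the same key change; a minimal sketch of the renamed input format for `export_answers_to_csv` (answer values are made up):

```python
from haystack.utils import export_answers_to_csv

# results now carry "query" instead of "question"; "answers" is unchanged
agg_results = [{
    "query": "Who is the father of Arya Stark?",
    "answers": [
        {"answer": "Eddard", "context": "She travels with her father, Eddard, to King's Landing."},
    ],
}]
export_answers_to_csv(agg_results, output_file="answers.csv")
```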
4 changes: 2 additions & 2 deletions test/conftest.py
@@ -231,14 +231,14 @@ def no_answer_reader(request, transformers_roberta, farm_roberta):
 @pytest.fixture()
 def prediction(reader, test_docs_xs):
     docs = [Document.from_dict(d) if isinstance(d, dict) else d for d in test_docs_xs]
-    prediction = reader.predict(question="Who lives in Berlin?", documents=docs, top_k=5)
+    prediction = reader.predict(query="Who lives in Berlin?", documents=docs, top_k=5)
     return prediction
 
 
 @pytest.fixture()
 def no_answer_prediction(no_answer_reader, test_docs_xs):
     docs = [Document.from_dict(d) if isinstance(d, dict) else d for d in test_docs_xs]
-    prediction = no_answer_reader.predict(question="What is the meaning of life?", documents=docs, top_k=5)
+    prediction = no_answer_reader.predict(query="What is the meaning of life?", documents=docs, top_k=5)
     return prediction