Skip to content

Commit

Permalink
Using PreProcessor functions on eval data (#751)
Browse files Browse the repository at this point in the history
* Add eval data splitting

* Adjust for split by passage, add test and test data, adjust docstrings, add max_docs to higher level function
  • Loading branch information
Timoeller authored Jan 20, 2021
1 parent aa8a366 commit 4803da0
Show file tree
Hide file tree
Showing 5 changed files with 184 additions and 35 deletions.
38 changes: 27 additions & 11 deletions haystack/document_store/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import Any, Optional, Dict, List, Union
from haystack import Document, Label, MultiLabel
from haystack.preprocessor.utils import eval_data_from_json, eval_data_from_jsonl, squad_json_to_jsonl
from haystack.preprocessor.preprocessor import PreProcessor


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -140,28 +141,43 @@ def write_labels(self, labels: Union[List[Label], List[dict]], index: Optional[s
pass

def add_eval_data(self, filename: str, doc_index: str = "eval_document", label_index: str = "label",
batch_size: Optional[int] = None):
batch_size: Optional[int] = None, preprocessor: Optional[PreProcessor] = None,
max_docs: Union[int, bool] = None):
"""
Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it.
If a jsonl file and a batch_size is passed to the function, documents are loaded batchwise
from disk and also indexed batchwise to the DocumentStore in order to prevent out of memory errors.
:param filename: Name of the file containing evaluation data (json or jsonl)
:type filename: str
:param doc_index: Elasticsearch index where evaluation documents should be stored
:type doc_index: str
:param label_index: Elasticsearch index where labeled questions should be stored
:type label_index: str
:param batch_size: Number of documents that are loaded and processed at a time.
Only works with jsonl formatted files. Setting batch_size and
using a json formatted file will convert the json to jsonl prior
to adding eval data.
:type batch_size: int
:param batch_size: Optional number of documents that are loaded and processed at a time.
When set to None (default) all documents are processed at once.
:param preprocessor: Optional PreProcessor to preprocess evaluation documents.
It can be used for splitting documents into passages (and assigning labels to corresponding passages).
Currently the PreProcessor does not support split_by sentence, cleaning nor split_overlap != 0.
When set to None (default) preprocessing is disabled.
:param max_docs: Optional number of documents that will be loaded.
When set to None (default) all available eval documents are used.
"""
# TODO improve support for PreProcessor when adding eval data
if preprocessor is not None:
assert preprocessor.split_by != "sentence", f"Split by sentence not supported.\n" \
f"Please set 'split_by' to either 'word' or 'passage' in the supplied PreProcessor."
assert preprocessor.split_overlap == 0, f"Overlapping documents are currently not supported when adding eval data.\n" \
f"Please set 'split_overlap=0' in the supplied PreProcessor."
assert preprocessor.clean_empty_lines == False, f"clean_empty_lines currently not supported when adding eval data.\n" \
f"Please set 'clean_empty_lines=False' in the supplied PreProcessor."
assert preprocessor.clean_whitespace == False, f"clean_whitespace is currently not supported when adding eval data.\n" \
f"Please set 'clean_whitespace=False' in the supplied PreProcessor."
assert preprocessor.clean_header_footer == False, f"clean_header_footer is currently not supported when adding eval data.\n" \
f"Please set 'clean_header_footer=False' in the supplied PreProcessor."

file_path = Path(filename)
if file_path.suffix == ".json":
if batch_size is None:
docs, labels = eval_data_from_json(filename)
docs, labels = eval_data_from_json(filename, max_docs=max_docs, preprocessor=preprocessor)
self.write_documents(docs, index=doc_index)
self.write_labels(labels, index=label_index)
else:
Expand All @@ -172,7 +188,7 @@ def add_eval_data(self, filename: str, doc_index: str = "eval_document", label_i
self.add_eval_data(jsonl_filename, doc_index, label_index, batch_size)

elif file_path.suffix == ".jsonl":
for docs, labels in eval_data_from_jsonl(filename, batch_size):
for docs, labels in eval_data_from_jsonl(filename, batch_size, max_docs=max_docs, preprocessor=preprocessor):
if docs:
self.write_documents(docs, index=doc_index)
if labels:
Expand Down
93 changes: 70 additions & 23 deletions haystack/preprocessor/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@
from haystack.file_converter.tika import TikaConverter
from haystack import Document, Label
from haystack.file_converter.txt import TextConverter
from haystack.preprocessor.preprocessor import PreProcessor

logger = logging.getLogger(__name__)



def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None) -> Tuple[List[Document], List[Label]]:
def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None, preprocessor: PreProcessor = None) -> Tuple[List[Document], List[Label]]:
"""
Read Documents + Labels from a SQuAD-style file.
Document and Labels can then be indexed to the DocumentStore and be used for evaluation.
Expand All @@ -44,15 +45,15 @@ def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None) -> Tup
if len(docs) > max_docs:
break
# Extracting paragraphs and their labels from a SQuAD document dict
cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document)
cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document, preprocessor)
docs.extend(cur_docs)
labels.extend(cur_labels)

return docs, labels


def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
max_docs: Union[int, bool] = None) -> Generator[Tuple[List[Document], List[Label]], None, None]:
max_docs: Union[int, bool] = None, preprocessor: PreProcessor = None) -> Generator[Tuple[List[Document], List[Label]], None, None]:
"""
Read Documents + Labels from a SQuAD-style file in jsonl format, i.e. one document per line.
Document and Labels can then be indexed to the DocumentStore and be used for evaluation.
Expand All @@ -76,7 +77,7 @@ def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
break
# Extracting paragraphs and their labels from a SQuAD document dict
document_dict = json.loads(document)
cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document_dict)
cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document_dict, preprocessor)
docs.extend(cur_docs)
labels.extend(cur_labels)

Expand All @@ -89,50 +90,96 @@ def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
yield docs, labels


def _extract_docs_and_labels_from_dict(document_dict: Dict):
def _extract_docs_and_labels_from_dict(document_dict: Dict, preprocessor: PreProcessor = None):
docs = []
labels = []

# get all extra fields from document level (e.g. title)
meta_doc = {k: v for k, v in document_dict.items() if k not in ("paragraphs", "title")}
for paragraph in document_dict["paragraphs"]:
## Create Metadata
cur_meta = {"name": document_dict.get("title", None)}
# all other fields from paragraph level
meta_paragraph = {k: v for k, v in paragraph.items() if k not in ("qas", "context")}
cur_meta.update(meta_paragraph)
# meta from parent document
cur_meta.update(meta_doc)
# Create Document

## Create Document
cur_doc = Document(text=paragraph["context"], meta=cur_meta)
docs.append(cur_doc)
if preprocessor is not None:
splits_dicts = preprocessor.process(cur_doc.to_dict())
# we need to pull in _split_id into the document id for unique reference in labels
# todo: PreProcessor should work on Documents instead of dicts
splits = []
offset = 0
for d in splits_dicts:
id = f"{d['id']}-{d['meta']['_split_id']}"
d["meta"]["_split_offset"] = offset
offset += len(d["text"])
# offset correction based on splitting method
if preprocessor.split_by == "word":
offset += 1
elif preprocessor.split_by == "passage":
offset += 2
else:
raise NotImplementedError
mydoc = Document(text=d["text"],
id=id,
meta=d["meta"])
splits.append(mydoc)
else:
splits = [cur_doc]
docs.extend(splits)

# Get Labels
## Assign Labels to corresponding documents
for qa in paragraph["qas"]:
if len(qa["answers"]) > 0:
if not qa["is_impossible"]:
for answer in qa["answers"]:
ans = answer["text"]
ans_position = cur_doc.text[answer["answer_start"]:answer["answer_start"]+len(ans)]
if ans != ans_position:
logger.warning(f"Answer Text and Answer position mismatch. Skipping Answer")
break
# find corresponding document or split
if len(splits) == 1:
cur_id = splits[0].id
cur_ans_start = answer["answer_start"]
else:
for s in splits:
# If answer start offset is contained in passage we assign the label to that passage
if (answer["answer_start"] >= s.meta["_split_offset"]) and (answer["answer_start"] < (s.meta["_split_offset"] + len(s.text))):
cur_id = s.id
cur_ans_start = answer["answer_start"] - s.meta["_split_offset"]
# If a document is splitting an answer we add the whole answer text to the document
if s.text[cur_ans_start:cur_ans_start+len(ans)] != ans:
s.text = s.text[:cur_ans_start] + ans
break
label = Label(
question=qa["question"],
answer=answer["text"],
answer=ans,
is_correct_answer=True,
is_correct_document=True,
document_id=cur_doc.id,
offset_start_in_doc=answer["answer_start"],
document_id=cur_id,
offset_start_in_doc=cur_ans_start,
no_answer=qa["is_impossible"],
origin="gold_label",
)
labels.append(label)
else:
label = Label(
question=qa["question"],
answer="",
is_correct_answer=True,
is_correct_document=True,
document_id=cur_doc.id,
offset_start_in_doc=0,
no_answer=qa["is_impossible"],
origin="gold_label",
)
labels.append(label)
# for no_answer we need to assign each split as not fitting to the question
for s in splits:
label = Label(
question=qa["question"],
answer="",
is_correct_answer=True,
is_correct_document=True,
document_id=s.id,
offset_start_in_doc=0,
no_answer=qa["is_impossible"],
origin="gold_label",
)
labels.append(label)

return docs, labels

Expand Down
4 changes: 4 additions & 0 deletions test/samples/squad/tiny.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@
{
"answer_start": 42,
"text": "Abdul"
},
{
"answer_start": 11,
"text": "Carla and I live together with Abdul"
}
],
"id": 7211011040021040393,
Expand Down
33 changes: 33 additions & 0 deletions test/samples/squad/tiny_passages.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"data": [
{
"title": "test1",
"paragraphs": [
{
"context": "My name is Carla and I live together with Abdul in Berlin. \n\nThis is a new passage saying Leila lives in Berlin, too.",
"qas": [
{
"answers": [
{
"answer_start": 11,
"text": "Carla"
},
{
"answer_start": 42,
"text": "Abdul"
},
{
"answer_start": 89,
"text": "Leila"
}
],
"id": 7211011040021040393,
"question": "Who lives in Berlin?",
"is_impossible": false
}
]
}
]
}
]
}
51 changes: 50 additions & 1 deletion test/test_eval.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import pytest
from haystack.document_store.base import BaseDocumentStore
from haystack.document_store.memory import InMemoryDocumentStore
from haystack.preprocessor.preprocessor import PreProcessor
from haystack.finder import Finder


Expand Down Expand Up @@ -159,4 +161,51 @@ def test_eval_finder(document_store: BaseDocumentStore, reader, retriever):

# clean up
document_store.delete_all_documents(index="test_eval_document")
document_store.delete_all_documents(index="test_feedback")
document_store.delete_all_documents(index="test_feedback")

@pytest.mark.elasticsearch
def test_eval_data_splitting(document_store):
# splitting by word
document_store.delete_all_documents(index="test_eval_document")
document_store.delete_all_documents(index="test_feedback")

preprocessor = PreProcessor(
clean_empty_lines=False,
clean_whitespace=False,
clean_header_footer=False,
split_by="word",
split_length=4,
split_overlap=0,
split_respect_sentence_boundary=False
)

document_store.add_eval_data(filename="samples/squad/tiny.json",
doc_index="test_eval_document",
label_index="test_feedback",
preprocessor=preprocessor)
labels = document_store.get_all_labels_aggregated(index="test_feedback")
docs = document_store.get_all_documents(index="test_eval_document")
assert len(docs) == 5
assert len(set(labels[0].multiple_document_ids)) == 2

# splitting by passage
document_store.delete_all_documents(index="test_eval_document")
document_store.delete_all_documents(index="test_feedback")

preprocessor = PreProcessor(
clean_empty_lines=False,
clean_whitespace=False,
clean_header_footer=False,
split_by="passage",
split_length=1,
split_overlap=0,
split_respect_sentence_boundary=False
)

document_store.add_eval_data(filename="samples/squad/tiny_passages.json",
doc_index="test_eval_document",
label_index="test_feedback",
preprocessor=preprocessor)
docs = document_store.get_all_documents(index="test_eval_document")
assert len(docs) == 2
assert len(docs[1].text) == 56

0 comments on commit 4803da0

Please sign in to comment.