From 4803da009a9ebeaba47613c7ea71573832597f89 Mon Sep 17 00:00:00 2001
From: Timo Moeller
Date: Wed, 20 Jan 2021 14:40:10 +0100
Subject: [PATCH] Using PreProcessor functions on eval data (#751)

* Add eval data splitting

* Adjust for split by passage, add test and test data, adjust docstrings,
  add max_docs to higher-level function
---
 haystack/document_store/base.py       | 38 +++++++----
 haystack/preprocessor/utils.py        | 93 ++++++++++++++++++++-------
 test/samples/squad/tiny.json          |  4 ++
 test/samples/squad/tiny_passages.json | 33 ++++++++++
 test/test_eval.py                     | 51 ++++++++++++++-
 5 files changed, 184 insertions(+), 35 deletions(-)
 create mode 100644 test/samples/squad/tiny_passages.json

diff --git a/haystack/document_store/base.py b/haystack/document_store/base.py
index b160df24ef..dcba2dee66 100644
--- a/haystack/document_store/base.py
+++ b/haystack/document_store/base.py
@@ -4,6 +4,7 @@ from typing import Any, Optional, Dict, List, Union
 
 from haystack import Document, Label, MultiLabel
 from haystack.preprocessor.utils import eval_data_from_json, eval_data_from_jsonl, squad_json_to_jsonl
+from haystack.preprocessor.preprocessor import PreProcessor
 
 logger = logging.getLogger(__name__)
 
@@ -140,28 +141,43 @@ def write_labels(self, labels: Union[List[Label], List[dict]], index: Optional[s
         pass
 
     def add_eval_data(self, filename: str, doc_index: str = "eval_document", label_index: str = "label",
-                      batch_size: Optional[int] = None):
+                      batch_size: Optional[int] = None, preprocessor: Optional[PreProcessor] = None,
+                      max_docs: Optional[int] = None):
         """
         Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it.
         If a jsonl file and a batch_size is passed to the function, documents are loaded batchwise
         from disk and also indexed batchwise to the DocumentStore in order to prevent out of memory errors.
 
         :param filename: Name of the file containing evaluation data (json or jsonl)
-        :type filename: str
         :param doc_index: Elasticsearch index where evaluation documents should be stored
-        :type doc_index: str
         :param label_index: Elasticsearch index where labeled questions should be stored
-        :type label_index: str
-        :param batch_size: Number of documents that are loaded and processed at a time.
-                           Only works with jsonl formatted files. Setting batch_size and
-                           using a json formatted file will convert the json to jsonl prior
-                           to adding eval data.
-        :type batch_size: int
+        :param batch_size: Optional number of documents that are loaded and processed at a time.
+                           When set to None (default), all documents are processed at once.
+        :param preprocessor: Optional PreProcessor to preprocess evaluation documents.
+                             It can be used to split documents into passages (and to assign labels to the corresponding passages).
+                             Currently, the PreProcessor does not support split_by="sentence", cleaning, or split_overlap != 0.
+                             When set to None (default), preprocessing is disabled.
+        :param max_docs: Optional maximum number of documents to load.
+                         When set to None (default), all available eval documents are used.
         """
+        # TODO: improve support for the PreProcessor when adding eval data
+        if preprocessor is not None:
+            assert preprocessor.split_by != "sentence", f"Split by sentence not supported.\n" \
+                                                        f"Please set 'split_by' to either 'word' or 'passage' in the supplied PreProcessor."
+            assert preprocessor.split_overlap == 0, f"Overlapping documents are currently not supported when adding eval data.\n" \
+                                                    f"Please set 'split_overlap=0' in the supplied PreProcessor."
+            assert preprocessor.clean_empty_lines == False, f"clean_empty_lines is currently not supported when adding eval data.\n" \
+                                                            f"Please set 'clean_empty_lines=False' in the supplied PreProcessor."
+            assert preprocessor.clean_whitespace == False, f"clean_whitespace is currently not supported when adding eval data.\n" \
+                                                           f"Please set 'clean_whitespace=False' in the supplied PreProcessor."
+            assert preprocessor.clean_header_footer == False, f"clean_header_footer is currently not supported when adding eval data.\n" \
+                                                              f"Please set 'clean_header_footer=False' in the supplied PreProcessor."
+
         file_path = Path(filename)
         if file_path.suffix == ".json":
             if batch_size is None:
-                docs, labels = eval_data_from_json(filename)
+                docs, labels = eval_data_from_json(filename, max_docs=max_docs, preprocessor=preprocessor)
                 self.write_documents(docs, index=doc_index)
                 self.write_labels(labels, index=label_index)
             else:
@@ -172,7 +188,7 @@ def add_eval_data(self, filename: str, doc_index: str = "eval_document", label_i
                 self.add_eval_data(jsonl_filename, doc_index, label_index, batch_size)
 
         elif file_path.suffix == ".jsonl":
-            for docs, labels in eval_data_from_jsonl(filename, batch_size):
+            for docs, labels in eval_data_from_jsonl(filename, batch_size, max_docs=max_docs, preprocessor=preprocessor):
                 if docs:
                     self.write_documents(docs, index=doc_index)
                 if labels:
diff --git a/haystack/preprocessor/utils.py b/haystack/preprocessor/utils.py
index 81991c2251..0c7558f9f7 100644
--- a/haystack/preprocessor/utils.py
+++ b/haystack/preprocessor/utils.py
@@ -16,12 +16,13 @@
 from haystack.file_converter.tika import TikaConverter
 from haystack import Document, Label
 from haystack.file_converter.txt import TextConverter
+from haystack.preprocessor.preprocessor import PreProcessor
 
 logger = logging.getLogger(__name__)
 
 
-def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None) -> Tuple[List[Document], List[Label]]:
+def eval_data_from_json(filename: str, max_docs: Optional[int] = None, preprocessor: Optional[PreProcessor] = None) -> Tuple[List[Document], List[Label]]:
     """
     Read Documents + Labels from a SQuAD-style file.
     Document and Labels can then be indexed to the DocumentStore and be used for evaluation.
@@ -44,7 +45,7 @@ def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None) -> Tup
             if len(docs) > max_docs:
                 break
             # Extracting paragraphs and their labels from a SQuAD document dict
-            cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document)
+            cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document, preprocessor)
             docs.extend(cur_docs)
             labels.extend(cur_labels)
 
@@ -52,7 +53,7 @@ def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None) -> Tup
 
 def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
-                         max_docs: Union[int, bool] = None) -> Generator[Tuple[List[Document], List[Label]], None, None]:
+                         max_docs: Optional[int] = None, preprocessor: Optional[PreProcessor] = None) -> Generator[Tuple[List[Document], List[Label]], None, None]:
     """
     Read Documents + Labels from a SQuAD-style file in jsonl format, i.e. one document per line.
     Document and Labels can then be indexed to the DocumentStore and be used for evaluation.
@@ -76,7 +77,7 @@ def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
                 break
             # Extracting paragraphs and their labels from a SQuAD document dict
             document_dict = json.loads(document)
-            cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document_dict)
+            cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document_dict, preprocessor)
             docs.extend(cur_docs)
             labels.extend(cur_labels)
 
@@ -89,50 +90,96 @@ def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
         yield docs, labels
 
 
-def _extract_docs_and_labels_from_dict(document_dict: Dict):
+def _extract_docs_and_labels_from_dict(document_dict: Dict, preprocessor: Optional[PreProcessor] = None):
     docs = []
     labels = []
 
     # get all extra fields from document level (e.g. title)
     meta_doc = {k: v for k, v in document_dict.items() if k not in ("paragraphs", "title")}
 
     for paragraph in document_dict["paragraphs"]:
+        ## Create Metadata
         cur_meta = {"name": document_dict.get("title", None)}
         # all other fields from paragraph level
         meta_paragraph = {k: v for k, v in paragraph.items() if k not in ("qas", "context")}
         cur_meta.update(meta_paragraph)
         # meta from parent document
         cur_meta.update(meta_doc)
-        # Create Document
+
+        ## Create Document
         cur_doc = Document(text=paragraph["context"], meta=cur_meta)
-        docs.append(cur_doc)
+        if preprocessor is not None:
+            splits_dicts = preprocessor.process(cur_doc.to_dict())
+            # we pull _split_id into the document id so that labels can reference a unique split
+            # TODO: the PreProcessor should work on Documents instead of dicts
+            splits = []
+            offset = 0
+            for d in splits_dicts:
+                split_id = f"{d['id']}-{d['meta']['_split_id']}"
+                d["meta"]["_split_offset"] = offset
+                offset += len(d["text"])
+                # offset correction based on the splitting method
+                if preprocessor.split_by == "word":
+                    offset += 1
+                elif preprocessor.split_by == "passage":
+                    offset += 2
+                else:
+                    raise NotImplementedError
+                mydoc = Document(text=d["text"],
+                                 id=split_id,
+                                 meta=d["meta"])
+                splits.append(mydoc)
+        else:
+            splits = [cur_doc]
+        docs.extend(splits)
 
-        # Get Labels
+        ## Assign Labels to corresponding documents
        for qa in paragraph["qas"]:
-            if len(qa["answers"]) > 0:
+            if not qa["is_impossible"]:
                 for answer in qa["answers"]:
+                    ans = answer["text"]
+                    text_at_answer_start = cur_doc.text[answer["answer_start"]:answer["answer_start"] + len(ans)]
+                    if ans != text_at_answer_start:
+                        logger.warning("Answer text and text at the answer position do not match. Skipping answer.")
Skipping Answer") + break + # find corresponding document or split + if len(splits) == 1: + cur_id = splits[0].id + cur_ans_start = answer["answer_start"] + else: + for s in splits: + # If answer start offset is contained in passage we assign the label to that passage + if (answer["answer_start"] >= s.meta["_split_offset"]) and (answer["answer_start"] < (s.meta["_split_offset"] + len(s.text))): + cur_id = s.id + cur_ans_start = answer["answer_start"] - s.meta["_split_offset"] + # If a document is splitting an answer we add the whole answer text to the document + if s.text[cur_ans_start:cur_ans_start+len(ans)] != ans: + s.text = s.text[:cur_ans_start] + ans + break label = Label( question=qa["question"], - answer=answer["text"], + answer=ans, is_correct_answer=True, is_correct_document=True, - document_id=cur_doc.id, - offset_start_in_doc=answer["answer_start"], + document_id=cur_id, + offset_start_in_doc=cur_ans_start, no_answer=qa["is_impossible"], origin="gold_label", ) labels.append(label) else: - label = Label( - question=qa["question"], - answer="", - is_correct_answer=True, - is_correct_document=True, - document_id=cur_doc.id, - offset_start_in_doc=0, - no_answer=qa["is_impossible"], - origin="gold_label", - ) - labels.append(label) + # for no_answer we need to assign each split as not fitting to the question + for s in splits: + label = Label( + question=qa["question"], + answer="", + is_correct_answer=True, + is_correct_document=True, + document_id=s.id, + offset_start_in_doc=0, + no_answer=qa["is_impossible"], + origin="gold_label", + ) + labels.append(label) return docs, labels diff --git a/test/samples/squad/tiny.json b/test/samples/squad/tiny.json index e2d3ced871..16bf295264 100644 --- a/test/samples/squad/tiny.json +++ b/test/samples/squad/tiny.json @@ -15,6 +15,10 @@ { "answer_start": 42, "text": "Abdul" + }, + { + "answer_start": 11, + "text": "Carla and I live together with Abdul" } ], "id": 7211011040021040393, diff --git a/test/samples/squad/tiny_passages.json b/test/samples/squad/tiny_passages.json new file mode 100644 index 0000000000..42326e3f11 --- /dev/null +++ b/test/samples/squad/tiny_passages.json @@ -0,0 +1,33 @@ +{ + "data": [ + { + "title": "test1", + "paragraphs": [ + { + "context": "My name is Carla and I live together with Abdul in Berlin. 
\n\nThis is a new passage saying Leila lives in Berlin, too.", + "qas": [ + { + "answers": [ + { + "answer_start": 11, + "text": "Carla" + }, + { + "answer_start": 42, + "text": "Abdul" + }, + { + "answer_start": 89, + "text": "Leila" + } + ], + "id": 7211011040021040393, + "question": "Who lives in Berlin?", + "is_impossible": false + } + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/test/test_eval.py b/test/test_eval.py index 8e34eff28a..f2733dd36c 100644 --- a/test/test_eval.py +++ b/test/test_eval.py @@ -1,5 +1,7 @@ import pytest from haystack.document_store.base import BaseDocumentStore +from haystack.document_store.memory import InMemoryDocumentStore +from haystack.preprocessor.preprocessor import PreProcessor from haystack.finder import Finder @@ -159,4 +161,51 @@ def test_eval_finder(document_store: BaseDocumentStore, reader, retriever): # clean up document_store.delete_all_documents(index="test_eval_document") - document_store.delete_all_documents(index="test_feedback") \ No newline at end of file + document_store.delete_all_documents(index="test_feedback") + +@pytest.mark.elasticsearch +def test_eval_data_splitting(document_store): + # splitting by word + document_store.delete_all_documents(index="test_eval_document") + document_store.delete_all_documents(index="test_feedback") + + preprocessor = PreProcessor( + clean_empty_lines=False, + clean_whitespace=False, + clean_header_footer=False, + split_by="word", + split_length=4, + split_overlap=0, + split_respect_sentence_boundary=False + ) + + document_store.add_eval_data(filename="samples/squad/tiny.json", + doc_index="test_eval_document", + label_index="test_feedback", + preprocessor=preprocessor) + labels = document_store.get_all_labels_aggregated(index="test_feedback") + docs = document_store.get_all_documents(index="test_eval_document") + assert len(docs) == 5 + assert len(set(labels[0].multiple_document_ids)) == 2 + + # splitting by passage + document_store.delete_all_documents(index="test_eval_document") + document_store.delete_all_documents(index="test_feedback") + + preprocessor = PreProcessor( + clean_empty_lines=False, + clean_whitespace=False, + clean_header_footer=False, + split_by="passage", + split_length=1, + split_overlap=0, + split_respect_sentence_boundary=False + ) + + document_store.add_eval_data(filename="samples/squad/tiny_passages.json", + doc_index="test_eval_document", + label_index="test_feedback", + preprocessor=preprocessor) + docs = document_store.get_all_documents(index="test_eval_document") + assert len(docs) == 2 + assert len(docs[1].text) == 56 \ No newline at end of file
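
Usage sketch: the snippet below shows how the new 'preprocessor' and 'max_docs' parameters of add_eval_data fit together. It is a minimal example, not code from this patch: it assumes a local Elasticsearch instance behind ElasticsearchDocumentStore, reuses the sample file from the tests above, and the max_docs value of 10 is an arbitrary illustration.

    from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
    from haystack.preprocessor.preprocessor import PreProcessor

    # The PreProcessor has to satisfy the asserts in add_eval_data:
    # no cleaning, split_overlap=0, and split_by set to "word" or "passage".
    preprocessor = PreProcessor(
        clean_empty_lines=False,
        clean_whitespace=False,
        clean_header_footer=False,
        split_by="word",
        split_length=4,
        split_overlap=0,
        split_respect_sentence_boundary=False,
    )

    document_store = ElasticsearchDocumentStore()  # assumes Elasticsearch on localhost:9200
    document_store.add_eval_data(
        filename="samples/squad/tiny.json",
        doc_index="eval_document",
        label_index="label",
        preprocessor=preprocessor,
        max_docs=10,  # illustrative cap; None (the default) loads all documents
    )

    # Labels now point at the split documents (ids carry a "-<_split_id>" suffix),
    # with answer offsets re-anchored relative to each split.
    labels = document_store.get_all_labels_aggregated(index="label")
    docs = document_store.get_all_documents(index="eval_document")

With split_by="passage" instead of "word", contexts are split on blank lines, as exercised by tiny_passages.json in the test above.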