From 4803da009a9ebeaba47613c7ea71573832597f89 Mon Sep 17 00:00:00 2001
From: Timo Moeller
Date: Wed, 20 Jan 2021 14:40:10 +0100
Subject: [PATCH] Using PreProcessor functions on eval data (#751)

* Add eval data splitting

* Adjust for split by passage, add test and test data, adjust docstrings,
  add max_docs to higher-level function
---
 haystack/document_store/base.py       | 38 +++++++----
 haystack/preprocessor/utils.py        | 93 ++++++++++++++++++++-------
 test/samples/squad/tiny.json          |  4 ++
 test/samples/squad/tiny_passages.json | 33 ++++++++++
 test/test_eval.py                     | 51 ++++++++++++++-
 5 files changed, 184 insertions(+), 35 deletions(-)
 create mode 100644 test/samples/squad/tiny_passages.json

diff --git a/haystack/document_store/base.py b/haystack/document_store/base.py
index b160df24ef..dcba2dee66 100644
--- a/haystack/document_store/base.py
+++ b/haystack/document_store/base.py
@@ -4,6 +4,7 @@ from typing import Any, Optional, Dict, List, Union
 
 from haystack import Document, Label, MultiLabel
 from haystack.preprocessor.utils import eval_data_from_json, eval_data_from_jsonl, squad_json_to_jsonl
+from haystack.preprocessor.preprocessor import PreProcessor
 
 logger = logging.getLogger(__name__)
 
@@ -140,28 +141,43 @@ def write_labels(self, labels: Union[List[Label], List[dict]], index: Optional[s
         pass
 
     def add_eval_data(self, filename: str, doc_index: str = "eval_document", label_index: str = "label",
-                      batch_size: Optional[int] = None):
+                      batch_size: Optional[int] = None, preprocessor: Optional[PreProcessor] = None,
+                      max_docs: Optional[int] = None):
         """
         Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it.
         If a jsonl file and a batch_size is passed to the function, documents are loaded batchwise
         from disk and also indexed batchwise to the DocumentStore in order to prevent out of memory errors.
 
         :param filename: Name of the file containing evaluation data (json or jsonl)
-        :type filename: str
         :param doc_index: Elasticsearch index where evaluation documents should be stored
-        :type doc_index: str
         :param label_index: Elasticsearch index where labeled questions should be stored
-        :type label_index: str
-        :param batch_size: Number of documents that are loaded and processed at a time.
-                           Only works with jsonl formatted files. Setting batch_size and
-                           using a json formatted file will convert the json to jsonl prior
-                           to adding eval data.
-        :type batch_size: int
+        :param batch_size: Optional number of documents that are loaded and processed at a time.
+                           When set to None (default), all documents are processed at once.
+        :param preprocessor: Optional PreProcessor to preprocess evaluation documents.
+                             It can be used to split documents into passages (and to assign labels to the corresponding passages).
+                             Currently, the PreProcessor does not support split_by="sentence", cleaning, or split_overlap != 0.
+                             When set to None (default), preprocessing is disabled.
+        :param max_docs: Optional maximum number of documents to load.
+                         When set to None (default), all available eval documents are used.
         """
+        # TODO: improve support for the PreProcessor when adding eval data
+        if preprocessor is not None:
+            assert preprocessor.split_by != "sentence", f"Split by sentence not supported.\n" \
+                                                        f"Please set 'split_by' to either 'word' or 'passage' in the supplied PreProcessor."
+            assert preprocessor.split_overlap == 0, f"Overlapping documents are currently not supported when adding eval data.\n" \
+                                                    f"Please set 'split_overlap=0' in the supplied PreProcessor."
+            assert preprocessor.clean_empty_lines == False, f"clean_empty_lines is currently not supported when adding eval data.\n" \
+                                                            f"Please set 'clean_empty_lines=False' in the supplied PreProcessor."
+            assert preprocessor.clean_whitespace == False, f"clean_whitespace is currently not supported when adding eval data.\n" \
+                                                           f"Please set 'clean_whitespace=False' in the supplied PreProcessor."
+            assert preprocessor.clean_header_footer == False, f"clean_header_footer is currently not supported when adding eval data.\n" \
+                                                              f"Please set 'clean_header_footer=False' in the supplied PreProcessor."
+
         file_path = Path(filename)
         if file_path.suffix == ".json":
             if batch_size is None:
-                docs, labels = eval_data_from_json(filename)
+                docs, labels = eval_data_from_json(filename, max_docs=max_docs, preprocessor=preprocessor)
                 self.write_documents(docs, index=doc_index)
                 self.write_labels(labels, index=label_index)
             else:
@@ -172,7 +188,7 @@ def add_eval_data(self, filename: str, doc_index: str = "eval_document", label_i
                 self.add_eval_data(jsonl_filename, doc_index, label_index, batch_size)
 
         elif file_path.suffix == ".jsonl":
-            for docs, labels in eval_data_from_jsonl(filename, batch_size):
+            for docs, labels in eval_data_from_jsonl(filename, batch_size, max_docs=max_docs, preprocessor=preprocessor):
                 if docs:
                     self.write_documents(docs, index=doc_index)
                 if labels:
diff --git a/haystack/preprocessor/utils.py b/haystack/preprocessor/utils.py
index 81991c2251..0c7558f9f7 100644
--- a/haystack/preprocessor/utils.py
+++ b/haystack/preprocessor/utils.py
@@ -16,12 +16,13 @@
 from haystack.file_converter.tika import TikaConverter
 from haystack import Document, Label
 from haystack.file_converter.txt import TextConverter
+from haystack.preprocessor.preprocessor import PreProcessor
 
 logger = logging.getLogger(__name__)
 
 
-def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None) -> Tuple[List[Document], List[Label]]:
+def eval_data_from_json(filename: str, max_docs: Optional[int] = None, preprocessor: Optional[PreProcessor] = None) -> Tuple[List[Document], List[Label]]:
     """
     Read Documents + Labels from a SQuAD-style file.
     Document and Labels can then be indexed to the DocumentStore and be used for evaluation.
@@ -44,7 +45,7 @@ def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None) -> Tup
             if len(docs) > max_docs:
                 break
             # Extracting paragraphs and their labels from a SQuAD document dict
-            cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document)
+            cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document, preprocessor)
             docs.extend(cur_docs)
             labels.extend(cur_labels)
 
@@ -52,7 +53,7 @@ def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None) -> Tup
 
 def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
-                         max_docs: Union[int, bool] = None) -> Generator[Tuple[List[Document], List[Label]], None, None]:
+                         max_docs: Optional[int] = None, preprocessor: Optional[PreProcessor] = None) -> Generator[Tuple[List[Document], List[Label]], None, None]:
     """
     Read Documents + Labels from a SQuAD-style file in jsonl format, i.e. one document per line.
     Document and Labels can then be indexed to the DocumentStore and be used for evaluation.
@@ -76,7 +77,7 @@ def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
                 break
             # Extracting paragraphs and their labels from a SQuAD document dict
             document_dict = json.loads(document)
-            cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document_dict)
+            cur_docs, cur_labels = _extract_docs_and_labels_from_dict(document_dict, preprocessor)
             docs.extend(cur_docs)
             labels.extend(cur_labels)
 
@@ -89,50 +90,96 @@ def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None,
         yield docs, labels
 
 
-def _extract_docs_and_labels_from_dict(document_dict: Dict):
+def _extract_docs_and_labels_from_dict(document_dict: Dict, preprocessor: Optional[PreProcessor] = None):
     docs = []
     labels = []
 
     # get all extra fields from document level (e.g. title)
     meta_doc = {k: v for k, v in document_dict.items() if k not in ("paragraphs", "title")}
 
     for paragraph in document_dict["paragraphs"]:
+        ## Create Metadata
         cur_meta = {"name": document_dict.get("title", None)}
         # all other fields from paragraph level
         meta_paragraph = {k: v for k, v in paragraph.items() if k not in ("qas", "context")}
         cur_meta.update(meta_paragraph)
         # meta from parent document
         cur_meta.update(meta_doc)
-        # Create Document
+
+        ## Create Document
         cur_doc = Document(text=paragraph["context"], meta=cur_meta)
-        docs.append(cur_doc)
+        if preprocessor is not None:
+            splits_dicts = preprocessor.process(cur_doc.to_dict())
+            # we pull _split_id into the document id so that labels can reference a unique split
+            # TODO: the PreProcessor should work on Documents instead of dicts
+            splits = []
+            offset = 0
+            for d in splits_dicts:
+                split_id = f"{d['id']}-{d['meta']['_split_id']}"
+                d["meta"]["_split_offset"] = offset
+                offset += len(d["text"])
+                # offset correction based on the splitting method
+                if preprocessor.split_by == "word":
+                    offset += 1
+                elif preprocessor.split_by == "passage":
+                    offset += 2
+                else:
+                    raise NotImplementedError
+                mydoc = Document(text=d["text"],
+                                 id=split_id,
+                                 meta=d["meta"])
+                splits.append(mydoc)
+        else:
+            splits = [cur_doc]
+        docs.extend(splits)
 
-        # Get Labels
+        ## Assign Labels to corresponding documents
        for qa in paragraph["qas"]:
-            if len(qa["answers"]) > 0:
+            if not qa["is_impossible"]:
                 for answer in qa["answers"]:
+                    ans = answer["text"]
+                    text_at_answer_start = cur_doc.text[answer["answer_start"]:answer["answer_start"] + len(ans)]
+                    if ans != text_at_answer_start:
+                        logger.warning("Answer text and text at the answer position do not match. Skipping answer.")
Skipping Answer") + break + # find corresponding document or split + if len(splits) == 1: + cur_id = splits[0].id + cur_ans_start = answer["answer_start"] + else: + for s in splits: + # If answer start offset is contained in passage we assign the label to that passage + if (answer["answer_start"] >= s.meta["_split_offset"]) and (answer["answer_start"] < (s.meta["_split_offset"] + len(s.text))): + cur_id = s.id + cur_ans_start = answer["answer_start"] - s.meta["_split_offset"] + # If a document is splitting an answer we add the whole answer text to the document + if s.text[cur_ans_start:cur_ans_start+len(ans)] != ans: + s.text = s.text[:cur_ans_start] + ans + break label = Label( question=qa["question"], - answer=answer["text"], + answer=ans, is_correct_answer=True, is_correct_document=True, - document_id=cur_doc.id, - offset_start_in_doc=answer["answer_start"], + document_id=cur_id, + offset_start_in_doc=cur_ans_start, no_answer=qa["is_impossible"], origin="gold_label", ) labels.append(label) else: - label = Label( - question=qa["question"], - answer="", - is_correct_answer=True, - is_correct_document=True, - document_id=cur_doc.id, - offset_start_in_doc=0, - no_answer=qa["is_impossible"], - origin="gold_label", - ) - labels.append(label) + # for no_answer we need to assign each split as not fitting to the question + for s in splits: + label = Label( + question=qa["question"], + answer="", + is_correct_answer=True, + is_correct_document=True, + document_id=s.id, + offset_start_in_doc=0, + no_answer=qa["is_impossible"], + origin="gold_label", + ) + labels.append(label) return docs, labels diff --git a/test/samples/squad/tiny.json b/test/samples/squad/tiny.json index e2d3ced871..16bf295264 100644 --- a/test/samples/squad/tiny.json +++ b/test/samples/squad/tiny.json @@ -15,6 +15,10 @@ { "answer_start": 42, "text": "Abdul" + }, + { + "answer_start": 11, + "text": "Carla and I live together with Abdul" } ], "id": 7211011040021040393, diff --git a/test/samples/squad/tiny_passages.json b/test/samples/squad/tiny_passages.json new file mode 100644 index 0000000000..42326e3f11 --- /dev/null +++ b/test/samples/squad/tiny_passages.json @@ -0,0 +1,33 @@ +{ + "data": [ + { + "title": "test1", + "paragraphs": [ + { + "context": "My name is Carla and I live together with Abdul in Berlin. 
\n\nThis is a new passage saying Leila lives in Berlin, too.", + "qas": [ + { + "answers": [ + { + "answer_start": 11, + "text": "Carla" + }, + { + "answer_start": 42, + "text": "Abdul" + }, + { + "answer_start": 89, + "text": "Leila" + } + ], + "id": 7211011040021040393, + "question": "Who lives in Berlin?", + "is_impossible": false + } + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/test/test_eval.py b/test/test_eval.py index 8e34eff28a..f2733dd36c 100644 --- a/test/test_eval.py +++ b/test/test_eval.py @@ -1,5 +1,7 @@ import pytest from haystack.document_store.base import BaseDocumentStore +from haystack.document_store.memory import InMemoryDocumentStore +from haystack.preprocessor.preprocessor import PreProcessor from haystack.finder import Finder @@ -159,4 +161,51 @@ def test_eval_finder(document_store: BaseDocumentStore, reader, retriever): # clean up document_store.delete_all_documents(index="test_eval_document") - document_store.delete_all_documents(index="test_feedback") \ No newline at end of file + document_store.delete_all_documents(index="test_feedback") + +@pytest.mark.elasticsearch +def test_eval_data_splitting(document_store): + # splitting by word + document_store.delete_all_documents(index="test_eval_document") + document_store.delete_all_documents(index="test_feedback") + + preprocessor = PreProcessor( + clean_empty_lines=False, + clean_whitespace=False, + clean_header_footer=False, + split_by="word", + split_length=4, + split_overlap=0, + split_respect_sentence_boundary=False + ) + + document_store.add_eval_data(filename="samples/squad/tiny.json", + doc_index="test_eval_document", + label_index="test_feedback", + preprocessor=preprocessor) + labels = document_store.get_all_labels_aggregated(index="test_feedback") + docs = document_store.get_all_documents(index="test_eval_document") + assert len(docs) == 5 + assert len(set(labels[0].multiple_document_ids)) == 2 + + # splitting by passage + document_store.delete_all_documents(index="test_eval_document") + document_store.delete_all_documents(index="test_feedback") + + preprocessor = PreProcessor( + clean_empty_lines=False, + clean_whitespace=False, + clean_header_footer=False, + split_by="passage", + split_length=1, + split_overlap=0, + split_respect_sentence_boundary=False + ) + + document_store.add_eval_data(filename="samples/squad/tiny_passages.json", + doc_index="test_eval_document", + label_index="test_feedback", + preprocessor=preprocessor) + docs = document_store.get_all_documents(index="test_eval_document") + assert len(docs) == 2 + assert len(docs[1].text) == 56 \ No newline at end of file
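
Usage sketch: the snippet below shows how the new 'preprocessor' and 'max_docs' parameters of add_eval_data fit together. It is a minimal example, not code from this patch: it assumes a local Elasticsearch instance behind ElasticsearchDocumentStore, reuses the sample file from the tests above, and the max_docs value of 10 is an arbitrary illustration.

    from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
    from haystack.preprocessor.preprocessor import PreProcessor

    # The PreProcessor has to satisfy the asserts in add_eval_data:
    # no cleaning, split_overlap=0, and split_by set to "word" or "passage".
    preprocessor = PreProcessor(
        clean_empty_lines=False,
        clean_whitespace=False,
        clean_header_footer=False,
        split_by="word",
        split_length=4,
        split_overlap=0,
        split_respect_sentence_boundary=False,
    )

    document_store = ElasticsearchDocumentStore()  # assumes Elasticsearch on localhost:9200
    document_store.add_eval_data(
        filename="samples/squad/tiny.json",
        doc_index="eval_document",
        label_index="label",
        preprocessor=preprocessor,
        max_docs=10,  # illustrative cap; None (the default) loads all documents
    )

    # Labels now point at the split documents (ids carry a "-<_split_id>" suffix),
    # with answer offsets re-anchored relative to each split.
    labels = document_store.get_all_labels_aggregated(index="label")
    docs = document_store.get_all_documents(index="eval_document")

With split_by="passage" instead of "word", contexts are split on blank lines, as exercised by tiny_passages.json in the test above.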