Add evaluation and document conversion to tutorial 15 #2325

Merged · 16 commits · Mar 29, 2022
123 changes: 89 additions & 34 deletions docs/_src/tutorials/tutorials/15.md
@@ -84,7 +84,7 @@ document_store = ElasticsearchDocumentStore(
```

## Add Tables to DocumentStore
-To quickly demonstrate the capabilities of the `TableTextRetriever` and the `TableReader`, we use a subset of 1000 tables from the [Open Table-and-Text Question Answering (OTT-QA) dataset](https://github.com/wenhuchen/OTT-QA).
+To quickly demonstrate the capabilities of the `TableTextRetriever` and the `TableReader`, we use a subset of 1000 tables and text documents from a dataset we have published in [this paper](https://arxiv.org/abs/2108.04049).

Just like text passages, tables are represented as `Document` objects in Haystack. The content field, though, is a pandas DataFrame instead of a string.
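For instance, a minimal table could look like this (an illustrative sketch with made-up data; in Haystack, this DataFrame would become the `content` of a `Document` with `content_type="table"`):

```python
import pandas as pd

# Illustrative table with made-up data; in Haystack, this DataFrame would
# become the content of a Document with content_type="table".
table_df = pd.DataFrame(
    columns=["Name", "Height (m)"],
    data=[["Tower A", 300.0], ["Tower B", 280.0]],
)
print(table_df.shape)  # (2, 2)
```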

@@ -95,7 +95,7 @@ Just as text passages, tables are represented as `Document` objects in Haystack.
from haystack.utils import fetch_archive_from_http

doc_dir = "data/tutorial15"
-s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/ottqa_sample.zip"
+s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/table_text_dataset.zip"
> **@julian-risch** (Member) commented on Mar 21, 2022:
>
> @MichelBartels Once the dataset is uploaded, please replace the line in the dictionary here:
>
> `"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/ottqa_tables_sample.json.zip": "15",`

fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
```

@@ -108,28 +108,21 @@ from haystack import Document
import pandas as pd


-def read_ottqa_tables(filename):
+def read_tables(filename):
    processed_tables = []
    with open(filename) as tables:
        tables = json.load(tables)
        for key, table in tables.items():
            current_columns = table["header"]
            current_rows = table["data"]
            current_df = pd.DataFrame(columns=current_columns, data=current_rows)
-            current_doc_title = table["title"]
-            current_section_title = table["section_title"]
-            document = Document(
-                content=current_df,
-                content_type="table",
-                meta={"title": current_doc_title, "section_title": current_section_title},
-                id=key,
-            )
+            document = Document(content=current_df, content_type="table", id=key)
            processed_tables.append(document)

    return processed_tables


-tables = read_ottqa_tables(f"{doc_dir}/ottqa_tables_sample.json")
+tables = read_tables(f"{doc_dir}/tables.json")
document_store.write_documents(tables, index=document_index)

# Showing content field and meta field of one of the Documents of content_type 'table'
@@ -161,7 +154,6 @@ retriever = TableTextRetriever(
query_embedding_model="deepset/bert-small-mm_retrieval-question_encoder",
passage_embedding_model="deepset/bert-small-mm_retrieval-passage_encoder",
table_embedding_model="deepset/bert-small-mm_retrieval-table_encoder",
-    embed_meta_fields=["title", "section_title"],
)
```

@@ -183,7 +175,7 @@ document_store.update_embeddings(retriever=retriever)
# Try the Retriever
from haystack.utils import print_documents

-retrieved_tables = retriever.retrieve("How many twin buildings are under construction?", top_k=5)
+retrieved_tables = retriever.retrieve("Who won the Super Bowl?", top_k=5)
# Get highest scored table
print(retrieved_tables[0].content)
```
@@ -202,24 +194,22 @@ reader = TableReader(model_name_or_path="google/tapas-base-finetuned-wtq", max_s


```python
-# Try the TableReader on one Table (highest-scored retrieved table from previous section)
+# Try the TableReader on one Table

-table_doc = document_store.get_document_by_id("List_of_tallest_twin_buildings_and_structures_in_the_world_1")
+table_doc = document_store.get_document_by_id("36964e90-3735-4ba1-8e6a-bec236e88bb2")
print(table_doc.content)
```


```python
from haystack.utils import print_answers

-prediction = reader.predict(query="How many twin buildings are under construction?", documents=[table_doc])
+prediction = reader.predict(query="Who played Gregory House in the series House?", documents=[table_doc])
print_answers(prediction, details="all")
```

The offsets in the `offsets_in_document` and `offsets_in_context` field indicate the table cells that the model predicts to be part of the answer. They need to be interpreted on the linearized table, i.e., a flat list containing all of the table cells.
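As an illustration (a hypothetical helper, not part of Haystack's API): for a table with a known number of columns, a flat offset on the linearized cell list maps back to a row and column index.

```python
# Hypothetical helper, not part of Haystack: map a flat offset on a
# linearized table (cells listed row by row) back to (row, column).
def offset_to_cell(offset: int, n_columns: int) -> tuple:
    return divmod(offset, n_columns)

# In a 3-column table, the cell at flat offset 4 is row 1, column 1.
print(offset_to_cell(4, n_columns=3))  # (1, 1)
```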

-In the `Answer`'s meta field, you can find the aggregation operator used to construct the answer (in this case `COUNT`) and the answer cells as strings.


```python
print(f"Predicted answer: {prediction['answers'][0].answer}")
@@ -243,34 +233,27 @@ table_qa_pipeline.add_node(component=reader, name="TableReader", inputs=["TableT


```python
-prediction = table_qa_pipeline.run("How many twin buildings are under construction?")
+prediction = table_qa_pipeline.run("When was Guilty Gear Xrd : Sign released?", params={"top_k": 30})
print_answers(prediction, details="minimum")
```

# Open-Domain QA on Text and Tables
With Haystack, you can do QA not just on text or tables separately; you can also use both text and tables together as your source of information.

-To demonstrate this, we add 1,000 sample text passages from the OTT-QA dataset.


```python
-# Add 1,000 text passages from OTT-QA to our document store.
+# Add 500 text passages to our document store.


-def read_ottqa_texts(filename):
+def read_texts(filename):
    processed_passages = []
    with open(filename) as passages:
        passages = json.load(passages)
-        for title, content in passages.items():
-            title = title[6:]
-            title = title.replace("_", " ")
-            document = Document(content=content, content_type="text", meta={"title": title})
+        for key, content in passages.items():
+            document = Document(content=content, content_type="text", id=key)
            processed_passages.append(document)

    return processed_passages


-passages = read_ottqa_texts(f"{doc_dir}/ottqa_texts_sample.json")
+passages = read_texts(f"{doc_dir}/texts.json")
document_store.write_documents(passages, index=document_index)
```

@@ -321,7 +304,7 @@ display.Image("pipeline.png")

```python
# Example query whose answer resides in a text passage
-predictions = text_table_qa_pipeline.run(query="Who is Aleksandar Trifunovic?")
+predictions = text_table_qa_pipeline.run(query="Who was Thomas Alva Edison?")
```


@@ -333,7 +316,7 @@ print_answers(predictions, details="minimum")

```python
# Example query whose answer resides in a table
-predictions = text_table_qa_pipeline.run(query="What is Cuba's national tree?")
+predictions = text_table_qa_pipeline.run(query="Which country does the film Macaroni come from?")
```


@@ -342,6 +325,78 @@ predictions = text_table_qa_pipeline.run(query="What is Cuba's national tree?")
print_answers(predictions, details="minimum")
```

## Evaluation
To evaluate our pipeline, we can use Haystack's evaluation feature. We just need to convert our labels into `MultiLabel` objects, and the `eval` method will do the rest.


```python
from haystack import Label, MultiLabel, Answer


def read_labels(filename, tables):
processed_labels = []
with open(filename) as labels:
labels = json.load(labels)
for table in tables:
if table.id not in labels:
continue
label = labels[table.id]
label = Label(
query=label["query"],
document=table,
is_correct_answer=True,
is_correct_document=True,
answer=Answer(answer=label["answer"]),
origin="gold-label",
)
processed_labels.append(MultiLabel(labels=[label]))
return processed_labels


table_labels = read_labels(f"{doc_dir}/labels.json", tables)
passage_labels = read_labels(f"{doc_dir}/labels.json", passages)
```


```python
eval_results = text_table_qa_pipeline.eval(table_labels + passage_labels, params={"top_k": 10})
```


```python
# Calculating and printing the evaluation metrics
print(eval_results.calculate_metrics())
```
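The returned object maps each pipeline node to its metrics. A sketch of how you might walk such a nested dict (the node and metric names below are assumptions for illustration, not the actual output of this pipeline):

```python
# Assumed shape only: the real node names and metrics depend on the pipeline
# (e.g. retriever recall, reader exact match / F1).
metrics = {
    "Retriever": {"recall_single_hit": 0.90},
    "Reader": {"exact_match": 0.70, "f1": 0.78},
}
for node, node_metrics in metrics.items():
    for name, value in node_metrics.items():
        print(f"{node}/{name}: {value:.2f}")
```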

## Adding tables from PDFs
It can sometimes be hard to provide your data in the form of a pandas DataFrame. For such cases, we provide the `ParsrConverter` wrapper, which can help you convert, for example, a PDF file into a document that you can index.


```python
!docker run -d -p 3001:3001 axarev/parsr
```


```python
!wget https://www.w3.org/WAI/WCAG21/working-examples/pdf-table/table.pdf
```


```python
from haystack.nodes import ParsrConverter

converter = ParsrConverter()

docs = converter.convert("table.pdf")

tables = [doc for doc in docs if doc["content_type"] == "table"]
```


```python
print(tables)
```

## About us

This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany.
2 changes: 1 addition & 1 deletion haystack/nodes/reader/table.py
@@ -370,7 +370,7 @@ def _aggregate_answers(agg_operator: str, answer_cells: List[str]) -> str:
else:
raise KeyError("unknown aggregator")

-return f"{answer_value}{' ' + unit if unit else ''}"
+return f"{answer_value}{' ' + unit if unit else ''}"

except KeyError as e:
if "unknown aggregator" in str(e):
2 changes: 1 addition & 1 deletion haystack/schema.py
@@ -225,7 +225,7 @@ def __repr__(self):

def __str__(self):
# In some cases, self.content is None (therefore not subscriptable)
-if not self.content:
+if self.content is None:
return f"<Document: id={self.id}, content=None>"
return f"<Document: id={self.id}, content='{self.content[:100]} {'...' if len(self.content) > 100 else ''}'>"
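A sketch of the likely motivation for this change (assumption: table Documents hold a pandas DataFrame as content, whose truth value is ambiguous, so `if not self.content:` would raise):

```python
import pandas as pd

# bool() on a DataFrame raises ValueError ("The truth value of a DataFrame
# is ambiguous"), so a truthiness check on Document content would fail for
# table Documents; an explicit None comparison avoids that.
df = pd.DataFrame({"a": [1]})
try:
    if not df:
        pass
except ValueError as e:
    print("truthiness check raised:", e)

print(df is None)  # False: the explicit check is safe for any content type
```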

2 changes: 1 addition & 1 deletion haystack/telemetry.py
@@ -202,7 +202,7 @@ def send_tutorial_event(url: str):
"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt12.zip": "12",
# Tutorial 13: no dataset available yet
"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt14.zip": "14",
-"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/ottqa_sample.zip": "15",
+"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/table_text_dataset.zip": "15",
# "https://nlp.stanford.edu/data/glove.6B.zip": "16",
"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial16.zip": "16",
}