deepset-ai · tholor · Sep 16, 2020 · Sep 16, 2020 · Sep 16, 2020 · Sep 16, 2020
diff --git a/.gitignore b/.gitignore
@@ -138,7 +138,7 @@ dmypy.json
 .idea
 
 # haystack files
-haystack/database/qa.db
+haystack/document_store/qa.db
 data
 mlruns
 src

diff --git a/Dockerfile b/Dockerfile
@@ -17,7 +17,7 @@ COPY README.rst models* /home/user/models/
 # optional : copy sqlite db if needed for testing
 #COPY qa.db /home/user/
 
-# optional: copy data directory containing docs for indexing
+# optional: copy data directory containing docs for ingestion
 #COPY data /home/user/data
 
 EXPOSE 8000

diff --git a/haystack/__init__.py b/haystack/__init__.py
@@ -1,6 +1,7 @@
 import logging
 
 import pandas as pd
+from haystack.schema import Document, Label, MultiLabel
 from haystack.finder import Finder
 
 pd.options.display.max_colwidth = 80

diff --git a/haystack/database/__init__.py → haystack/document_store/__init__.py b/haystack/database/__init__.py → haystack/document_store/__init__.py
diff --git a/haystack/document_store/base.py b/haystack/document_store/base.py
@@ -0,0 +1,125 @@
+import logging
+from abc import abstractmethod, ABC
+from typing import Any, Optional, Dict, List, Union
+from haystack import Document, Label, MultiLabel
+
+logger = logging.getLogger(__name__)
+
+
+class BaseDocumentStore(ABC):
+    """
+    Base class for implementing Document Stores.
+    """
+    index: Optional[str]
+    label_index: Optional[str]
+
+    @abstractmethod
+    def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
+        """
+        Indexes documents for later queries.
+
+        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
+                          For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
+                          Optionally: Include meta data via {"text": "<the-actual-text>",
+                          "meta":{"name": "<some-document-name>", "author": "somebody", ...}}
+                          It can be used for filtering and is accessible in the responses of the Finder.
+        :param index: Optional name of index where the documents shall be written to.
+                      If None, the DocumentStore's default index (self.index) will be used.
+
+        :return: None
+        """
+        pass
+
+    @abstractmethod
+    def get_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Document]:
+        pass
+
+    @abstractmethod
+    def get_all_labels(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Label]:
+        pass
+
+    def get_all_labels_aggregated(self,
+                                  index: Optional[str] = None,
+                                  filters: Optional[Dict[str, List[str]]] = None) -> List[MultiLabel]:
+        aggregated_labels = []
+        all_labels = self.get_all_labels(index=index, filters=filters)
+
+        # Collect all answers to a question in a dict
+        question_ans_dict = {} # type: ignore
+        for l in all_labels:
+            # only aggregate labels with correct answers, as only those can be currently used in evaluation
+            if not l.is_correct_answer:
+                continue
+
+            if l.question in question_ans_dict:
+                question_ans_dict[l.question].append(l)
+            else:
+                question_ans_dict[l.question] = [l]
+
+        # Aggregate labels
+        for q, ls in question_ans_dict.items():
+            ls = list(set(ls))  # get rid of exact duplicates
+            # check if there are both text answer and "no answer" present
+            t_present = False
+            no_present = False
+            no_idx = []
+            for idx, l in enumerate(ls):
+                if len(l.answer) == 0:
+                    no_present = True
+                    no_idx.append(idx)
+                else:
+                    t_present = True
+            # if both text and no answer are present, remove no answer labels
+            if t_present and no_present:
+                logger.warning(
+                    f"Both text label and 'no answer possible' label is present for question: {ls[0].question}")
+                for remove_idx in no_idx[::-1]:
+                    ls.pop(remove_idx)
+
+            # construct Aggregated_label
+            for i, l in enumerate(ls):
+                if i == 0:
+                    agg_label = MultiLabel(question=l.question,
+                                           multiple_answers=[l.answer],
+                                           is_correct_answer=l.is_correct_answer,
+                                           is_correct_document=l.is_correct_document,
+                                           origin=l.origin,
+                                           multiple_document_ids=[l.document_id],
+                                           multiple_offset_start_in_docs=[l.offset_start_in_doc],
+                                           no_answer=l.no_answer,
+                                           model_id=l.model_id,
+                                           )
+                else:
+                    agg_label.multiple_answers.append(l.answer)
+                    agg_label.multiple_document_ids.append(l.document_id)
+                    agg_label.multiple_offset_start_in_docs.append(l.offset_start_in_doc)
+            aggregated_labels.append(agg_label)
+        return aggregated_labels
+
+    @abstractmethod
+    def get_document_by_id(self, id: str, index: Optional[str] = None) -> Optional[Document]:
+        pass
+
+    @abstractmethod
+    def get_document_count(self, index: Optional[str] = None) -> int:
+        pass
+
+    @abstractmethod
+    def query_by_embedding(self,
+                           query_emb: List[float],
+                           filters: Optional[Optional[Dict[str, List[str]]]] = None,
+                           top_k: int = 10,
+                           index: Optional[str] = None) -> List[Document]:
+        pass
+
+    @abstractmethod
+    def get_label_count(self, index: Optional[str] = None) -> int:
+        pass
+
+    @abstractmethod
+    def add_eval_data(self, filename: str, doc_index: str = "document", label_index: str = "label"):
+        pass
+
+    def delete_all_documents(self, index: str):
+        pass
+
diff --git a/haystack/database/elasticsearch.py → haystack/document_store/elasticsearch.py b/haystack/database/elasticsearch.py → haystack/document_store/elasticsearch.py
@@ -7,8 +7,9 @@
 from elasticsearch.helpers import bulk, scan
 import numpy as np
 
-from haystack.database.base import BaseDocumentStore, Document, Label
-from haystack.indexing.utils import eval_data_from_file
+from haystack.document_store.base import BaseDocumentStore
+from haystack import Document, Label
+from haystack.preprocessor.utils import eval_data_from_file
 from haystack.retriever.base import BaseRetriever
 
 logger = logging.getLogger(__name__)
@@ -70,7 +71,7 @@ def __init__(
         :param refresh_type: Type of ES refresh used to control when changes made by a request (e.g. bulk) are made visible to search.
                              Values:
                              - 'wait_for' => continue only after changes are visible (slow, but safe)
-                             - 'false' => continue directly (fast, but sometimes unintuitive behaviour when docs are not immediately available after indexing)
+                             - 'false' => continue directly (fast, but sometimes unintuitive behaviour when docs are not immediately available after ingestion)
                              More info at https://www.elastic.co/guide/en/elasticsearch/reference/6.8/docs-refresh.html
         """
         self.client = Elasticsearch(hosts=[{"host": host, "port": port}], http_auth=(username, password),
@@ -470,7 +471,7 @@ def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = Non
 
         if embeddings[0].shape[0] != self.embedding_dim:
             raise RuntimeError(f"Embedding dim. of model ({embeddings[0].shape[0]})"
-                               f" doesn't match embedding dim. in documentstore ({self.embedding_dim})."
+                               f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim})."
                                "Specify the arg `embedding_dim` when initializing ElasticsearchDocumentStore()")
         doc_updates = []
         for doc, emb in zip(docs, embeddings):

diff --git a/haystack/database/faiss.py → haystack/document_store/faiss.py b/haystack/database/faiss.py → haystack/document_store/faiss.py
@@ -6,8 +6,8 @@
 import numpy as np
 from faiss.swigfaiss import IndexHNSWFlat
 
-from haystack.database.base import Document
-from haystack.database.sql import SQLDocumentStore
+from haystack import Document
+from haystack.document_store.sql import SQLDocumentStore
 from haystack.retriever.base import BaseRetriever
 
 logger = logging.getLogger(__name__)
@@ -35,7 +35,7 @@ def __init__(
         """
         :param sql_url: SQL connection URL for database. It defaults to local file based SQLite DB. For large scale
                         deployment, Postgres is recommended.
-        :param index_buffer_size: When working with large dataset, the indexing process(FAISS + SQL) can be buffered in
+        :param index_buffer_size: When working with large datasets, the ingestion process(FAISS + SQL) can be buffered in
                                   smaller chunks to reduce memory footprint.
         :param vector_size: the embedding vector size.
         :param faiss_index: load an existing FAISS Index.

diff --git a/haystack/database/memory.py → haystack/document_store/memory.py b/haystack/database/memory.py → haystack/document_store/memory.py
@@ -2,8 +2,9 @@
 from uuid import uuid4
 from collections import defaultdict
 
-from haystack.database.base import BaseDocumentStore, Document, Label
-from haystack.indexing.utils import eval_data_from_file
+from haystack.document_store.base import BaseDocumentStore
+from haystack import Document, Label
+from haystack.preprocessor.utils import eval_data_from_file
 from haystack.retriever.base import BaseRetriever
 
 import logging
@@ -114,7 +115,7 @@ def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = Non
 
         if embeddings[0].shape[0] != self.embedding_dim:
             raise RuntimeError(f"Embedding dim. of model ({embeddings[0].shape[0]})"
-                               f" doesn't match embedding dim. in documentstore ({self.embedding_dim})."
+                               f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim})."
                                "Specify the arg `embedding_dim` when initializing InMemoryDocumentStore()")
 
         for doc, emb in zip(docs, embeddings):

diff --git a/haystack/database/sql.py → haystack/document_store/sql.py b/haystack/database/sql.py → haystack/document_store/sql.py
@@ -5,8 +5,9 @@
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import relationship, sessionmaker
 
-from haystack.database.base import BaseDocumentStore, Document, Label
-from haystack.indexing.utils import eval_data_from_file
+from haystack.document_store.base import BaseDocumentStore
+from haystack import Document, Label
+from haystack.preprocessor.utils import eval_data_from_file
 
 Base = declarative_base()  # type: Any
 

diff --git a/haystack/eval.py b/haystack/eval.py
@@ -1,6 +1,6 @@
 from typing import List, Tuple, Dict, Any
 
-from haystack.database.base import MultiLabel
+from haystack import MultiLabel
 
 
 def calculate_reader_metrics(metric_counts: Dict[str, float], correct_retrievals: int):

diff --git a/haystack/indexing/__init__.py → haystack/file_converter/__init__.py b/haystack/indexing/__init__.py → haystack/file_converter/__init__.py
diff --git a/haystack/indexing/file_converters/base.py → haystack/file_converter/base.py b/haystack/indexing/file_converters/base.py → haystack/file_converter/base.py
@@ -9,7 +9,7 @@
 
 class BaseConverter:
     """
-    Base class for implementing file converts to transform input documents to text format for indexing in database.
+    Base class for implementing file converters to transform input documents to text format for ingestion in DocumentStore.
     """
 
     def __init__(

diff --git a/haystack/indexing/file_converters/docx.py → haystack/file_converter/docx.py b/haystack/indexing/file_converters/docx.py → haystack/file_converter/docx.py
@@ -1,4 +1,4 @@
-from haystack.indexing.file_converters.base import BaseConverter
+from haystack.file_converter.base import BaseConverter
 import logging
 from pathlib import Path
 from typing import List, Dict, Optional, Any, Tuple

diff --git a/haystack/indexing/file_converters/pdf.py → haystack/file_converter/pdf.py b/haystack/indexing/file_converters/pdf.py → haystack/file_converter/pdf.py
@@ -4,7 +4,7 @@
 from pathlib import Path
 from typing import List, Optional, Dict, Tuple, Any
 
-from haystack.indexing.file_converters.base import BaseConverter
+from haystack.file_converter.base import BaseConverter
 
 logger = logging.getLogger(__name__)
 

diff --git a/haystack/indexing/file_converters/tika.py → haystack/file_converter/tika.py b/haystack/indexing/file_converters/tika.py → haystack/file_converter/tika.py
@@ -7,7 +7,7 @@
 import requests
 from tika import parser as tikaparser
 
-from haystack.indexing.file_converters.base import BaseConverter
+from haystack.file_converter.base import BaseConverter
 
 logger = logging.getLogger(__name__)
 

diff --git a/haystack/indexing/file_converters/txt.py → haystack/file_converter/txt.py b/haystack/indexing/file_converters/txt.py → haystack/file_converter/txt.py
@@ -3,7 +3,7 @@
 from pathlib import Path
 from typing import List, Optional, Tuple, Any, Dict
 
-from haystack.indexing.file_converters.base import BaseConverter
+from haystack.file_converter.base import BaseConverter
 
 logger = logging.getLogger(__name__)
 

diff --git a/haystack/finder.py b/haystack/finder.py
@@ -9,7 +9,7 @@
 
 from haystack.reader.base import BaseReader
 from haystack.retriever.base import BaseRetriever
-from haystack.database.base import MultiLabel, Document
+from haystack import MultiLabel, Document
 from haystack.eval import calculate_average_precision, eval_counts_reader_batch, calculate_reader_metrics, \
     eval_counts_reader
 

diff --git a/...tack/indexing/file_converters/__init__.py → haystack/preprocessor/__init__.py b/...tack/indexing/file_converters/__init__.py → haystack/preprocessor/__init__.py
diff --git a/haystack/indexing/cleaning.py → haystack/preprocessor/cleaning.py b/haystack/indexing/cleaning.py → haystack/preprocessor/cleaning.py
diff --git a/haystack/indexing/utils.py → haystack/preprocessor/utils.py b/haystack/indexing/utils.py → haystack/preprocessor/utils.py
@@ -9,9 +9,9 @@
 
 from farm.data_handler.utils import http_get
 
-from haystack.indexing.file_converters.pdf import PDFToTextConverter
-from haystack.indexing.file_converters.tika import TikaConverter
-from haystack.database.base import Document, Label
+from haystack.file_converter.pdf import PDFToTextConverter
+from haystack.file_converter.tika import TikaConverter
+from haystack import Document, Label
 
 logger = logging.getLogger(__name__)
 
@@ -78,7 +78,7 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None,
     Convert all files(.txt, .pdf) in the sub-directories of the given path to Python dicts that can be written to a
     Document Store.
 
-    :param dir_path: path for the documents to be written to the database
+    :param dir_path: path for the documents to be written to the DocumentStore
     :param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
     :param split_paragraphs: split text in paragraphs.
 
@@ -127,7 +127,7 @@ def tika_convert_files_to_dicts(
     Convert all files(.txt, .pdf) in the sub-directories of the given path to Python dicts that can be written to a
     Document Store.
 
-    :param dir_path: path for the documents to be written to the database
+    :param dir_path: path for the documents to be written to the DocumentStore
     :param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
     :param split_paragraphs: split text in paragraphs.
 

diff --git a/haystack/reader/base.py b/haystack/reader/base.py
@@ -3,7 +3,7 @@
 from abc import ABC, abstractmethod
 from typing import List, Optional, Sequence
 
-from haystack.database.base import Document
+from haystack import Document
 
 
 class BaseReader(ABC):

diff --git a/haystack/reader/farm.py b/haystack/reader/farm.py
@@ -19,8 +19,10 @@
 from scipy.special import expit
 import shutil
 
-from haystack.database.base import Document, BaseDocumentStore
+from haystack import Document
+from haystack.document_store.base import BaseDocumentStore
 from haystack.reader.base import BaseReader
+
 logger = logging.getLogger(__name__)
 
 

diff --git a/haystack/reader/transformers.py b/haystack/reader/transformers.py
@@ -2,7 +2,7 @@
 
 from transformers import pipeline
 
-from haystack.database.base import Document
+from haystack import Document
 from haystack.reader.base import BaseReader
 
 

diff --git a/haystack/retriever/base.py b/haystack/retriever/base.py
@@ -2,8 +2,8 @@
 from typing import List
 import logging
 
-from haystack.database.base import Document
-from haystack.database.base import BaseDocumentStore
+from haystack import Document
+from haystack.document_store.base import BaseDocumentStore
 
 logger = logging.getLogger(__name__)
 

diff --git a/haystack/retriever/dense.py b/haystack/retriever/dense.py
@@ -6,7 +6,9 @@
 
 from farm.infer import Inferencer
 
-from haystack.database.base import Document, BaseDocumentStore
+from haystack.document_store.base import BaseDocumentStore
+from haystack import Document
+from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
 from haystack.retriever.base import BaseRetriever
 from haystack.retriever.sparse import logger
 

diff --git a/haystack/retriever/dpr_utils.py b/haystack/retriever/dpr_utils.py
@@ -48,7 +48,7 @@
 
 class ModelOutput:
     """
-    Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like
+    Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like
     a tuple) or strings (like a dictionnary) that will ignore the ``None`` attributes.
     """
 

diff --git a/haystack/retriever/sparse.py b/haystack/retriever/sparse.py
@@ -5,8 +5,9 @@
 import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer
 
-from haystack.database.base import Document, BaseDocumentStore
-from haystack.database.elasticsearch import ElasticsearchDocumentStore
+from haystack.document_store.base import BaseDocumentStore
+from haystack import Document
+from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
 from haystack.retriever.base import BaseRetriever
 from collections import namedtuple
-Original file line number
+Diff line change
@@ Expand Up / @@ -138,7 +138,7 @@ dmypy.json @@
     .idea
     # haystack files
-    haystack/database/qa.db
+    haystack/document_store/qa.db
     data
     mlruns
     src
@@ Expand Down @@