Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Add DPR Training #273

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
e258b72
WIP add eval for DPR
tholor Jul 17, 2020
03c991f
increase timeout for ES retrieval. add describe_documents()
tholor Jul 20, 2020
9dae1e6
Update tutorials
tholor Jul 20, 2020
eff84ff
Add comment in tutorial
tholor Jul 20, 2020
1b0175b
add basic tests for eval
tholor Jul 21, 2020
15f5bc4
WIP refactor add_eval_data
tholor Jul 21, 2020
fdc4720
refactor retriever.eval() to use Label objects and allow open domain …
tholor Jul 22, 2020
0d7944c
refactor Document class. Introducing UUID
tholor Jul 22, 2020
585d45b
refactor FARMReader.eval()
tholor Jul 22, 2020
2e2113c
fix ES and inmemory tests
tholor Jul 22, 2020
58a00f7
Refactor SQL store to comply with the changes
tanaysoni Jul 23, 2020
7ebbe4c
Refactor InMemory Document Store to comply with the changes
tanaysoni Jul 23, 2020
83964e9
Update test fixtures
tanaysoni Jul 23, 2020
c489f72
Fix breaking tests for reader
tanaysoni Jul 23, 2020
1a33498
Fix SQL query
tanaysoni Jul 23, 2020
74a4f2e
Make eval compatible with SQL and InMemory Document Stores
tanaysoni Jul 27, 2020
55245bd
Add default index value
tanaysoni Jul 27, 2020
c587be7
dpr json to squad json conversion
kolk Jul 29, 2020
0f1fc63
changed filename and arguments, added checks in find_answer_start
kolk Jul 30, 2020
6608b93
removed testing lines
kolk Jul 30, 2020
9a3d1e2
tutorial 7: DPR training data preprocessing, find_answer_start bug fi…
kolk Aug 4, 2020
28106e4
Bug fix for Tutorial7_Create_preprocessed_data_for_Dense_Retrieval_tr…
kolk Aug 4, 2020
541c414
HFBertEncoder replaced with transformer DPR encoders
kolk Aug 11, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ Elasticsearch (Recommended)

You can get started by running a single Elasticsearch node using docker::

docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.1
docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2

Or if docker is not possible for you::

Expand Down
145 changes: 126 additions & 19 deletions haystack/database/base.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,110 @@
from abc import abstractmethod, ABC
from typing import Any, Optional, Dict, List
from typing import Any, Optional, Dict, List, Union
from uuid import UUID, uuid4

from pydantic import BaseModel, Field

class Document:
    def __init__(self, text: str,
                 id: Optional[Union[str, UUID]] = None,
                 query_score: Optional[float] = None,
                 question: Optional[str] = None,
                 meta: Optional[Dict[str, Any]] = None,
                 tags: Optional[Dict[str, Any]] = None,
                 embedding: Optional[List[float]] = None):
        """
        Object used to represent documents / passages in a standardized way within Haystack.
        For example, this is what the retriever will return from the DocumentStore,
        regardless if it's ElasticsearchDocumentStore or InMemoryDocumentStore.

        Note that there can be multiple Documents originating from one file (e.g. PDF),
        if you split the text into smaller passages. We'll have one Document per passage in this case.

        :param text: Text of the document
        :param id: ID used within the DocumentStore. Accepts a UUID or a UUID hex string;
                   if omitted (or falsy), a fresh UUID4 is generated.
        :param query_score: Retriever's query score for a retrieved document
        :param question: Question text for FAQs.
        :param meta: Meta fields for a document like name, url, or author.
        :param tags: Tags that allow filtering of the data
        :param embedding: Vector encoding of the text
        :raises ValueError: If `id` is neither a str, a UUID, nor falsy.
        """
        self.text = text
        # Create a unique ID (either a new one, or one from user input).
        # The original `if`/`if` chain silently left `self.id` unset for
        # unsupported types; fail loudly instead.
        if not id:
            self.id = uuid4()
        elif isinstance(id, UUID):
            self.id = id
        elif isinstance(id, str):
            # version=4 normalizes the version/variant bits of the parsed hex string
            self.id = UUID(hex=id, version=4)
        else:
            raise ValueError(f"id must be a str or UUID, got {type(id).__name__}")

        self.query_score = query_score
        self.question = question
        self.meta = meta
        self.tags = tags  # deprecate?
        self.embedding = embedding

    def to_dict(self):
        # NOTE: returns the live attribute dict, not a copy — caller-side
        # mutations will affect this Document.
        return self.__dict__

    @classmethod
    def from_dict(cls, dict):
        """
        Create a Document from a plain dict. Keys that are not constructor
        arguments are moved into ``meta`` so no information is lost.
        """
        _doc = dict.copy()
        init_args = ["text", "id", "query_score", "question", "meta", "tags", "embedding"]
        # Copy `meta` as well: `dict.copy()` is shallow, so writing extra keys
        # into the original meta dict would mutate the caller's data.
        _doc["meta"] = {} if _doc.get("meta") is None else {**_doc["meta"]}
        # copy additional fields into "meta"
        for k, v in _doc.items():
            if k not in init_args:
                _doc["meta"][k] = v
        # remove additional fields from top level
        _doc = {k: v for k, v in _doc.items() if k in init_args}

        return cls(**_doc)


class Label:
    def __init__(self, question: str,
                 answer: str,
                 positive_sample: bool,
                 origin: str,
                 document_id: Optional[Union[str, UUID]] = None,
                 offset_start_in_doc: Optional[int] = None,
                 no_answer: Optional[bool] = None,
                 model_id: Optional[int] = None):
        """
        Object used to represent a question/answer annotation within Haystack.

        :param question: The question text this label refers to.
        :param answer: The answer string for the question.
        :param positive_sample: Whether this is a positive (True) or negative (False)
                                sample (presumably for retriever training — TODO confirm).
        :param origin: Source of the label (e.g. gold annotation vs. user feedback —
                       TODO confirm the expected values).
        :param document_id: ID of the document containing the answer. Accepts a UUID
                            or a UUID hex string, which is parsed into a UUID.
        :param offset_start_in_doc: Character offset where the answer starts in the document text.
        :param no_answer: Whether the question has no answer in the document.
        :param model_id: ID of the model associated with this label (TODO confirm semantics).
        :raises ValueError: If `document_id` is neither a str, a UUID, nor falsy.
        """
        self.no_answer = no_answer
        self.origin = origin
        self.question = question
        self.positive_sample = positive_sample
        # The original `if`/`if` chain silently left `self.document_id` unset
        # for unsupported types; fail loudly instead.
        if not document_id:
            self.document_id: Optional[UUID] = document_id
        elif isinstance(document_id, UUID):
            self.document_id = document_id
        elif isinstance(document_id, str):
            # version=4 normalizes the version/variant bits of the parsed hex string
            self.document_id = UUID(hex=document_id, version=4)
        else:
            raise ValueError(f"document_id must be a str or UUID, got {type(document_id).__name__}")
        self.answer = answer
        self.offset_start_in_doc = offset_start_in_doc
        self.model_id = model_id

class Document(BaseModel):
    # NOTE(review): this pydantic schema appears to be the pre-refactor version of
    # `Document` shown alongside the new plain-Python class in this diff view.
    # Two classes named `Document` in one module would shadow each other at
    # import time — confirm this one is meant to be deleted.
    id: str = Field(..., description="_id field from Elasticsearch")
    text: str = Field(..., description="Text of the document")
    external_source_id: Optional[str] = Field(
        None,
        description="id for the source file the document was created from. In the case when a large file is divided "
        "across multiple Elasticsearch documents, this id can be used to reference original source file.",
    )
    question: Optional[str] = Field(None, description="Question text for FAQs.")
    query_score: Optional[float] = Field(None, description="Elasticsearch query score for a retrieved document")
    meta: Dict[str, Any] = Field({}, description="Meta fields for a document like name, url, or author.")
    tags: Optional[Dict[str, Any]] = Field(None, description="Tags that allow filtering of the data")

    @classmethod
    def from_dict(cls, dict):
        # Build a Document directly from a dict of field values
        # (note: parameter name shadows the builtin `dict`).
        return cls(**dict)

    def to_dict(self):
        # Returns the instance's attribute dict — presumably the pydantic field
        # values stored in __dict__ (pydantic v1 behavior; confirm version).
        return self.__dict__


class BaseDocumentStore(ABC):
    """
    Abstract base class defining the interface every Haystack DocumentStore
    (e.g. Elasticsearch, SQL, InMemory) must implement.
    """
    # NOTE(review): reconstructed from a collapsed diff view ("Expand All"
    # regions hid some lines) — confirm against the full file that no members
    # were dropped. Where the diff showed both old and new signatures, the new
    # (post-refactor, index-aware) signature was kept.

    # Default index that operations fall back to when no `index` argument is given.
    index: Optional[str]

    @abstractmethod
    def write_documents(self, documents: List[dict], index: Optional[str] = None):
        """
        Indexes documents for later queries.

        :param documents: List of dictionaries in the format {"text": "<the-actual-text>"}.
                          Optionally: Include meta data via {"text": "<the-actual-text>",
                          "meta": {"name": "<some-document-name>, "author": "somebody", ...}}
                          It can be used for filtering and is accessible in the responses of the Finder.
        :param index: Optional name of index where the documents shall be written to.
                      If None, the DocumentStore's default index (self.index) will be used.

        :return: None
        """
        pass

    @abstractmethod
    def get_all_documents(self, index: Optional[str] = None) -> List[Document]:
        """Return all documents stored in the given index (default: self.index)."""
        pass

    @abstractmethod
    def get_all_labels(self, index: str = "feedback", filters: Optional[dict] = None) -> List[Label]:
        """Return all labels from the given index, optionally narrowed by `filters`."""
        pass

    @abstractmethod
    def get_document_by_id(self, id: UUID, index: Optional[str] = None) -> Optional[Document]:
        """Look up a single document by its UUID; None if not found."""
        pass

    @abstractmethod
    def get_document_ids_by_tags(self, tag, index) -> List[str]:
        """Return the IDs of all documents matching the given tag."""
        pass

    @abstractmethod
    def get_document_count(self, index: Optional[str] = None) -> int:
        """Return the number of documents in the given index (default: self.index)."""
        pass

    @abstractmethod
    def query_by_embedding(self,
                           query_emb: List[float],  # TODO confirm — the leading parameters were hidden in the diff view
                           filters: Optional[dict] = None,
                           top_k: int = 10,
                           index: Optional[str] = None) -> List[Document]:
        """Retrieve the top_k documents most similar to the given query embedding."""
        pass

    @abstractmethod
    def get_label_count(self, index: Optional[str] = None) -> int:
        """Return the number of labels in the given index."""
        pass

    @abstractmethod
    def add_eval_data(self, filename: str, doc_index: str = "document", label_index: str = "feedback"):
        """Load evaluation data from `filename` into the given document and label indices."""
        pass

    def delete_all_documents(self, index: str):
        # NOTE(review): a silent no-op default, and — unlike the other
        # operations — not marked @abstractmethod; confirm subclasses are
        # expected to override it.
        pass
Loading