Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Add DPR Training #273

Closed
wants to merge 23 commits into from
Closed
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
e258b72
WIP add eval for DPR
tholor Jul 17, 2020
03c991f
increase timeout for ES retrieval. add describe_documents()
tholor Jul 20, 2020
9dae1e6
Update tutorials
tholor Jul 20, 2020
eff84ff
Add comment in tutorial
tholor Jul 20, 2020
1b0175b
add basic tests for eval
tholor Jul 21, 2020
15f5bc4
WIP refactor add_eval_data
tholor Jul 21, 2020
fdc4720
refactor retriever.eval() to use Label objects and allow open domain …
tholor Jul 22, 2020
0d7944c
refactor Document class. Introducing UUID
tholor Jul 22, 2020
585d45b
refactor FARMReader.eval()
tholor Jul 22, 2020
2e2113c
fix ES and inmemory tests
tholor Jul 22, 2020
58a00f7
Refactor SQL store to comply with the changes
tanaysoni Jul 23, 2020
7ebbe4c
Refactor InMemory Document Store to comply with the changes
tanaysoni Jul 23, 2020
83964e9
Update test fixtures
tanaysoni Jul 23, 2020
c489f72
Fix breaking tests for reader
tanaysoni Jul 23, 2020
1a33498
Fix SQL query
tanaysoni Jul 23, 2020
74a4f2e
Make eval compatible with SQL and InMemory Document Stores
tanaysoni Jul 27, 2020
55245bd
Add default index value
tanaysoni Jul 27, 2020
c587be7
dpr json to squad json conversion
kolk Jul 29, 2020
0f1fc63
changed filename and arguments, added checks in find_answer_start
kolk Jul 30, 2020
6608b93
removed testing lines
kolk Jul 30, 2020
9a3d1e2
tutorial 7: DPR training data preprocessing, find_answer_start bug fi…
kolk Aug 4, 2020
28106e4
Bug fix for Tutorial7_Create_preprocessed_data_for_Dense_Retrieval_tr…
kolk Aug 4, 2020
541c414
HFBertEncoder replaced with transformer DPR encoders
kolk Aug 11, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ Elasticsearch (Recommended)

You can get started by running a single Elasticsearch node using docker::

docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.1
docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:7.6.2

Or if docker is not possible for you::

Expand Down
145 changes: 126 additions & 19 deletions haystack/database/base.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,110 @@
from abc import abstractmethod, ABC
from typing import Any, Optional, Dict, List
from typing import Any, Optional, Dict, List, Union
from uuid import UUID, uuid4

from pydantic import BaseModel, Field

class Document:
    def __init__(self, text: str,
                 id: Optional[Union[str, UUID]] = None,
                 query_score: Optional[float] = None,
                 question: Optional[str] = None,
                 meta: Optional[Dict[str, Any]] = None,
                 tags: Optional[Dict[str, Any]] = None,
                 embedding: Optional[List[float]] = None):
        """
        Object used to represent documents / passages in a standardized way within Haystack.
        For example, this is what the retriever will return from the DocumentStore,
        regardless if it's ElasticsearchDocumentStore or InMemoryDocumentStore.

        Note that there can be multiple Documents originating from one file (e.g. PDF),
        if you split the text into smaller passages. We'll have one Document per passage in this case.

        :param text: Text of the document
        :param id: ID used within the DocumentStore. Accepts a UUID or a UUID hex string;
                   if omitted (or falsy), a fresh UUID4 is generated.
        :param query_score: Retriever's query score for a retrieved document
        :param question: Question text for FAQs.
        :param meta: Meta fields for a document like name, url, or author.
        :param tags: Tags that allow filtering of the data
        :param embedding: Vector encoding of the text
        :raises ValueError: If `id` is neither a str, a UUID, nor falsy.
        """
        self.text = text
        # Create a unique ID (either a new one, or one from user input).
        # The original `if`/`if` chain silently left `self.id` unset for
        # unsupported types; fail loudly instead.
        if not id:
            self.id = uuid4()
        elif isinstance(id, UUID):
            self.id = id
        elif isinstance(id, str):
            # version=4 normalizes the version/variant bits of the parsed hex string
            self.id = UUID(hex=id, version=4)
        else:
            raise ValueError(f"id must be a str or UUID, got {type(id).__name__}")

        self.query_score = query_score
        self.question = question
        self.meta = meta
        self.tags = tags  # deprecate?
        self.embedding = embedding

    def to_dict(self):
        # NOTE: returns the live attribute dict, not a copy — caller-side
        # mutations will affect this Document.
        return self.__dict__

    @classmethod
    def from_dict(cls, dict):
        """
        Create a Document from a plain dict. Keys that are not constructor
        arguments are moved into ``meta`` so no information is lost.
        """
        _doc = dict.copy()
        init_args = ["text", "id", "query_score", "question", "meta", "tags", "embedding"]
        # Copy `meta` as well: `dict.copy()` is shallow, so writing extra keys
        # into the original meta dict would mutate the caller's data.
        _doc["meta"] = {} if _doc.get("meta") is None else {**_doc["meta"]}
        # copy additional fields into "meta"
        for k, v in _doc.items():
            if k not in init_args:
                _doc["meta"][k] = v
        # remove additional fields from top level
        _doc = {k: v for k, v in _doc.items() if k in init_args}

        return cls(**_doc)


class Label:
    def __init__(self, question: str,
                 answer: str,
                 positive_sample: bool,
                 origin: str,
                 document_id: Optional[Union[str, UUID]] = None,
                 offset_start_in_doc: Optional[int] = None,
                 no_answer: Optional[bool] = None,
                 model_id: Optional[int] = None):
        """
        Object used to represent a question/answer annotation within Haystack.

        :param question: The question text this label refers to.
        :param answer: The answer string for the question.
        :param positive_sample: Whether this is a positive (True) or negative (False)
                                sample (presumably for retriever training — TODO confirm).
        :param origin: Source of the label (e.g. gold annotation vs. user feedback —
                       TODO confirm the expected values).
        :param document_id: ID of the document containing the answer. Accepts a UUID
                            or a UUID hex string, which is parsed into a UUID.
        :param offset_start_in_doc: Character offset where the answer starts in the document text.
        :param no_answer: Whether the question has no answer in the document.
        :param model_id: ID of the model associated with this label (TODO confirm semantics).
        :raises ValueError: If `document_id` is neither a str, a UUID, nor falsy.
        """
        self.no_answer = no_answer
        self.origin = origin
        self.question = question
        self.positive_sample = positive_sample
        # The original `if`/`if` chain silently left `self.document_id` unset
        # for unsupported types; fail loudly instead.
        if not document_id:
            self.document_id: Optional[UUID] = document_id
        elif isinstance(document_id, UUID):
            self.document_id = document_id
        elif isinstance(document_id, str):
            # version=4 normalizes the version/variant bits of the parsed hex string
            self.document_id = UUID(hex=document_id, version=4)
        else:
            raise ValueError(f"document_id must be a str or UUID, got {type(document_id).__name__}")
        self.answer = answer
        self.offset_start_in_doc = offset_start_in_doc
        self.model_id = model_id

class Document(BaseModel):
    # NOTE(review): this pydantic schema appears to be the pre-refactor version of
    # `Document` shown alongside the new plain-Python class in this diff view.
    # Two classes named `Document` in one module would shadow each other at
    # import time — confirm this one is meant to be deleted.
    id: str = Field(..., description="_id field from Elasticsearch")
    text: str = Field(..., description="Text of the document")
    external_source_id: Optional[str] = Field(
        None,
        description="id for the source file the document was created from. In the case when a large file is divided "
        "across multiple Elasticsearch documents, this id can be used to reference original source file.",
    )
    question: Optional[str] = Field(None, description="Question text for FAQs.")
    query_score: Optional[float] = Field(None, description="Elasticsearch query score for a retrieved document")
    meta: Dict[str, Any] = Field({}, description="Meta fields for a document like name, url, or author.")
    tags: Optional[Dict[str, Any]] = Field(None, description="Tags that allow filtering of the data")

    @classmethod
    def from_dict(cls, dict):
        # Build a Document directly from a dict of field values
        # (note: parameter name shadows the builtin `dict`).
        return cls(**dict)

    def to_dict(self):
        # Returns the instance's attribute dict — presumably the pydantic field
        # values stored in __dict__ (pydantic v1 behavior; confirm version).
        return self.__dict__


class BaseDocumentStore(ABC):
    """
    Abstract base class defining the interface every Haystack DocumentStore
    (e.g. Elasticsearch, SQL, InMemory) must implement.
    """
    # NOTE(review): reconstructed from a collapsed diff view ("Expand All"
    # regions hid some lines) — confirm against the full file that no members
    # were dropped. Where the diff showed both old and new signatures, the new
    # (post-refactor, index-aware) signature was kept.

    # Default index that operations fall back to when no `index` argument is given.
    index: Optional[str]

    @abstractmethod
    def write_documents(self, documents: List[dict], index: Optional[str] = None):
        """
        Indexes documents for later queries.

        :param documents: List of dictionaries in the format {"text": "<the-actual-text>"}.
                          Optionally: Include meta data via {"text": "<the-actual-text>",
                          "meta": {"name": "<some-document-name>, "author": "somebody", ...}}
                          It can be used for filtering and is accessible in the responses of the Finder.
        :param index: Optional name of index where the documents shall be written to.
                      If None, the DocumentStore's default index (self.index) will be used.

        :return: None
        """
        pass

    @abstractmethod
    def get_all_documents(self, index: Optional[str] = None) -> List[Document]:
        """Return all documents stored in the given index (default: self.index)."""
        pass

    @abstractmethod
    def get_all_labels(self, index: str = "feedback", filters: Optional[dict] = None) -> List[Label]:
        """Return all labels from the given index, optionally narrowed by `filters`."""
        pass

    @abstractmethod
    def get_document_by_id(self, id: UUID, index: Optional[str] = None) -> Optional[Document]:
        """Look up a single document by its UUID; None if not found."""
        pass

    @abstractmethod
    def get_document_ids_by_tags(self, tag, index) -> List[str]:
        """Return the IDs of all documents matching the given tag."""
        pass

    @abstractmethod
    def get_document_count(self, index: Optional[str] = None) -> int:
        """Return the number of documents in the given index (default: self.index)."""
        pass

    @abstractmethod
    def query_by_embedding(self,
                           query_emb: List[float],  # TODO confirm — the leading parameters were hidden in the diff view
                           filters: Optional[dict] = None,
                           top_k: int = 10,
                           index: Optional[str] = None) -> List[Document]:
        """Retrieve the top_k documents most similar to the given query embedding."""
        pass

    @abstractmethod
    def get_label_count(self, index: Optional[str] = None) -> int:
        """Return the number of labels in the given index."""
        pass

    @abstractmethod
    def add_eval_data(self, filename: str, doc_index: str = "document", label_index: str = "feedback"):
        """Load evaluation data from `filename` into the given document and label indices."""
        pass

    def delete_all_documents(self, index: str):
        # NOTE(review): a silent no-op default, and — unlike the other
        # operations — not marked @abstractmethod; confirm subclasses are
        # expected to override it.
        pass
Loading