Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rename and restructure modules #379

Merged
merged 8 commits into from
Sep 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ dmypy.json
.idea

# haystack files
haystack/database/qa.db
haystack/document_store/qa.db
data
mlruns
src
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ COPY README.rst models* /home/user/models/
# optional : copy sqlite db if needed for testing
#COPY qa.db /home/user/

# optional: copy data directory containing docs for indexing
# optional: copy data directory containing docs for ingestion
#COPY data /home/user/data

EXPOSE 8000
Expand Down
1 change: 1 addition & 0 deletions haystack/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging

import pandas as pd
from haystack.schema import Document, Label, MultiLabel
from haystack.finder import Finder

pd.options.display.max_colwidth = 80
Expand Down
File renamed without changes.
125 changes: 125 additions & 0 deletions haystack/document_store/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import logging
from abc import abstractmethod, ABC
from typing import Any, Optional, Dict, List, Union
from haystack import Document, Label, MultiLabel

logger = logging.getLogger(__name__)


class BaseDocumentStore(ABC):
    """
    Base class for implementing Document Stores.

    Concrete stores have to implement every method decorated with ``@abstractmethod``.
    ``get_all_labels_aggregated`` is implemented here on top of ``get_all_labels``.
    """
    # Name of the default index for documents / labels (see `write_documents`).
    index: Optional[str]
    label_index: Optional[str]

    @abstractmethod
    def write_documents(self, documents: Union[List[dict], List[Document]], index: Optional[str] = None):
        """
        Indexes documents for later queries.

        :param documents: a list of Python dictionaries or a list of Haystack Document objects.
                          For documents as dictionaries, the format is {"text": "<the-actual-text>"}.
                          Optionally: Include meta data via {"text": "<the-actual-text>",
                          "meta":{"name": "<some-document-name>", "author": "somebody", ...}}
                          It can be used for filtering and is accessible in the responses of the Finder.
        :param index: Optional name of index where the documents shall be written to.
                      If None, the DocumentStore's default index (self.index) will be used.

        :return: None
        """
        pass

    @abstractmethod
    def get_all_documents(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Document]:
        """
        Return a list of Documents. Abstract; the exact meaning of `index` and `filters`
        is defined by the implementing store.
        """
        pass

    @abstractmethod
    def get_all_labels(self, index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> List[Label]:
        """
        Return a list of Labels. Abstract; the exact meaning of `index` and `filters`
        is defined by the implementing store. Used by `get_all_labels_aggregated`.
        """
        pass

    def get_all_labels_aggregated(self,
                                  index: Optional[str] = None,
                                  filters: Optional[Dict[str, List[str]]] = None) -> List[MultiLabel]:
        """
        Collapse all labels that share the same question into one MultiLabel per question.

        Only labels with `is_correct_answer` set are considered. Exact duplicates are dropped.
        If a question has both text answers and empty ("no answer") answers, the empty ones are
        discarded and a warning is logged. The non-list fields of each MultiLabel
        (is_correct_answer, is_correct_document, origin, no_answer, model_id) are taken from the
        first remaining label of that question.

        :param index: forwarded unchanged to `get_all_labels`.
        :param filters: forwarded unchanged to `get_all_labels`.
        :return: one MultiLabel per distinct question, in order of first appearance.
        """
        all_labels = self.get_all_labels(index=index, filters=filters)

        # Collect all answers to a question in a dict (dicts keep insertion order, so the
        # result follows the order in which questions first appear)
        labels_by_question: Dict[str, List[Label]] = {}
        for label in all_labels:
            # only aggregate labels with correct answers, as only those can be currently used in evaluation
            if not label.is_correct_answer:
                continue
            labels_by_question.setdefault(label.question, []).append(label)

        # Aggregate labels
        aggregated_labels = []
        for question, labels in labels_by_question.items():
            # Get rid of exact duplicates. dict.fromkeys keeps first-seen order, whereas a set
            # would make the order of the aggregated answers vary between runs.
            labels = list(dict.fromkeys(labels))

            # if both text and no answer are present, remove no answer labels
            text_labels = [label for label in labels if len(label.answer) != 0]
            if text_labels and len(text_labels) < len(labels):
                logger.warning(
                    f"Both text label and 'no answer possible' label is present for question: {question}")
                labels = text_labels

            # construct Aggregated_label; `labels` is never empty here because every question
            # key was created together with at least one label
            first = labels[0]
            aggregated_labels.append(
                MultiLabel(question=first.question,
                           multiple_answers=[label.answer for label in labels],
                           is_correct_answer=first.is_correct_answer,
                           is_correct_document=first.is_correct_document,
                           origin=first.origin,
                           multiple_document_ids=[label.document_id for label in labels],
                           multiple_offset_start_in_docs=[label.offset_start_in_doc for label in labels],
                           no_answer=first.no_answer,
                           model_id=first.model_id,
                           )
            )
        return aggregated_labels

    @abstractmethod
    def get_document_by_id(self, id: str, index: Optional[str] = None) -> Optional[Document]:
        """
        Return a single Document or None. Abstract; lookup semantics are defined by the
        implementing store.
        """
        pass

    @abstractmethod
    def get_document_count(self, index: Optional[str] = None) -> int:
        """
        Return an integer count. Abstract; implemented by the concrete store.
        """
        pass

    @abstractmethod
    def query_by_embedding(self,
                           query_emb: List[float],
                           filters: Optional[Dict[str, List[str]]] = None,
                           top_k: int = 10,
                           index: Optional[str] = None) -> List[Document]:
        """
        Return a list of Documents for the embedding `query_emb`. Abstract; how `filters`,
        `top_k` and `index` are applied is defined by the implementing store.
        """
        pass

    @abstractmethod
    def get_label_count(self, index: Optional[str] = None) -> int:
        """
        Return an integer count. Abstract; implemented by the concrete store.
        """
        pass

    @abstractmethod
    def add_eval_data(self, filename: str, doc_index: str = "document", label_index: str = "label"):
        """
        Abstract; implemented by the concrete store.
        NOTE(review): presumably loads evaluation documents and labels from `filename` into
        `doc_index` / `label_index` - confirm against the implementations.
        """
        pass

    def delete_all_documents(self, index: str):
        """
        Not abstract: this base implementation does nothing and returns None.
        """
        pass

Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
from elasticsearch.helpers import bulk, scan
import numpy as np

from haystack.database.base import BaseDocumentStore, Document, Label
from haystack.indexing.utils import eval_data_from_file
from haystack.document_store.base import BaseDocumentStore
from haystack import Document, Label
from haystack.preprocessor.utils import eval_data_from_file
from haystack.retriever.base import BaseRetriever

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -70,7 +71,7 @@ def __init__(
:param refresh_type: Type of ES refresh used to control when changes made by a request (e.g. bulk) are made visible to search.
Values:
- 'wait_for' => continue only after changes are visible (slow, but safe)
- 'false' => continue directly (fast, but sometimes unintuitive behaviour when docs are not immediately available after indexing)
- 'false' => continue directly (fast, but sometimes unintuitive behaviour when docs are not immediately available after ingestion)
More info at https://www.elastic.co/guide/en/elasticsearch/reference/6.8/docs-refresh.html
"""
self.client = Elasticsearch(hosts=[{"host": host, "port": port}], http_auth=(username, password),
Expand Down Expand Up @@ -470,7 +471,7 @@ def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = Non

if embeddings[0].shape[0] != self.embedding_dim:
raise RuntimeError(f"Embedding dim. of model ({embeddings[0].shape[0]})"
f" doesn't match embedding dim. in documentstore ({self.embedding_dim})."
f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim})."
"Specify the arg `embedding_dim` when initializing ElasticsearchDocumentStore()")
doc_updates = []
for doc, emb in zip(docs, embeddings):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
import numpy as np
from faiss.swigfaiss import IndexHNSWFlat

from haystack.database.base import Document
from haystack.database.sql import SQLDocumentStore
from haystack import Document
from haystack.document_store.sql import SQLDocumentStore
from haystack.retriever.base import BaseRetriever

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -35,7 +35,7 @@ def __init__(
"""
:param sql_url: SQL connection URL for database. It defaults to local file based SQLite DB. For large scale
deployment, Postgres is recommended.
:param index_buffer_size: When working with large dataset, the indexing process(FAISS + SQL) can be buffered in
:param index_buffer_size: When working with large datasets, the ingestion process(FAISS + SQL) can be buffered in
smaller chunks to reduce memory footprint.
:param vector_size: the embedding vector size.
:param faiss_index: load an existing FAISS Index.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
from uuid import uuid4
from collections import defaultdict

from haystack.database.base import BaseDocumentStore, Document, Label
from haystack.indexing.utils import eval_data_from_file
from haystack.document_store.base import BaseDocumentStore
from haystack import Document, Label
from haystack.preprocessor.utils import eval_data_from_file
from haystack.retriever.base import BaseRetriever

import logging
Expand Down Expand Up @@ -114,7 +115,7 @@ def update_embeddings(self, retriever: BaseRetriever, index: Optional[str] = Non

if embeddings[0].shape[0] != self.embedding_dim:
raise RuntimeError(f"Embedding dim. of model ({embeddings[0].shape[0]})"
f" doesn't match embedding dim. in documentstore ({self.embedding_dim})."
f" doesn't match embedding dim. in DocumentStore ({self.embedding_dim})."
"Specify the arg `embedding_dim` when initializing InMemoryDocumentStore()")

for doc, emb in zip(docs, embeddings):
Expand Down
5 changes: 3 additions & 2 deletions haystack/database/sql.py → haystack/document_store/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, sessionmaker

from haystack.database.base import BaseDocumentStore, Document, Label
from haystack.indexing.utils import eval_data_from_file
from haystack.document_store.base import BaseDocumentStore
from haystack import Document, Label
from haystack.preprocessor.utils import eval_data_from_file

Base = declarative_base() # type: Any

Expand Down
2 changes: 1 addition & 1 deletion haystack/eval.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import List, Tuple, Dict, Any

from haystack.database.base import MultiLabel
from haystack import MultiLabel


def calculate_reader_metrics(metric_counts: Dict[str, float], correct_retrievals: int):
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

class BaseConverter:
"""
Base class for implementing file converts to transform input documents to text format for indexing in database.
Base class for implementing file converters to transform input documents to text format for ingestion in DocumentStore.
"""

def __init__(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from haystack.indexing.file_converters.base import BaseConverter
from haystack.file_converter.base import BaseConverter
import logging
from pathlib import Path
from typing import List, Dict, Optional, Any, Tuple
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pathlib import Path
from typing import List, Optional, Dict, Tuple, Any

from haystack.indexing.file_converters.base import BaseConverter
from haystack.file_converter.base import BaseConverter

logger = logging.getLogger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import requests
from tika import parser as tikaparser

from haystack.indexing.file_converters.base import BaseConverter
from haystack.file_converter.base import BaseConverter

logger = logging.getLogger(__name__)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path
from typing import List, Optional, Tuple, Any, Dict

from haystack.indexing.file_converters.base import BaseConverter
from haystack.file_converter.base import BaseConverter

logger = logging.getLogger(__name__)

Expand Down
2 changes: 1 addition & 1 deletion haystack/finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from haystack.reader.base import BaseReader
from haystack.retriever.base import BaseRetriever
from haystack.database.base import MultiLabel, Document
from haystack import MultiLabel, Document
from haystack.eval import calculate_average_precision, eval_counts_reader_batch, calculate_reader_metrics, \
eval_counts_reader

Expand Down
File renamed without changes.
10 changes: 5 additions & 5 deletions haystack/indexing/utils.py → haystack/preprocessor/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@

from farm.data_handler.utils import http_get

from haystack.indexing.file_converters.pdf import PDFToTextConverter
from haystack.indexing.file_converters.tika import TikaConverter
from haystack.database.base import Document, Label
from haystack.file_converter.pdf import PDFToTextConverter
from haystack.file_converter.tika import TikaConverter
from haystack import Document, Label

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -78,7 +78,7 @@ def convert_files_to_dicts(dir_path: str, clean_func: Optional[Callable] = None,
Convert all files(.txt, .pdf) in the sub-directories of the given path to Python dicts that can be written to a
Document Store.

:param dir_path: path for the documents to be written to the database
:param dir_path: path for the documents to be written to the DocumentStore
:param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
:param split_paragraphs: split text in paragraphs.

Expand Down Expand Up @@ -127,7 +127,7 @@ def tika_convert_files_to_dicts(
Convert all files(.txt, .pdf) in the sub-directories of the given path to Python dicts that can be written to a
Document Store.

:param dir_path: path for the documents to be written to the database
:param dir_path: path for the documents to be written to the DocumentStore
:param clean_func: a custom cleaning function that gets applied to each doc (input: str, output:str)
:param split_paragraphs: split text in paragraphs.

Expand Down
2 changes: 1 addition & 1 deletion haystack/reader/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from abc import ABC, abstractmethod
from typing import List, Optional, Sequence

from haystack.database.base import Document
from haystack import Document


class BaseReader(ABC):
Expand Down
4 changes: 3 additions & 1 deletion haystack/reader/farm.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,10 @@
from scipy.special import expit
import shutil

from haystack.database.base import Document, BaseDocumentStore
from haystack import Document
from haystack.document_store.base import BaseDocumentStore
from haystack.reader.base import BaseReader

logger = logging.getLogger(__name__)


Expand Down
2 changes: 1 addition & 1 deletion haystack/reader/transformers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from transformers import pipeline

from haystack.database.base import Document
from haystack import Document
from haystack.reader.base import BaseReader


Expand Down
4 changes: 2 additions & 2 deletions haystack/retriever/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from typing import List
import logging

from haystack.database.base import Document
from haystack.database.base import BaseDocumentStore
from haystack import Document
from haystack.document_store.base import BaseDocumentStore

logger = logging.getLogger(__name__)

Expand Down
4 changes: 3 additions & 1 deletion haystack/retriever/dense.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@

from farm.infer import Inferencer

from haystack.database.base import Document, BaseDocumentStore
from haystack.document_store.base import BaseDocumentStore
from haystack import Document
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.base import BaseRetriever
from haystack.retriever.sparse import logger

Expand Down
2 changes: 1 addition & 1 deletion haystack/retriever/dpr_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@

class ModelOutput:
"""
Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like
Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like
a tuple) or strings (like a dictionary) that will ignore the ``None`` attributes.
"""

Expand Down
5 changes: 3 additions & 2 deletions haystack/retriever/sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from haystack.database.base import Document, BaseDocumentStore
from haystack.database.elasticsearch import ElasticsearchDocumentStore
from haystack.document_store.base import BaseDocumentStore
from haystack import Document
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.base import BaseRetriever
from collections import namedtuple

Expand Down
Loading