Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add filtering by tags for InMemoryDocumentStore #108

Merged
merged 5 commits into from
May 14, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions haystack/database/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,5 @@ def __getitem__(self, item):
return self.text
if item == 'id':
return self.id
if item == 'meta':
return self.meta
34 changes: 30 additions & 4 deletions haystack/database/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,36 @@ def __init__(self):
self.docs = {}
self.doc_tags = {}

def write_documents(self, documents, tags=None):
    """Store documents in memory and optionally index them by tags.

    :param documents: sequence of dicts; each must provide "name" and "text".
        Documents missing either field are skipped. ``None`` is a no-op.
    :param tags: optional sequence with one entry per document. Each entry is
        either ``None`` (no tags) or a dict such as ``{"tag-1": "value-1"}``.
        If ``tags`` is ``None`` or its length differs from the number of
        documents, no tags are recorded for this call.
    """
    import hashlib

    if documents is None:
        return

    if tags is None or len(tags) != len(documents):
        tags = [None] * len(documents)

    for document, tag in zip(documents, tags):
        name = document.get("name")
        text = document.get("text")
        if name is None or text is None:
            continue

        # The document id is the MD5 hex digest of name + text.
        doc_id = hashlib.md5((name + text).encode("utf-8")).hexdigest()
        self.docs[doc_id] = document

        if not isinstance(tag, dict):
            continue

        for tag_key, tag_value in tag.items():
            # Tag values may be lists (unhashable), so the index key is the
            # string form of the (key, value) pair. Every document sharing
            # the pair is kept, instead of the last write winning.
            doc_ids = self.doc_tags.setdefault(str((tag_key, tag_value)), [])
            if doc_id not in doc_ids:
                doc_ids.append(doc_id)


def get_document_by_id(self, id):
    """Return the stored document for ``id``; raises ``KeyError`` if unknown."""
    return self.docs[id]


def get_document_ids_by_tags(self, tags):
    """Return the stored documents that carry any of the given tag pairs.

    :param tags: a tag dict or a list of tag dicts.
        The format for the dict is {"tag-1": "value-1", "tag-2": "value-2" ...}
        The format for the dict is {"tag-1": ["value-1","value-2"], "tag-2": ["value-3"] ...}
        A value is matched as a whole against the value given at write time,
        so ``["1"]`` does not match a document tagged with ``["1", "2"]``.
    :return: list of matching document dicts, without duplicates, in
        first-match order.
    """
    if not isinstance(tags, list):
        tags = [tags]

    result = []
    seen = set()
    for tag in tags:
        for tag_key, tag_value in tag.items():
            for doc_id in self.doc_tags.get(str((tag_key, tag_value)), []):
                if doc_id in seen:
                    continue
                doc = self.docs.get(doc_id)
                if doc is not None:
                    seen.add(doc_id)
                    result.append(doc)
    return result

def get_document_count(self):
    """Return the number of documents currently held in ``self.docs``."""
    # len() on the dict itself avoids building an items view just to count.
    return len(self.docs)

def get_all_documents(self):
    """Return every stored document wrapped in a ``Document`` object.

    Each ``Document`` receives the store key as ``id``, the stored "text" and
    "name" fields, and the stored "meta" dict (an empty dict when the
    document has no "meta" entry).
    """
    return [
        Document(id=doc_id, text=doc['text'], name=doc['name'], meta=doc.get('meta', {}))
        for doc_id, doc in self.docs.items()
    ]
2 changes: 0 additions & 2 deletions test/test_farm_reader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import pytest

from haystack.reader.farm import FARMReader


Expand Down
41 changes: 37 additions & 4 deletions test/test_in_memory_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,51 @@

def test_finder_get_answers_with_in_memory_store():
    """Run the Finder end to end on documents held in an InMemoryDocumentStore."""
    test_docs = [
        {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}},
        {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': 'url'}},
        {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}}
    ]

    from haystack.database.memory import InMemoryDocumentStore
    document_store = InMemoryDocumentStore()
    # One tag dict per document. Putting the comprehension inside the dict
    # braces yields a single-element list, which the store's length check
    # rejects, so no tags would be written at all.
    document_store.write_documents(test_docs, tags=[{'has_url': ["true"]} for _ in test_docs])

    retriever = TfidfRetriever(document_store=document_store)
    reader = TransformersReader(model="distilbert-base-uncased-distilled-squad",
                                tokenizer="distilbert-base-uncased", use_gpu=-1)
    finder = Finder(reader, retriever)
    prediction = finder.get_answers(question="testing finder", top_k_retriever=10,
                                    top_k_reader=5)
    assert prediction is not None

skirdey marked this conversation as resolved.
Show resolved Hide resolved

def test_memory_store_get_by_tags():
    """Look up a document by a scalar tag value; only the tagged one is returned."""
    from haystack.database.memory import InMemoryDocumentStore

    first = {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}}
    second = {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': None}}
    third = {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}}
    # The first document is written without any tags.
    per_doc_tags = [None, {'has_url': 'false'}, {'has_url': 'true'}]

    store = InMemoryDocumentStore()
    store.write_documents([first, second, third], tags=per_doc_tags)

    matches = store.get_document_ids_by_tags({'has_url': 'false'})

    expected = [{'name': 'testing the finder 2', 'text': 'testing the finder with pyhton unit test 2', 'meta': {'url': None}}]
    assert matches == expected


def test_memory_store_get_by_tag_lists():
    """Look up a document by a list-valued tag; the value list is matched as a whole."""
    from haystack.database.memory import InMemoryDocumentStore

    first = {"name": "testing the finder 1", "text": "testing the finder with pyhton unit test 1", 'meta': {'url': 'url'}}
    second = {"name": "testing the finder 2", "text": "testing the finder with pyhton unit test 2", 'meta': {'url': None}}
    third = {"name": "testing the finder 3", "text": "testing the finder with pyhton unit test 3", 'meta': {'url': 'url'}}
    per_doc_tags = [{'tag2': ["1"]}, {'tag1': ['1']}, {'tag2': ["1", "2"]}]

    store = InMemoryDocumentStore()
    store.write_documents([first, second, third], tags=per_doc_tags)

    matches = store.get_document_ids_by_tags({'tag2': ["1"]})

    expected = [{'name': 'testing the finder 1', 'text': 'testing the finder with pyhton unit test 1', 'meta': {'url': 'url'}}]
    assert matches == expected