Skip to content

Commit

Permalink
Parse tsv/key files in DocumentCorpus only if required
Browse files (browse the repository at this point in the history)
  • Loading branch information
juhoinkinen committed May 16, 2023
1 parent 9712028 commit b602ea2
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 27 deletions.
4 changes: 1 addition & 3 deletions annif/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,9 +318,7 @@ def run_index(
raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
backend_params = cli_util.parse_backend_params(backend_param, project)

documents = annif.corpus.DocumentDirectory(
directory, project.subjects, lang, require_subjects=False
)
documents = annif.corpus.DocumentDirectory(directory, require_subjects=False)
results = project.suggest_corpus(documents, backend_params).filter(limit, threshold)

for (docfilename, _), suggestions in zip(documents, results):
Expand Down
25 changes: 13 additions & 12 deletions annif/corpus/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,27 +17,28 @@
class DocumentDirectory(DocumentCorpus):
"""A directory of files as a full text document corpus"""

def __init__(self, path, subject_index, language, require_subjects=False):
def __init__(self, path, subject_index=None, language=None, require_subjects=False):
self.path = path
self.subject_index = subject_index
self.language = language
self.require_subjects = require_subjects

def __iter__(self):
"""Iterate through the directory, yielding tuples of (docfile,
subjectfile) containing file paths. If there is no key file and
require_subjects is False, the subjectfile will be returned as None."""
subjectfile) containing file paths. If require_subjects is False, the
subjectfile will be returned as None."""

for filename in sorted(glob.glob(os.path.join(self.path, "*.txt"))):
tsvfilename = re.sub(r"\.txt$", ".tsv", filename)
if os.path.exists(tsvfilename):
yield (filename, tsvfilename)
continue
keyfilename = re.sub(r"\.txt$", ".key", filename)
if os.path.exists(keyfilename):
yield (filename, keyfilename)
continue
if not self.require_subjects:
if self.require_subjects:
tsvfilename = re.sub(r"\.txt$", ".tsv", filename)
if os.path.exists(tsvfilename):
yield (filename, tsvfilename)
continue
keyfilename = re.sub(r"\.txt$", ".key", filename)
if os.path.exists(keyfilename):
yield (filename, keyfilename)
continue
else:
yield (filename, None)

@property
Expand Down
4 changes: 3 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,9 @@ def fulltext_corpus(subject_index):
ftdir = os.path.join(
os.path.dirname(__file__), "corpora", "archaeology", "fulltext"
)
ft_corpus = annif.corpus.DocumentDirectory(ftdir, subject_index, "fi")
ft_corpus = annif.corpus.DocumentDirectory(
ftdir, subject_index, "fi", require_subjects=True
)
return ft_corpus


Expand Down
4 changes: 3 additions & 1 deletion tests/test_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ def test_learn_dummy(project, tmpdir):
tmpdir.join("doc1.tsv").write("<http://www.yso.fi/onto/yso/p10849>\tarchaeologists")
tmpdir.join("doc2.txt").write("doc2")
tmpdir.join("doc2.tsv").write("<http://example.org/dummy>\tdummy")
docdir = annif.corpus.DocumentDirectory(str(tmpdir), project.subjects, "en")
docdir = annif.corpus.DocumentDirectory(
str(tmpdir), project.subjects, "en", require_subjects=True
)

dummy.learn(docdir)

Expand Down
20 changes: 11 additions & 9 deletions tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,38 +80,38 @@ def test_subjectset_as_vector_destination(subject_index):
assert vector is destination


def test_docdir_key(tmpdir, subject_index):
def test_docdir_key(tmpdir):
tmpdir.join("doc1.txt").write("doc1")
tmpdir.join("doc1.key").write("key1")
tmpdir.join("doc2.txt").write("doc2")
tmpdir.join("doc2.key").write("key2")
tmpdir.join("doc3.txt").write("doc3")

docdir = annif.corpus.DocumentDirectory(str(tmpdir), subject_index, "en")
docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=False)
files = sorted(list(docdir))
assert len(files) == 3
assert files[0][0] == str(tmpdir.join("doc1.txt"))
assert files[0][1] == str(tmpdir.join("doc1.key"))
assert files[0][1] is None
assert files[1][0] == str(tmpdir.join("doc2.txt"))
assert files[1][1] == str(tmpdir.join("doc2.key"))
assert files[1][1] is None
assert files[2][0] == str(tmpdir.join("doc3.txt"))
assert files[2][1] is None


def test_docdir_tsv(tmpdir, subject_index):
def test_docdir_tsv(tmpdir):
tmpdir.join("doc1.txt").write("doc1")
tmpdir.join("doc1.tsv").write("<http://example.org/key1>\tkey1")
tmpdir.join("doc2.txt").write("doc2")
tmpdir.join("doc2.tsv").write("<http://example.org/key2>\tkey2")
tmpdir.join("doc3.txt").write("doc3")

docdir = annif.corpus.DocumentDirectory(str(tmpdir), subject_index, "en")
docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=False)
files = sorted(list(docdir))
assert len(files) == 3
assert files[0][0] == str(tmpdir.join("doc1.txt"))
assert files[0][1] == str(tmpdir.join("doc1.tsv"))
assert files[0][1] is None
assert files[1][0] == str(tmpdir.join("doc2.txt"))
assert files[1][1] == str(tmpdir.join("doc2.tsv"))
assert files[1][1] is None
assert files[2][0] == str(tmpdir.join("doc3.txt"))
assert files[2][1] is None

Expand All @@ -126,7 +126,9 @@ def test_docdir_tsv_bom(tmpdir, subject_index):
"<http://www.yso.fi/onto/yso/p2558>\trautakausi".encode("utf-8-sig")
)

docdir = annif.corpus.DocumentDirectory(str(tmpdir), subject_index, "fi")
docdir = annif.corpus.DocumentDirectory(
str(tmpdir), subject_index, "fi", require_subjects=True
)
docs = list(docdir.documents)
assert docs[0].text == "doc1"
assert (
Expand Down
4 changes: 3 additions & 1 deletion tests/test_project.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,9 @@ def test_project_learn(registry, tmpdir):
tmpdir.join("doc2.tsv").write("<http://example.org/dummy>\tdummy")

project = registry.get_project("dummy-fi")
docdir = annif.corpus.DocumentDirectory(str(tmpdir), project.subjects, "en")
docdir = annif.corpus.DocumentDirectory(
str(tmpdir), project.subjects, "en", require_subjects=True
)
project.learn(docdir)
result = project.suggest(["this is some text"])[0]
assert len(result) == 1
Expand Down

0 comments on commit b602ea2

Please sign in to comment.