Skip to content

Commit

Permalink
Merge pull request #705 from NatLibFi/fix-failing-index-command
Browse files (browse the repository at this point in the history)
Fix crashing `index` command when targeted directory contains subject files
  • Loading branch information
juhoinkinen authored May 16, 2023
2 parents 5f670e7 + b602ea2 commit 6f13121
Show file tree
Hide file tree
Showing 7 changed files with 41 additions and 31 deletions.
4 changes: 1 addition & 3 deletions annif/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,9 +318,7 @@ def run_index(
raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
backend_params = cli_util.parse_backend_params(backend_param, project)

- documents = annif.corpus.DocumentDirectory(
- directory, None, None, require_subjects=False
- )
+ documents = annif.corpus.DocumentDirectory(directory, require_subjects=False)
results = project.suggest_corpus(documents, backend_params).filter(limit, threshold)

for (docfilename, _), suggestions in zip(documents, results):
Expand Down
33 changes: 17 additions & 16 deletions annif/corpus/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,40 +17,41 @@
class DocumentDirectory(DocumentCorpus):
"""A directory of files as a full text document corpus"""

- def __init__(self, path, subject_index, language, require_subjects=False):
+ def __init__(self, path, subject_index=None, language=None, require_subjects=False):
self.path = path
self.subject_index = subject_index
self.language = language
self.require_subjects = require_subjects

def __iter__(self):
"""Iterate through the directory, yielding tuples of (docfile,
- subjectfile) containing file paths. If there is no key file and
- require_subjects is False, the subjectfile will be returned as None."""
+ subjectfile) containing file paths. If require_subjects is False, the
+ subjectfile will be returned as None."""

for filename in sorted(glob.glob(os.path.join(self.path, "*.txt"))):
- tsvfilename = re.sub(r"\.txt$", ".tsv", filename)
- if os.path.exists(tsvfilename):
- yield (filename, tsvfilename)
- continue
- keyfilename = re.sub(r"\.txt$", ".key", filename)
- if os.path.exists(keyfilename):
- yield (filename, keyfilename)
- continue
- if not self.require_subjects:
+ if self.require_subjects:
+ tsvfilename = re.sub(r"\.txt$", ".tsv", filename)
+ if os.path.exists(tsvfilename):
+ yield (filename, tsvfilename)
+ continue
+ keyfilename = re.sub(r"\.txt$", ".key", filename)
+ if os.path.exists(keyfilename):
+ yield (filename, keyfilename)
+ continue
+ else:
yield (filename, None)

@property
def documents(self):
- for docfilename, keyfilename in self:
+ for docfilename, subjfilename in self:
with open(docfilename, errors="replace", encoding="utf-8-sig") as docfile:
text = docfile.read()
- if keyfilename is None:
+ if subjfilename is None:
yield Document(text=text, subject_set=None)
continue
- with open(keyfilename, encoding="utf-8-sig") as keyfile:
+ with open(subjfilename, encoding="utf-8-sig") as subjfile:
subjects = SubjectSet.from_string(
- keyfile.read(), self.subject_index, self.language
+ subjfile.read(), self.subject_index, self.language
)
yield Document(text=text, subject_set=subjects)

Expand Down
4 changes: 3 additions & 1 deletion tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,9 @@ def fulltext_corpus(subject_index):
ftdir = os.path.join(
os.path.dirname(__file__), "corpora", "archaeology", "fulltext"
)
- ft_corpus = annif.corpus.DocumentDirectory(ftdir, subject_index, "fi")
+ ft_corpus = annif.corpus.DocumentDirectory(
+ ftdir, subject_index, "fi", require_subjects=True
+ )
return ft_corpus


Expand Down
4 changes: 3 additions & 1 deletion tests/test_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ def test_learn_dummy(project, tmpdir):
tmpdir.join("doc1.tsv").write("<http://www.yso.fi/onto/yso/p10849>\tarchaeologists")
tmpdir.join("doc2.txt").write("doc2")
tmpdir.join("doc2.tsv").write("<http://example.org/dummy>\tdummy")
- docdir = annif.corpus.DocumentDirectory(str(tmpdir), project.subjects, "en")
+ docdir = annif.corpus.DocumentDirectory(
+ str(tmpdir), project.subjects, "en", require_subjects=True
+ )

dummy.learn(docdir)

Expand Down
3 changes: 3 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,9 @@ def test_suggest_dash_path():

def test_index(tmpdir):
tmpdir.join("doc1.txt").write("nothing special")
+ # Existing subject files should not have an effect
+ tmpdir.join("doc1.tsv").write("<http://example.org/dummy>\tdummy")
+ tmpdir.join("doc1.key").write("<http://example.org/dummy>\tdummy")

result = runner.invoke(annif.cli.cli, ["index", "dummy-en", str(tmpdir)])
assert not result.exception
Expand Down
20 changes: 11 additions & 9 deletions tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,38 +80,38 @@ def test_subjectset_as_vector_destination(subject_index):
assert vector is destination


- def test_docdir_key(tmpdir, subject_index):
+ def test_docdir_key(tmpdir):
tmpdir.join("doc1.txt").write("doc1")
tmpdir.join("doc1.key").write("key1")
tmpdir.join("doc2.txt").write("doc2")
tmpdir.join("doc2.key").write("key2")
tmpdir.join("doc3.txt").write("doc3")

- docdir = annif.corpus.DocumentDirectory(str(tmpdir), subject_index, "en")
+ docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=False)
files = sorted(list(docdir))
assert len(files) == 3
assert files[0][0] == str(tmpdir.join("doc1.txt"))
- assert files[0][1] == str(tmpdir.join("doc1.key"))
+ assert files[0][1] is None
assert files[1][0] == str(tmpdir.join("doc2.txt"))
- assert files[1][1] == str(tmpdir.join("doc2.key"))
+ assert files[1][1] is None
assert files[2][0] == str(tmpdir.join("doc3.txt"))
assert files[2][1] is None


- def test_docdir_tsv(tmpdir, subject_index):
+ def test_docdir_tsv(tmpdir):
tmpdir.join("doc1.txt").write("doc1")
tmpdir.join("doc1.tsv").write("<http://example.org/key1>\tkey1")
tmpdir.join("doc2.txt").write("doc2")
tmpdir.join("doc2.tsv").write("<http://example.org/key2>\tkey2")
tmpdir.join("doc3.txt").write("doc3")

- docdir = annif.corpus.DocumentDirectory(str(tmpdir), subject_index, "en")
+ docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=False)
files = sorted(list(docdir))
assert len(files) == 3
assert files[0][0] == str(tmpdir.join("doc1.txt"))
- assert files[0][1] == str(tmpdir.join("doc1.tsv"))
+ assert files[0][1] is None
assert files[1][0] == str(tmpdir.join("doc2.txt"))
- assert files[1][1] == str(tmpdir.join("doc2.tsv"))
+ assert files[1][1] is None
assert files[2][0] == str(tmpdir.join("doc3.txt"))
assert files[2][1] is None

Expand All @@ -126,7 +126,9 @@ def test_docdir_tsv_bom(tmpdir, subject_index):
"<http://www.yso.fi/onto/yso/p2558>\trautakausi".encode("utf-8-sig")
)

- docdir = annif.corpus.DocumentDirectory(str(tmpdir), subject_index, "fi")
+ docdir = annif.corpus.DocumentDirectory(
+ str(tmpdir), subject_index, "fi", require_subjects=True
+ )
docs = list(docdir.documents)
assert docs[0].text == "doc1"
assert (
Expand Down
4 changes: 3 additions & 1 deletion tests/test_project.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,9 @@ def test_project_learn(registry, tmpdir):
tmpdir.join("doc2.tsv").write("<http://example.org/dummy>\tdummy")

project = registry.get_project("dummy-fi")
- docdir = annif.corpus.DocumentDirectory(str(tmpdir), project.subjects, "en")
+ docdir = annif.corpus.DocumentDirectory(
+ str(tmpdir), project.subjects, "en", require_subjects=True
+ )
project.learn(docdir)
result = project.suggest(["this is some text"])[0]
assert len(result) == 1
Expand Down

0 comments on commit 6f13121

Please sign in to comment.