Merge pull request #705 from NatLibFi/fix-failing-index-command

Fix crashing `index` command when targeted directory contains subject files
NatLibFi · May 16, 2023 · 6f13121
2 parents 5f670e7 + b602ea2
commit 6f13121
Show file tree

Hide file tree

Showing 7 changed files with 41 additions and 31 deletions.
diff --git a/annif/cli.py b/annif/cli.py
@@ -318,9 +318,7 @@ def run_index(
         raise click.BadParameter(f'language "{lang}" not supported by vocabulary')
     backend_params = cli_util.parse_backend_params(backend_param, project)
 
-    documents = annif.corpus.DocumentDirectory(
-        directory, None, None, require_subjects=False
-    )
+    documents = annif.corpus.DocumentDirectory(directory, require_subjects=False)
     results = project.suggest_corpus(documents, backend_params).filter(limit, threshold)
 
     for (docfilename, _), suggestions in zip(documents, results):

diff --git a/annif/corpus/document.py b/annif/corpus/document.py
@@ -17,40 +17,41 @@
 class DocumentDirectory(DocumentCorpus):
     """A directory of files as a full text document corpus"""
 
-    def __init__(self, path, subject_index, language, require_subjects=False):
+    def __init__(self, path, subject_index=None, language=None, require_subjects=False):
         self.path = path
         self.subject_index = subject_index
         self.language = language
         self.require_subjects = require_subjects
 
     def __iter__(self):
         """Iterate through the directory, yielding tuples of (docfile,
-        subjectfile) containing file paths. If there is no key file and
-        require_subjects is False, the subjectfile will be returned as None."""
+        subjectfile) containing file paths. If require_subjects is False, the
+        subjectfile will be returned as None."""
 
         for filename in sorted(glob.glob(os.path.join(self.path, "*.txt"))):
-            tsvfilename = re.sub(r"\.txt$", ".tsv", filename)
-            if os.path.exists(tsvfilename):
-                yield (filename, tsvfilename)
-                continue
-            keyfilename = re.sub(r"\.txt$", ".key", filename)
-            if os.path.exists(keyfilename):
-                yield (filename, keyfilename)
-                continue
-            if not self.require_subjects:
+            if self.require_subjects:
+                tsvfilename = re.sub(r"\.txt$", ".tsv", filename)
+                if os.path.exists(tsvfilename):
+                    yield (filename, tsvfilename)
+                    continue
+                keyfilename = re.sub(r"\.txt$", ".key", filename)
+                if os.path.exists(keyfilename):
+                    yield (filename, keyfilename)
+                    continue
+            else:
                 yield (filename, None)
 
     @property
     def documents(self):
-        for docfilename, keyfilename in self:
+        for docfilename, subjfilename in self:
             with open(docfilename, errors="replace", encoding="utf-8-sig") as docfile:
                 text = docfile.read()
-            if keyfilename is None:
+            if subjfilename is None:
                 yield Document(text=text, subject_set=None)
                 continue
-            with open(keyfilename, encoding="utf-8-sig") as keyfile:
+            with open(subjfilename, encoding="utf-8-sig") as subjfile:
                 subjects = SubjectSet.from_string(
-                    keyfile.read(), self.subject_index, self.language
+                    subjfile.read(), self.subject_index, self.language
                 )
             yield Document(text=text, subject_set=subjects)
 

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -106,7 +106,9 @@ def fulltext_corpus(subject_index):
     ftdir = os.path.join(
         os.path.dirname(__file__), "corpora", "archaeology", "fulltext"
     )
-    ft_corpus = annif.corpus.DocumentDirectory(ftdir, subject_index, "fi")
+    ft_corpus = annif.corpus.DocumentDirectory(
+        ftdir, subject_index, "fi", require_subjects=True
+    )
     return ft_corpus
 
 

diff --git a/tests/test_backend.py b/tests/test_backend.py
@@ -32,7 +32,9 @@ def test_learn_dummy(project, tmpdir):
     tmpdir.join("doc1.tsv").write("<http://www.yso.fi/onto/yso/p10849>\tarchaeologists")
     tmpdir.join("doc2.txt").write("doc2")
     tmpdir.join("doc2.tsv").write("<http://example.org/dummy>\tdummy")
-    docdir = annif.corpus.DocumentDirectory(str(tmpdir), project.subjects, "en")
+    docdir = annif.corpus.DocumentDirectory(
+        str(tmpdir), project.subjects, "en", require_subjects=True
+    )
 
     dummy.learn(docdir)
 

diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -570,6 +570,9 @@ def test_suggest_dash_path():
 
 def test_index(tmpdir):
     tmpdir.join("doc1.txt").write("nothing special")
+    # Existing subject files should not have an effect
+    tmpdir.join("doc1.tsv").write("<http://example.org/dummy>\tdummy")
+    tmpdir.join("doc1.key").write("<http://example.org/dummy>\tdummy")
 
     result = runner.invoke(annif.cli.cli, ["index", "dummy-en", str(tmpdir)])
     assert not result.exception

diff --git a/tests/test_corpus.py b/tests/test_corpus.py
@@ -80,38 +80,38 @@ def test_subjectset_as_vector_destination(subject_index):
     assert vector is destination
 
 
-def test_docdir_key(tmpdir, subject_index):
+def test_docdir_key(tmpdir):
     tmpdir.join("doc1.txt").write("doc1")
     tmpdir.join("doc1.key").write("key1")
     tmpdir.join("doc2.txt").write("doc2")
     tmpdir.join("doc2.key").write("key2")
     tmpdir.join("doc3.txt").write("doc3")
 
-    docdir = annif.corpus.DocumentDirectory(str(tmpdir), subject_index, "en")
+    docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=False)
     files = sorted(list(docdir))
     assert len(files) == 3
     assert files[0][0] == str(tmpdir.join("doc1.txt"))
-    assert files[0][1] == str(tmpdir.join("doc1.key"))
+    assert files[0][1] is None
     assert files[1][0] == str(tmpdir.join("doc2.txt"))
-    assert files[1][1] == str(tmpdir.join("doc2.key"))
+    assert files[1][1] is None
     assert files[2][0] == str(tmpdir.join("doc3.txt"))
     assert files[2][1] is None
 
 
-def test_docdir_tsv(tmpdir, subject_index):
+def test_docdir_tsv(tmpdir):
     tmpdir.join("doc1.txt").write("doc1")
     tmpdir.join("doc1.tsv").write("<http://example.org/key1>\tkey1")
     tmpdir.join("doc2.txt").write("doc2")
     tmpdir.join("doc2.tsv").write("<http://example.org/key2>\tkey2")
     tmpdir.join("doc3.txt").write("doc3")
 
-    docdir = annif.corpus.DocumentDirectory(str(tmpdir), subject_index, "en")
+    docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=False)
     files = sorted(list(docdir))
     assert len(files) == 3
     assert files[0][0] == str(tmpdir.join("doc1.txt"))
-    assert files[0][1] == str(tmpdir.join("doc1.tsv"))
+    assert files[0][1] is None
     assert files[1][0] == str(tmpdir.join("doc2.txt"))
-    assert files[1][1] == str(tmpdir.join("doc2.tsv"))
+    assert files[1][1] is None
     assert files[2][0] == str(tmpdir.join("doc3.txt"))
     assert files[2][1] is None
 
@@ -126,7 +126,9 @@ def test_docdir_tsv_bom(tmpdir, subject_index):
         "<http://www.yso.fi/onto/yso/p2558>\trautakausi".encode("utf-8-sig")
     )
 
-    docdir = annif.corpus.DocumentDirectory(str(tmpdir), subject_index, "fi")
+    docdir = annif.corpus.DocumentDirectory(
+        str(tmpdir), subject_index, "fi", require_subjects=True
+    )
     docs = list(docdir.documents)
     assert docs[0].text == "doc1"
     assert (

diff --git a/tests/test_project.py b/tests/test_project.py
@@ -183,7 +183,9 @@ def test_project_learn(registry, tmpdir):
     tmpdir.join("doc2.tsv").write("<http://example.org/dummy>\tdummy")
 
     project = registry.get_project("dummy-fi")
-    docdir = annif.corpus.DocumentDirectory(str(tmpdir), project.subjects, "en")
+    docdir = annif.corpus.DocumentDirectory(
+        str(tmpdir), project.subjects, "en", require_subjects=True
+    )
     project.learn(docdir)
     result = project.suggest(["this is some text"])[0]
     assert len(result) == 1