diff --git a/annif/cli.py b/annif/cli.py index d24527050..ebed088ba 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -318,9 +318,7 @@ def run_index( raise click.BadParameter(f'language "{lang}" not supported by vocabulary') backend_params = cli_util.parse_backend_params(backend_param, project) - documents = annif.corpus.DocumentDirectory( - directory, None, None, require_subjects=False - ) + documents = annif.corpus.DocumentDirectory(directory, require_subjects=False) results = project.suggest_corpus(documents, backend_params).filter(limit, threshold) for (docfilename, _), suggestions in zip(documents, results): diff --git a/annif/corpus/document.py b/annif/corpus/document.py index c26c75122..54a0a3ba6 100644 --- a/annif/corpus/document.py +++ b/annif/corpus/document.py @@ -17,7 +17,7 @@ class DocumentDirectory(DocumentCorpus): """A directory of files as a full text document corpus""" - def __init__(self, path, subject_index, language, require_subjects=False): + def __init__(self, path, subject_index=None, language=None, require_subjects=False): self.path = path self.subject_index = subject_index self.language = language @@ -25,32 +25,33 @@ def __init__(self, path, subject_index, language, require_subjects=False): def __iter__(self): """Iterate through the directory, yielding tuples of (docfile, - subjectfile) containing file paths. If there is no key file and - require_subjects is False, the subjectfile will be returned as None.""" + subjectfile) containing file paths. If require_subjects is False, the + subjectfile will be returned as None.""" for filename in sorted(glob.glob(os.path.join(self.path, "*.txt"))): - tsvfilename = re.sub(r"\.txt$", ".tsv", filename) - if os.path.exists(tsvfilename): - yield (filename, tsvfilename) - continue - keyfilename = re.sub(r"\.txt$", ".key", filename) - if os.path.exists(keyfilename): - yield (filename, keyfilename) - continue - if not self.require_subjects: + if self.require_subjects: + tsvfilename = re.sub(r"\.txt$", ".tsv", filename) + if os.path.exists(tsvfilename): + yield (filename, tsvfilename) + continue + keyfilename = re.sub(r"\.txt$", ".key", filename) + if os.path.exists(keyfilename): + yield (filename, keyfilename) + continue + else: yield (filename, None) @property def documents(self): - for docfilename, keyfilename in self: + for docfilename, subjfilename in self: with open(docfilename, errors="replace", encoding="utf-8-sig") as docfile: text = docfile.read() - if keyfilename is None: + if subjfilename is None: yield Document(text=text, subject_set=None) continue - with open(keyfilename, encoding="utf-8-sig") as keyfile: + with open(subjfilename, encoding="utf-8-sig") as subjfile: subjects = SubjectSet.from_string( - keyfile.read(), self.subject_index, self.language + subjfile.read(), self.subject_index, self.language ) yield Document(text=text, subject_set=subjects) diff --git a/tests/conftest.py b/tests/conftest.py index fcccb268f..76378a98d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -106,7 +106,9 @@ def fulltext_corpus(subject_index): ftdir = os.path.join( os.path.dirname(__file__), "corpora", "archaeology", "fulltext" ) - ft_corpus = annif.corpus.DocumentDirectory(ftdir, subject_index, "fi") + ft_corpus = annif.corpus.DocumentDirectory( + ftdir, subject_index, "fi", require_subjects=True + ) return ft_corpus diff --git a/tests/test_backend.py b/tests/test_backend.py index 378cb78b8..b7e583d1c 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -32,7 +32,9 @@ def test_learn_dummy(project, tmpdir): tmpdir.join("doc1.tsv").write("\tarchaeologists") tmpdir.join("doc2.txt").write("doc2") tmpdir.join("doc2.tsv").write("\tdummy") - docdir = annif.corpus.DocumentDirectory(str(tmpdir), project.subjects, "en") + docdir = annif.corpus.DocumentDirectory( + str(tmpdir), project.subjects, "en", require_subjects=True + ) dummy.learn(docdir) diff --git a/tests/test_cli.py b/tests/test_cli.py index 496cd8b13..ef44df8a3 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -570,6 +570,9 @@ def test_suggest_dash_path(): def test_index(tmpdir): tmpdir.join("doc1.txt").write("nothing special") + # Existing subject files should not have an effect + tmpdir.join("doc1.tsv").write("\tdummy") + tmpdir.join("doc1.key").write("\tdummy") result = runner.invoke(annif.cli.cli, ["index", "dummy-en", str(tmpdir)]) assert not result.exception diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 6580477bb..6e7db5158 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -80,38 +80,38 @@ def test_subjectset_as_vector_destination(subject_index): assert vector is destination -def test_docdir_key(tmpdir, subject_index): +def test_docdir_key(tmpdir): tmpdir.join("doc1.txt").write("doc1") tmpdir.join("doc1.key").write("key1") tmpdir.join("doc2.txt").write("doc2") tmpdir.join("doc2.key").write("key2") tmpdir.join("doc3.txt").write("doc3") - docdir = annif.corpus.DocumentDirectory(str(tmpdir), subject_index, "en") + docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=False) files = sorted(list(docdir)) assert len(files) == 3 assert files[0][0] == str(tmpdir.join("doc1.txt")) - assert files[0][1] == str(tmpdir.join("doc1.key")) + assert files[0][1] is None assert files[1][0] == str(tmpdir.join("doc2.txt")) - assert files[1][1] == str(tmpdir.join("doc2.key")) + assert files[1][1] is None assert files[2][0] == str(tmpdir.join("doc3.txt")) assert files[2][1] is None -def test_docdir_tsv(tmpdir, subject_index): +def test_docdir_tsv(tmpdir): tmpdir.join("doc1.txt").write("doc1") tmpdir.join("doc1.tsv").write("\tkey1") tmpdir.join("doc2.txt").write("doc2") tmpdir.join("doc2.tsv").write("\tkey2") tmpdir.join("doc3.txt").write("doc3") - docdir = annif.corpus.DocumentDirectory(str(tmpdir), subject_index, "en") + docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=False) files = sorted(list(docdir)) assert len(files) == 3 assert files[0][0] == str(tmpdir.join("doc1.txt")) - assert files[0][1] == str(tmpdir.join("doc1.tsv")) + assert files[0][1] is None assert files[1][0] == str(tmpdir.join("doc2.txt")) - assert files[1][1] == str(tmpdir.join("doc2.tsv")) + assert files[1][1] is None assert files[2][0] == str(tmpdir.join("doc3.txt")) assert files[2][1] is None @@ -126,7 +126,9 @@ def test_docdir_tsv_bom(tmpdir, subject_index): "\trautakausi".encode("utf-8-sig") ) - docdir = annif.corpus.DocumentDirectory(str(tmpdir), subject_index, "fi") + docdir = annif.corpus.DocumentDirectory( + str(tmpdir), subject_index, "fi", require_subjects=True + ) docs = list(docdir.documents) assert docs[0].text == "doc1" assert ( diff --git a/tests/test_project.py b/tests/test_project.py index 6600d664c..a6294edb6 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -183,7 +183,9 @@ def test_project_learn(registry, tmpdir): tmpdir.join("doc2.tsv").write("\tdummy") project = registry.get_project("dummy-fi") - docdir = annif.corpus.DocumentDirectory(str(tmpdir), project.subjects, "en") + docdir = annif.corpus.DocumentDirectory( + str(tmpdir), project.subjects, "en", require_subjects=True + ) project.learn(docdir) result = project.suggest(["this is some text"])[0] assert len(result) == 1