From 626ee926fcdef8e01504d1f790eebd5aad95f228 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Mon, 15 May 2023 10:28:40 +0300 Subject: [PATCH 1/4] Make test_index check that subject files in target dir do not crash --- tests/test_cli.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_cli.py b/tests/test_cli.py index 0ef12b63d..2b8dad961 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -570,6 +570,9 @@ def test_suggest_dash_path(): def test_index(tmpdir): tmpdir.join("doc1.txt").write("nothing special") + # Existing subject files should not have an effect + tmpdir.join("doc1.tsv").write("\tdummy") + tmpdir.join("doc1.key").write("\tdummy") result = runner.invoke(annif.cli.cli, ["index", "dummy-en", str(tmpdir)]) assert not result.exception From b56b31e0c49e5b9270f5067af135a44536a2aa5e Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Mon, 15 May 2023 10:59:20 +0300 Subject: [PATCH 2/4] Rename vars to better correspond to their origin/usage: keyfile(name) -> subjfile(name) --- annif/corpus/document.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/annif/corpus/document.py b/annif/corpus/document.py index c26c75122..258285a26 100644 --- a/annif/corpus/document.py +++ b/annif/corpus/document.py @@ -42,15 +42,15 @@ def __iter__(self): @property def documents(self): - for docfilename, keyfilename in self: + for docfilename, subjfilename in self: with open(docfilename, errors="replace", encoding="utf-8-sig") as docfile: text = docfile.read() - if keyfilename is None: + if subjfilename is None: yield Document(text=text, subject_set=None) continue - with open(keyfilename, encoding="utf-8-sig") as keyfile: + with open(subjfilename, encoding="utf-8-sig") as subjfile: subjects = SubjectSet.from_string( - keyfile.read(), self.subject_index, self.language + subjfile.read(), self.subject_index, self.language ) yield Document(text=text, 
subject_set=subjects) From 971202817d5187b5ecacf3b5e984a07d374af662 Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Mon, 15 May 2023 11:15:34 +0300 Subject: [PATCH 3/4] Fix crashing index cmd when subject files are present in target dir --- annif/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/annif/cli.py b/annif/cli.py index 10d632c2c..7ab26998c 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -319,7 +319,7 @@ def run_index( backend_params = cli_util.parse_backend_params(backend_param, project) documents = annif.corpus.DocumentDirectory( - directory, None, None, require_subjects=False + directory, project.subjects, lang, require_subjects=False ) results = project.suggest_corpus(documents, backend_params).filter(limit, threshold) From b602ea246c8de003faf40599cf5849b328ef114e Mon Sep 17 00:00:00 2001 From: Juho Inkinen <34240031+juhoinkinen@users.noreply.github.com> Date: Tue, 16 May 2023 12:35:59 +0300 Subject: [PATCH 4/4] Parse tsv/key files in DocumentCorpus only if required --- annif/cli.py | 4 +--- annif/corpus/document.py | 25 +++++++++++++------------ tests/conftest.py | 4 +++- tests/test_backend.py | 4 +++- tests/test_corpus.py | 20 +++++++++++--------- tests/test_project.py | 4 +++- 6 files changed, 34 insertions(+), 27 deletions(-) diff --git a/annif/cli.py b/annif/cli.py index 7ab26998c..66b723794 100644 --- a/annif/cli.py +++ b/annif/cli.py @@ -318,9 +318,7 @@ def run_index( raise click.BadParameter(f'language "{lang}" not supported by vocabulary') backend_params = cli_util.parse_backend_params(backend_param, project) - documents = annif.corpus.DocumentDirectory( - directory, project.subjects, lang, require_subjects=False - ) + documents = annif.corpus.DocumentDirectory(directory, require_subjects=False) results = project.suggest_corpus(documents, backend_params).filter(limit, threshold) for (docfilename, _), suggestions in zip(documents, results): diff --git 
a/annif/corpus/document.py b/annif/corpus/document.py index 258285a26..54a0a3ba6 100644 --- a/annif/corpus/document.py +++ b/annif/corpus/document.py @@ -17,7 +17,7 @@ class DocumentDirectory(DocumentCorpus): """A directory of files as a full text document corpus""" - def __init__(self, path, subject_index, language, require_subjects=False): + def __init__(self, path, subject_index=None, language=None, require_subjects=False): self.path = path self.subject_index = subject_index self.language = language @@ -25,19 +25,20 @@ def __init__(self, path, subject_index, language, require_subjects=False): def __iter__(self): """Iterate through the directory, yielding tuples of (docfile, - subjectfile) containing file paths. If there is no key file and - require_subjects is False, the subjectfile will be returned as None.""" + subjectfile) containing file paths. If require_subjects is False, the + subjectfile will be returned as None.""" for filename in sorted(glob.glob(os.path.join(self.path, "*.txt"))): - tsvfilename = re.sub(r"\.txt$", ".tsv", filename) - if os.path.exists(tsvfilename): - yield (filename, tsvfilename) - continue - keyfilename = re.sub(r"\.txt$", ".key", filename) - if os.path.exists(keyfilename): - yield (filename, keyfilename) - continue - if not self.require_subjects: + if self.require_subjects: + tsvfilename = re.sub(r"\.txt$", ".tsv", filename) + if os.path.exists(tsvfilename): + yield (filename, tsvfilename) + continue + keyfilename = re.sub(r"\.txt$", ".key", filename) + if os.path.exists(keyfilename): + yield (filename, keyfilename) + continue + else: yield (filename, None) @property diff --git a/tests/conftest.py b/tests/conftest.py index fcccb268f..76378a98d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -106,7 +106,9 @@ def fulltext_corpus(subject_index): ftdir = os.path.join( os.path.dirname(__file__), "corpora", "archaeology", "fulltext" ) - ft_corpus = annif.corpus.DocumentDirectory(ftdir, subject_index, "fi") + ft_corpus = 
annif.corpus.DocumentDirectory( + ftdir, subject_index, "fi", require_subjects=True + ) return ft_corpus diff --git a/tests/test_backend.py b/tests/test_backend.py index 378cb78b8..b7e583d1c 100644 --- a/tests/test_backend.py +++ b/tests/test_backend.py @@ -32,7 +32,9 @@ def test_learn_dummy(project, tmpdir): tmpdir.join("doc1.tsv").write("\tarchaeologists") tmpdir.join("doc2.txt").write("doc2") tmpdir.join("doc2.tsv").write("\tdummy") - docdir = annif.corpus.DocumentDirectory(str(tmpdir), project.subjects, "en") + docdir = annif.corpus.DocumentDirectory( + str(tmpdir), project.subjects, "en", require_subjects=True + ) dummy.learn(docdir) diff --git a/tests/test_corpus.py b/tests/test_corpus.py index 6580477bb..6e7db5158 100644 --- a/tests/test_corpus.py +++ b/tests/test_corpus.py @@ -80,38 +80,38 @@ def test_subjectset_as_vector_destination(subject_index): assert vector is destination -def test_docdir_key(tmpdir, subject_index): +def test_docdir_key(tmpdir): tmpdir.join("doc1.txt").write("doc1") tmpdir.join("doc1.key").write("key1") tmpdir.join("doc2.txt").write("doc2") tmpdir.join("doc2.key").write("key2") tmpdir.join("doc3.txt").write("doc3") - docdir = annif.corpus.DocumentDirectory(str(tmpdir), subject_index, "en") + docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=False) files = sorted(list(docdir)) assert len(files) == 3 assert files[0][0] == str(tmpdir.join("doc1.txt")) - assert files[0][1] == str(tmpdir.join("doc1.key")) + assert files[0][1] is None assert files[1][0] == str(tmpdir.join("doc2.txt")) - assert files[1][1] == str(tmpdir.join("doc2.key")) + assert files[1][1] is None assert files[2][0] == str(tmpdir.join("doc3.txt")) assert files[2][1] is None -def test_docdir_tsv(tmpdir, subject_index): +def test_docdir_tsv(tmpdir): tmpdir.join("doc1.txt").write("doc1") tmpdir.join("doc1.tsv").write("\tkey1") tmpdir.join("doc2.txt").write("doc2") tmpdir.join("doc2.tsv").write("\tkey2") tmpdir.join("doc3.txt").write("doc3") - docdir = 
annif.corpus.DocumentDirectory(str(tmpdir), subject_index, "en") + docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=False) files = sorted(list(docdir)) assert len(files) == 3 assert files[0][0] == str(tmpdir.join("doc1.txt")) - assert files[0][1] == str(tmpdir.join("doc1.tsv")) + assert files[0][1] is None assert files[1][0] == str(tmpdir.join("doc2.txt")) - assert files[1][1] == str(tmpdir.join("doc2.tsv")) + assert files[1][1] is None assert files[2][0] == str(tmpdir.join("doc3.txt")) assert files[2][1] is None @@ -126,7 +126,9 @@ def test_docdir_tsv_bom(tmpdir, subject_index): "\trautakausi".encode("utf-8-sig") ) - docdir = annif.corpus.DocumentDirectory(str(tmpdir), subject_index, "fi") + docdir = annif.corpus.DocumentDirectory( + str(tmpdir), subject_index, "fi", require_subjects=True + ) docs = list(docdir.documents) assert docs[0].text == "doc1" assert ( diff --git a/tests/test_project.py b/tests/test_project.py index 6600d664c..a6294edb6 100644 --- a/tests/test_project.py +++ b/tests/test_project.py @@ -183,7 +183,9 @@ def test_project_learn(registry, tmpdir): tmpdir.join("doc2.tsv").write("\tdummy") project = registry.get_project("dummy-fi") - docdir = annif.corpus.DocumentDirectory(str(tmpdir), project.subjects, "en") + docdir = annif.corpus.DocumentDirectory( + str(tmpdir), project.subjects, "en", require_subjects=True + ) project.learn(docdir) result = project.suggest(["this is some text"])[0] assert len(result) == 1