diff --git a/machine/translation/thot/thot_smt_model.py b/machine/translation/thot/thot_smt_model.py index 5104162..a9f2b5f 100644 --- a/machine/translation/thot/thot_smt_model.py +++ b/machine/translation/thot/thot_smt_model.py @@ -71,6 +71,10 @@ def __init__( else: self._config_filename = Path(config) parameters = ThotSmtParameters.load(config) + if not Path(parameters.translation_model_filename_prefix + ".ttable").is_file(): + raise FileNotFoundError("The translation model could not be found.") + if not Path(parameters.language_model_filename_prefix).is_file(): + raise FileNotFoundError("The language model could not be found.") self._parameters = parameters self.source_tokenizer = source_tokenizer self.target_tokenizer = target_tokenizer diff --git a/machine/translation/thot/thot_utils.py b/machine/translation/thot/thot_utils.py index 2797660..d98943f 100644 --- a/machine/translation/thot/thot_utils.py +++ b/machine/translation/thot/thot_utils.py @@ -38,8 +38,10 @@ def load_smt_model(word_alignment_model_type: ThotWordAlignmentModelType, parame model_type = ta.AlignmentModelType.IBM4 model = tt.SmtModel(model_type) - model.load_translation_model(parameters.translation_model_filename_prefix) - model.load_language_model(parameters.language_model_filename_prefix) + if not model.load_translation_model(parameters.translation_model_filename_prefix): + raise RuntimeError("Unable to load translation model.") + if not model.load_language_model(parameters.language_model_filename_prefix): + raise RuntimeError("Unable to load language model.") model.non_monotonicity = parameters.model_non_monotonicity model.w = parameters.model_w model.a = parameters.model_a diff --git a/machine/translation/thot/thot_word_alignment_model.py b/machine/translation/thot/thot_word_alignment_model.py index d8be7e3..9ba606d 100644 --- a/machine/translation/thot/thot_word_alignment_model.py +++ b/machine/translation/thot/thot_word_alignment_model.py @@ -61,15 +61,16 @@ def load(self, prefix_filename: StrPath) -> None: prefix_filename = Path(prefix_filename) if not (prefix_filename.parent / (prefix_filename.name + ".src")).is_file(): raise FileNotFoundError("The word alignment model configuration could not be found.") - self._prefix_filename = prefix_filename self._model.clear() - self._model.load(str(prefix_filename)) + if not self._model.load(str(prefix_filename)): + raise RuntimeError("Unable to load word alignment model.") + self._prefix_filename = prefix_filename def create_new(self, prefix_filename: StrPath) -> None: if self._owned: raise RuntimeError("The word alignment model is owned by an SMT model.") - self._prefix_filename = Path(prefix_filename) self._model.clear() + self._prefix_filename = Path(prefix_filename) def save(self) -> None: if self._prefix_filename is not None: diff --git a/poetry.lock b/poetry.lock index 07a7fce..1665519 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3207,32 +3207,32 @@ testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jar [[package]] name = "sil-thot" -version = "3.4.1" +version = "3.4.2" description = "A toolkit for statistical word alignment and machine translation" optional = false python-versions = ">=3.7, <4.0" files = [ - {file = "sil-thot-3.4.1.tar.gz", hash = "sha256:419f4ebd93a87531e365c94f377792ad81529148521016d63ef2effd80336676"}, - {file = "sil_thot-3.4.1-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:b3a9ef1e28bccb1564034bad5e3067b685f8c07de90be4281153c147d8f031b0"}, - {file = "sil_thot-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67ba9d35f4768240c2b672925e8fc75aba8a4342a1cc7ce75e2155d219f734c7"}, - {file = "sil_thot-3.4.1-cp310-cp310-win32.whl", hash = "sha256:92fa6dbb89b4ecd67ca29d321221afa0708467efbc3ab0a14a677b27caea948c"}, - {file = "sil_thot-3.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:544e9cda853641741e8c09f5fcc3038307ff3490b495655e62636222adba75ed"}, - {file = "sil_thot-3.4.1-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:85253c5b9a8b2f2108907424a73ddd5029d53193bdcc8090f994fa4a4e2c19b6"}, - {file = "sil_thot-3.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cb533bd81a06833671079de3ffd386b3344b2cafc286f9e827194ca39b02e06b"}, - {file = "sil_thot-3.4.1-cp311-cp311-win32.whl", hash = "sha256:3f00f00adc567157569b5b761054717277e17268c0826cc066f289c1f59ccd9c"}, - {file = "sil_thot-3.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:dffce5bbfeffb48c66efc0eb27fd104bf7b0dd6fe9287cdad07c7fa8d14949e5"}, - {file = "sil_thot-3.4.1-cp37-cp37m-macosx_10_13_x86_64.whl", hash = "sha256:2dea53d3c2b5cca315d5ffb766c6dfd316f48c21c80b134050e0139bcd6b87a7"}, - {file = "sil_thot-3.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:81e40d067b225d8d101c5c012385c781f19594f731ff0dda32af65630be0b962"}, - {file = "sil_thot-3.4.1-cp37-cp37m-win32.whl", hash = "sha256:97046c15bc86da70d92c993cebea6aace8cd067db8ac205996916ee71c900e83"}, - {file = "sil_thot-3.4.1-cp37-cp37m-win_amd64.whl", hash = "sha256:1d6db1a975d2d79f65830b65d545f862f34b568481b535dd7464342d41f4355d"}, - {file = "sil_thot-3.4.1-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:c57b5f8f8786ed582709dec20ece7c3a9ac7aeb2201d1843485aac8223857add"}, - {file = "sil_thot-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3dbd7d937579a8591f0b78aeb2581a6e2955d84ceec6439eecd3b8426022dcbc"}, - {file = "sil_thot-3.4.1-cp38-cp38-win32.whl", hash = "sha256:be83e587456f8748fa532622614687bf87152c0385d2a808a045493baf65b3ab"}, - {file = "sil_thot-3.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:2820b23e852a1f84732aace0ffe50dd7bea44913ed2c246beaedc1c9f16516a6"}, - {file = "sil_thot-3.4.1-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:c1705f208a5593245c5826d15175974d358e869d99e55ae206c9e899e6252607"}, - {file = "sil_thot-3.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d3b95146168ed6fa82f730fc5ae733a99d6fc58d61c9de597b4c0dfef01634b"}, - {file = "sil_thot-3.4.1-cp39-cp39-win32.whl", hash = "sha256:587622bc959bddcc7fcb344588f4c872952d84c03df08ed4e7ddb914db67265f"}, - {file = "sil_thot-3.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:b854caecc326deec1ae5c96edd04191b272b2b947932b9e64797aa97d17a0ea5"}, + {file = "sil-thot-3.4.2.tar.gz", hash = "sha256:b99f5ff3c4ea5662317a9f33bcabbbaed6fe5973780ff5bf54b1189d279da582"}, + {file = "sil_thot-3.4.2-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:6655501b3a0caac286e250593f30633a90e50e448da150cb3da5f1cb2b5e5e57"}, + {file = "sil_thot-3.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:430d2fdcd40478975bb2f7cc0ef2d150c3c14271eb31c2a4b5bfa6f549b9c278"}, + {file = "sil_thot-3.4.2-cp310-cp310-win32.whl", hash = "sha256:dd8b3b19d01e891bfef9e974d96d2a37368a56c46231f292216eb42ac79ca537"}, + {file = "sil_thot-3.4.2-cp310-cp310-win_amd64.whl", hash = "sha256:4216ca050e715ca949a383e19b22ba51031e2a49dac70845b027f89f6e6833fe"}, + {file = "sil_thot-3.4.2-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:041ecbd34edb370c132fe80b8f8fd6b26ae15645f82849847f8de274c02cf5f4"}, + {file = "sil_thot-3.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ebbde7fe7e76195155d1da1fc6c67a152ec39d954b45b9f1c1d3e77ef64a524a"}, + {file = "sil_thot-3.4.2-cp311-cp311-win32.whl", hash = "sha256:82b179278b5439e9b55f40f249f9d97926e11aca398b7346092eb1362bb3ae4d"}, + {file = "sil_thot-3.4.2-cp311-cp311-win_amd64.whl", hash = "sha256:d6fac867c4507a87db002e70169894be93f7f2600413f6f389fe534a1ebc2491"}, + {file = "sil_thot-3.4.2-cp37-cp37m-macosx_10_13_x86_64.whl", hash = "sha256:e6dd402b45dfef91160f34afffd65eabe19390d5aaa354b67709a5c7564c020e"}, + {file = "sil_thot-3.4.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2afe1c092c85b0cf288d9ccbf42659393a123d01997adc2c0632bfc429b1d3"}, + {file = "sil_thot-3.4.2-cp37-cp37m-win32.whl", hash = "sha256:31abdf6323e5595568816b5eec70ffc2c258f4c91d2ab65920d2527fec0f8058"}, + {file = "sil_thot-3.4.2-cp37-cp37m-win_amd64.whl", hash = "sha256:35ce0bef912464a8d475fd314a1eca88ead221cec27af9a8e2a317f758d9962c"}, + {file = "sil_thot-3.4.2-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:4cfaa22b44eadddc834bf53008ddbf2735f33871d66c9acca59025cfb7f65abe"}, + {file = "sil_thot-3.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e27df92edc31d68b499d724bfab772ee3a073156c86414b3880c81574dccd438"}, + {file = "sil_thot-3.4.2-cp38-cp38-win32.whl", hash = "sha256:d4103734b6ac91e78cf28865edde5199c462dfa1665b67307fc9b093bb634133"}, + {file = "sil_thot-3.4.2-cp38-cp38-win_amd64.whl", hash = "sha256:12139b6185c07b5ae10c90a1387422b993aefb78a76b00d685fcd8b007459adc"}, + {file = "sil_thot-3.4.2-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:19de1f88307c58532748ca9b1cdd1d5884cdf196e39cf43db31a61fdd59c6de0"}, + {file = "sil_thot-3.4.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:768737bdacfd77e11c68c01b78aeee03ee0a11ee9f81b03cc5371d2b77b1a032"}, + {file = "sil_thot-3.4.2-cp39-cp39-win32.whl", hash = "sha256:c9a4805fc620de7d01e84629eddfa9cf3dde7be0ccb11720ffe1c90d533c4f50"}, + {file = "sil_thot-3.4.2-cp39-cp39-win_amd64.whl", hash = "sha256:ab0034c447bf34f6d93e803f5efec048411562ee265b3f5611388efb72c662ae"}, ] [package.extras] @@ -3954,4 +3954,4 @@ thot = ["sil-thot"] [metadata] lock-version = "2.0" python-versions = ">=3.8,<3.12" -content-hash = "7d53fccb68beb7d88f67c7e02522bd23ac4210c877db3e55912b95c9a129ecce" +content-hash = "ba021a99d637a6f12f63df80d4a2eceab0a71c1f8a01379bcfb19ff1da2cd934" diff --git a/pyproject.toml b/pyproject.toml index e0e1e77..ab86b13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ charset-normalizer = "^2.1.1" ### extras sentencepiece = "^0.1.95" -sil-thot = "^3.4.0" +sil-thot = "^3.4.2" # huggingface extras transformers = "^4.34.0" datasets = "^2.4.0" diff --git a/tests/corpora/test_text_file_text_corpus.py b/tests/corpora/test_text_file_text_corpus.py index 6b1c340..ef6fc0e 100644 --- a/tests/corpora/test_text_file_text_corpus.py +++ b/tests/corpora/test_text_file_text_corpus.py @@ -1,11 +1,11 @@ -import pytest +from pytest import raises from testutils.corpora_test_helpers import TEXT_TEST_PROJECT_PATH from machine.corpora import TextFileTextCorpus def test_does_not_exist() -> None: - with pytest.raises(FileNotFoundError): + with raises(FileNotFoundError): TextFileTextCorpus("does-not-exist.txt") diff --git a/tests/jobs/test_nmt_engine_build_job.py b/tests/jobs/test_nmt_engine_build_job.py index b3a1bf6..d02803a 100644 --- a/tests/jobs/test_nmt_engine_build_job.py +++ b/tests/jobs/test_nmt_engine_build_job.py @@ -3,8 +3,8 @@ from io import StringIO from typing import Iterator -import pytest from decoy import Decoy, matchers +from pytest import raises from machine.annotations import Range from machine.corpora import DictionaryTextCorpus @@ -27,7 +27,7 @@ def test_run(decoy: Decoy) -> None: def test_cancel(decoy: Decoy) -> None: env = _TestEnvironment(decoy) checker = _CancellationChecker(3) - with pytest.raises(CanceledError): + with raises(CanceledError): env.job.run(check_canceled=checker.check_canceled) assert env.target_pretranslations == "" diff --git a/tests/translation/thot/test_thot_fast_align_word_alignment_model.py b/tests/translation/thot/test_thot_fast_align_word_alignment_model.py index 21aaa84..9d1acb9 100644 --- a/tests/translation/thot/test_thot_fast_align_word_alignment_model.py +++ b/tests/translation/thot/test_thot_fast_align_word_alignment_model.py @@ -1,4 +1,7 @@ -from pytest import approx +from pathlib import Path +from tempfile import TemporaryDirectory + +from pytest import approx, raises from testutils.thot_test_helpers import TOY_CORPUS_FAST_ALIGN_PATH from machine.translation import WordAlignmentMatrix @@ -107,3 +110,11 @@ def test_get_avg_translation_score_symmetrized() -> None: matrix = model.align(source_segment, target_segment) score = model.get_avg_translation_score(source_segment, target_segment, matrix) assert score == approx(0.36, abs=0.01) + + +def test_constructor_model_corrupted() -> None: + with TemporaryDirectory() as temp_dir: + temp_dir_path = Path(temp_dir) + (temp_dir_path / "src_trg_invswm.src").write_text("corrupted", encoding="utf-8") + with raises(RuntimeError): + ThotFastAlignWordAlignmentModel(temp_dir_path / "src_trg_invswm") diff --git a/tests/translation/thot/test_thot_smt_model.py b/tests/translation/thot/test_thot_smt_model.py index 8efc3f9..bd779f2 100644 --- a/tests/translation/thot/test_thot_smt_model.py +++ b/tests/translation/thot/test_thot_smt_model.py @@ -1,6 +1,10 @@ +from pathlib import Path +from tempfile import TemporaryDirectory + +from pytest import raises from testutils.thot_test_helpers import TOY_CORPUS_FAST_ALIGN_CONFIG_FILENAME, TOY_CORPUS_HMM_CONFIG_FILENAME -from machine.translation.thot import ThotSmtModel, ThotWordAlignmentModelType +from machine.translation.thot import ThotSmtModel, ThotSmtParameters, ThotWordAlignmentModelType def test_translate_target_segment_hmm() -> None: @@ -95,6 +99,35 @@ def test_get_word_graph_empty_segment_fast_align() -> None: assert word_graph.is_empty +def test_constructor_model_not_found() -> None: + with raises(FileNotFoundError): + ThotSmtModel( + ThotWordAlignmentModelType.HMM, + ThotSmtParameters( + translation_model_filename_prefix="does-not-exist", language_model_filename_prefix="does-not-exist" + ), + ) + + +def test_constructor_model_corrupted() -> None: + with TemporaryDirectory() as temp_dir: + temp_dir_path = Path(temp_dir) + tm_dir_path = temp_dir_path / "tm" + tm_dir_path.mkdir() + (tm_dir_path / "src_trg.ttable").write_text("corrupted", encoding="utf-8") + lm_dir_path = temp_dir_path / "lm" + lm_dir_path.mkdir() + (lm_dir_path / "trg.lm").write_text("corrupted", encoding="utf-8") + with raises(RuntimeError): + ThotSmtModel( + ThotWordAlignmentModelType.HMM, + ThotSmtParameters( + translation_model_filename_prefix=str(tm_dir_path / "src_trg"), + language_model_filename_prefix=str(lm_dir_path / "trg.lm"), + ), + ) + + def _create_hmm_model() -> ThotSmtModel: return ThotSmtModel(ThotWordAlignmentModelType.HMM, TOY_CORPUS_HMM_CONFIG_FILENAME)