From d48322735df64c69fb8aaf7bc1ee269dcf2ba63f Mon Sep 17 00:00:00 2001 From: Benjamin Kiessling Date: Wed, 19 Jun 2024 19:56:19 +0200 Subject: [PATCH] Rebuild binary dataset alphabet when selecting any alphabet-changing transformation Fixes #616. --- kraken/lib/train.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kraken/lib/train.py b/kraken/lib/train.py index 1e5a41e80..e7506658c 100644 --- a/kraken/lib/train.py +++ b/kraken/lib/train.py @@ -457,8 +457,10 @@ def _build_dataset(self, dataset.add(**sample) except KrakenInputException as e: logger.warning(str(e)) - if self.format_type == 'binary' and self.hparams.hyper_params['normalization']: - logger.debug('Rebuilding dataset using unicode normalization') + if self.format_type == 'binary' and (self.hparams.hyper_params['normalization'] or + self.hparams.hyper_params['normalize_whitespace'] or + self.reorder): + logger.debug('Text transformations modifying alphabet selected. Rebuilding alphabet') dataset.rebuild_alphabet() return dataset