diff --git a/nemo_text_processing/text_normalization/data/roman/__init__.py b/nemo_text_processing/text_normalization/data/roman/__init__.py new file mode 100644 index 000000000000..bc443be41c4c --- /dev/null +++ b/nemo_text_processing/text_normalization/data/roman/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nemo_text_processing/text_normalization/data/whitelist.tsv b/nemo_text_processing/text_normalization/data/whitelist.tsv index d3a54dbf2453..853993190452 100644 --- a/nemo_text_processing/text_normalization/data/whitelist.tsv +++ b/nemo_text_processing/text_normalization/data/whitelist.tsv @@ -1,7 +1,6 @@ Ph.D. p h d Hon. honorable & and -&Co. and Mt. Mount Maj. Major Rev. Reverend diff --git a/nemo_text_processing/text_normalization/data/whitelist_alternatives.tsv b/nemo_text_processing/text_normalization/data/whitelist_alternatives.tsv index d3c878e34b43..a25f181bae42 100644 --- a/nemo_text_processing/text_normalization/data/whitelist_alternatives.tsv +++ b/nemo_text_processing/text_normalization/data/whitelist_alternatives.tsv @@ -11,4 +11,5 @@ Mrs. Misses Ms. Miss Mr Mister Mrs Misses -Ms Miss \ No newline at end of file +Ms Miss +&Co. and Co. \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/data_loader_utils.py b/nemo_text_processing/text_normalization/data_loader_utils.py index 93413ee33650..1aaac7a76246 100644 --- a/nemo_text_processing/text_normalization/data_loader_utils.py +++ b/nemo_text_processing/text_normalization/data_loader_utils.py @@ -242,6 +242,7 @@ def post_process_punctuation(text: str) -> str: .replace('“', '"') .replace("‘", "'") .replace('`', "'") + .replace('- -', "--") ) for punct in "!,.:;?": diff --git a/nemo_text_processing/text_normalization/normalize_with_audio.py b/nemo_text_processing/text_normalization/normalize_with_audio.py index 5a19851feba0..ca3b7568e5e4 100644 --- a/nemo_text_processing/text_normalization/normalize_with_audio.py +++ b/nemo_text_processing/text_normalization/normalize_with_audio.py @@ -118,7 +118,6 @@ def normalize( normalized_texts = [] for tagged_text in tagged_texts: self._verbalize(tagged_text, normalized_texts) - if len(normalized_texts) == 0: raise ValueError() if punct_post_process: @@ -193,8 +192,7 @@ def calculate_cer(normalized_texts: List[str], transcript: str, remove_punct=Fal text_clean = text.replace('-', ' ').lower() if remove_punct: for punct in "!?:;,.-()*+-/<=>@^_": - text_clean = text_clean.replace(punct, " ") - text_clean = re.sub(r' +', ' ', text_clean) + text_clean = text_clean.replace(punct, "") cer = round(word_error_rate([transcript], [text_clean], use_cer=True) * 100, 2) normalized_options.append((text, cer)) return normalized_options diff --git a/nemo_text_processing/text_normalization/taggers/fraction.py b/nemo_text_processing/text_normalization/taggers/fraction.py index 2fe93ca3ca0d..da9a86ec8538 100644 --- a/nemo_text_processing/text_normalization/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/taggers/fraction.py @@ -28,7 +28,7 @@ class FractionFst(GraphFst): """ Finite state transducer for classifying fraction "23 4/5" -> - tokens { fraction { numerator: "four" denominator: "five" } } + tokens { fraction { integer: "twenty three" numerator: "four" denominator: "five" } } Args: deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/verbalizers/fraction.py b/nemo_text_processing/text_normalization/verbalizers/fraction.py index e654202b3a8b..50c32c20226a 100644 --- a/nemo_text_processing/text_normalization/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/verbalizers/fraction.py @@ -28,6 +28,8 @@ class FractionFst(GraphFst): """ Finite state transducer for verbalizing fraction + e.g. tokens { fraction { integer: "twenty three" numerator: "four" denominator: "five" } } -> + twenty three four fifth Args: deterministic: if True will provide a single transduction option, diff --git a/nemo_text_processing/text_normalization/verbalizers/roman.py b/nemo_text_processing/text_normalization/verbalizers/roman.py index 2528632cc1ca..bb42f3c52294 100644 --- a/nemo_text_processing/text_normalization/verbalizers/roman.py +++ b/nemo_text_processing/text_normalization/verbalizers/roman.py @@ -27,12 +27,12 @@ class RomanFst(GraphFst): """ - Finite state transducer for verbalizing electronic - e.g. tokens { electronic { username: "cdf1" domain: "abc.edu" } } -> c d f one at a b c dot e d u + Finite state transducer for verbalizing roman numerals + e.g. tokens { roman { integer: "one" } } -> one Args: deterministic: if True will provide a single transduction option, - for False multiple transduction are generated (used for audio-based normalization) + for False multiple transduction are generated (used for audio-based normalization) """ def __init__(self, deterministic: bool = True): diff --git a/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt b/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt index 1314582597c1..436bff19205f 100644 --- a/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt +++ b/tests/nemo_text_processing/data_text_normalization/test_cases_normalize_with_audio.txt @@ -49,7 +49,7 @@ It seemed to her that the jacket Oswald wore was darker than Commission Exhibit It seemed to her that the jacket Oswald wore was darker than Commission Exhibit number one hundred and sixty two. It seemed to her that the jacket Oswald wore was darker than Commission Exhibit number one six two. ~"Father, let this cup pass." He prayed--was heard. What cup was it that passed away from him? Sure not the death-cup, now filled to the brim! There was no quailing in the awful word; He still was king of kings, of lords the lord:-- He feared lest, in the suffering waste and grim, His faith might grow too faint and sickly dim." -"Father, let this cup pass." He prayed--was heard. What cup was it that passed away from him? Sure not the death-cup, now filled to the brim! There was no quailing in the awful word; He still was king of kings, of lords the lord:-- He feared lest, in the suffering waste and grim, His faith might grow too faint and sickly dim." +"Father, let this cup pass." He prayed -- was heard. What cup was it that passed away from him? Sure not the death-cup, now filled to the brim! There was no quailing in the awful word; He still was king of kings, of lords the lord: -- He feared lest, in the suffering waste and grim, His faith might grow too faint and sickly dim." ~1970-2010 nineteen seventy to twenty ten one thousand nine seventy to two thousand ten @@ -104,4 +104,4 @@ twenty five.] ~Francis I--test Francis the first -- test Francis one -- test -Francis first --test \ No newline at end of file +Francis first -- test \ No newline at end of file