From 42a5f20201f4534c145d89b7f54cf9010e203d05 Mon Sep 17 00:00:00 2001 From: Vahid Noroozi Date: Tue, 4 Feb 2020 15:08:26 -0800 Subject: [PATCH] nlp refactoring (#316) Signed-off-by: VahidooX Co-authored-by: Yang Zhang Co-authored-by: Tomasz Kornuta <56979727+tkornuta-nvidia@users.noreply.github.com> Co-authored-by: Evelina <10428420+ekmb@users.noreply.github.com> --- CHANGELOG.md | 4 + Jenkinsfile | 2 +- .../2_Online_ASR_Microphone_Demo.ipynb | 1 - examples/nlp/BERTPretrainingTutorial.ipynb | 26 +- examples/nlp/NERWithBERT.ipynb | 27 +- examples/nlp/PunctuationWithBERT.ipynb | 38 +- examples/nlp/asr_postprocessor.py | 47 +- examples/nlp/bert_pretraining.py | 107 +- ...th_BERT.py => glue_benchmark_with_bert.py} | 124 +- examples/nlp/joint_intent_slot_infer.py | 56 +- examples/nlp/joint_intent_slot_infer_b1.py | 40 +- examples/nlp/joint_intent_slot_with_bert.py | 76 +- ...lm.py => language_modeling_transformer.py} | 43 +- ...ial.py => machine_translation_tutorial.py} | 48 +- examples/nlp/punctuation_capitalization.py | 126 +- .../nlp/punctuation_capitalization_infer.py | 58 +- .../{squad.py => question_answering_squad.py} | 105 +- ...b_format_to_token_classification_format.py | 15 +- .../nlp/scripts/get_squad.py | 18 +- .../nlp/scripts/get_tatoeba.py | 31 +- examples/nlp/scripts/process_wiki_zh.py | 13 +- ...rt.py => text_classification_with_bert.py} | 61 +- examples/nlp/token_classification.py | 124 +- examples/nlp/token_classification_infer.py | 45 +- examples/tts/tacotron2.py | 2 +- nemo/backends/pytorch/nm.py | 2 +- nemo/collections/nlp/__init__.py | 12 +- nemo/collections/nlp/callbacks/__init__.py | 25 + .../glue_benchmark_callback.py} | 37 +- .../joint_intent_slot_callback.py} | 74 +- .../lm_bert_callback.py} | 25 +- .../nlp/callbacks/lm_transformer_callback.py | 46 + .../machine_translation_callback.py} | 46 +- .../punctuation_capitalization_callback.py} | 45 +- .../qa_squad_callback.py} | 32 +- .../callbacks/text_classification_callback.py | 68 + .../token_classification_callback.py} | 34 +- nemo/collections/nlp/data/__init__.py | 21 +- nemo/collections/nlp/data/data_layers.py | 1128 ----------- .../collections/nlp/data/datasets/__init__.py | 46 +- .../nlp/data/datasets/datasets_utils.py | 988 ++++++++++ nemo/collections/nlp/data/datasets/glue.py | 229 --- .../data/datasets/glue_benchmark_dataset.py | 593 ++++++ ...t_slot.py => joint_intent_slot_dataset.py} | 167 +- .../nlp/data/datasets/language_modeling.py | 40 - ...bert_pretraining.py => lm_bert_dataset.py} | 42 +- .../data/datasets/lm_transformer_dataset.py | 187 ++ ...tion.py => machine_translation_dataset.py} | 46 +- ... 
=> punctuation_capitalization_dataset.py} | 42 +- .../{squad.py => qa_squad_dataset.py} | 131 +- ...tion.py => text_classification_dataset.py} | 121 +- ...ion.py => token_classification_dataset.py} | 60 +- nemo/collections/nlp/data/datasets/utils.py | 1681 ----------------- .../nlp/data/tokenizers/__init__.py | 28 +- .../nlp/data/tokenizers/bert_tokenizer.py | 20 +- .../nlp/data/tokenizers/char_tokenizer.py | 20 +- .../tokenizers}/fairseq_tokenizer.py | 18 + .../nlp/data/tokenizers/gpt2_tokenizer.py | 20 +- ...okenizer.py => sentencepiece_tokenizer.py} | 20 +- .../nlp/data/tokenizers/tokenizer_spec.py | 18 + .../nlp/data/tokenizers/word_tokenizer.py | 20 +- ...tokenizer.py => youtokentome_tokenizer.py} | 20 +- nemo/collections/nlp/data/utils.py | 125 -- nemo/collections/nlp/huggingface/__init__.py | 1 - nemo/collections/nlp/metrics/__init__.py | 17 + .../nlp/{utils => }/metrics/bleu.py | 21 +- .../nlp/{utils => }/metrics/sacrebleu.py | 554 +++--- .../nlp/{utils => }/metrics/squad_metrics.py | 118 +- nemo/collections/nlp/modules/__init__.py | 3 - nemo/collections/nlp/modules/classifiers.py | 363 ---- nemo/collections/nlp/modules/losses.py | 422 ----- nemo/collections/nlp/nm/__init__.py | 19 + .../nlp/nm/data_layers/__init__.py | 26 + .../data_layers/glue_benchmark_datalayer.py | 152 ++ .../joint_intent_slot_datalayer.py | 177 ++ .../nlp/nm/data_layers/lm_bert_datalayer.py | 225 +++ .../data_layers/lm_transformer_datalayer.py | 72 + .../machine_translation_datalayer.py | 137 ++ .../punctuation_capitalization_datalayer.py | 106 ++ .../nlp/nm/data_layers/qa_squad_datalayer.py | 108 ++ .../text_classification_datalayer.py | 83 + .../nlp/nm/data_layers/text_datalayer.py | 47 + .../token_classification_datalayer.py | 143 ++ nemo/collections/nlp/nm/losses/__init__.py | 23 + .../nlp/nm/losses/aggregator_loss.py | 61 + .../nlp/nm/losses/joint_intent_slot_loss.py | 127 ++ .../losses/masked_language_modeling_loss.py | 74 + .../padded_smoothed_cross_entropy_loss.py | 77 + .../nlp/nm/losses/qa_squad_loss.py | 107 ++ .../losses/smoothed_cross_entropy_loss.py} | 18 +- .../nm/losses/token_classification_loss.py | 88 + .../collections/nlp/nm/trainables/__init__.py | 18 + .../nlp/nm/trainables/common/__init__.py | 21 + .../trainables/common/huggingface/__init__.py | 17 + .../trainables/common/huggingface/bert_nm.py} | 24 +- .../common/sequence_classification_nm.py | 85 + .../common/sequence_regression_nm.py | 79 + .../common/token_classification_nm.py | 171 ++ .../trainables/common/transformer/__init__.py | 17 + .../transformer/transformer_decoders.py} | 25 +- .../transformer/transformer_encoders.py} | 27 +- .../transformer/transformer_generators.py} | 26 +- .../transformer/transformer_modules.py} | 46 +- .../common/transformer}/transformer_nm.py | 32 +- .../common/transformer/transformer_utils.py} | 0 .../trainables/joint_intent_slot/__init__.py | 17 + .../joint_intent_slot/joint_intent_slot_nm.py | 95 + nemo/collections/nlp/transformer/__init__.py | 5 - nemo/collections/nlp/utils/__init__.py | 4 +- nemo/collections/nlp/utils/callback_utils.py | 97 + .../nlp/utils/callbacks/__init__.py | 0 .../nlp/utils/callbacks/language_modeling.py | 32 - .../callbacks/sentence_classification.py | 69 - .../collections/nlp/utils/common_nlp_utils.py | 144 ++ nemo/collections/nlp/utils/loss_utils.py | 42 + .../collections/nlp/utils/metrics/__init__.py | 0 nemo/collections/nlp/utils/nlp_utils.py | 123 -- tests/nlp/test_bert.py | 2 +- tests/nlp/test_spc_tokenizer.py | 2 +- tests/nlp/test_squad.py | 93 +- 
tests/test_deploy_export.py | 27 +- tests/test_infer.py | 8 +- tests/test_neural_types.py | 42 +- 123 files changed, 6616 insertions(+), 5657 deletions(-) rename examples/nlp/{glue_with_BERT.py => glue_benchmark_with_bert.py} (72%) rename examples/nlp/{transformer_lm.py => language_modeling_transformer.py} (74%) rename examples/nlp/{nmt_tutorial.py => machine_translation_tutorial.py} (84%) rename examples/nlp/{squad.py => question_answering_squad.py} (80%) rename {scripts => examples/nlp/scripts}/convert_iob_format_to_token_classification_format.py (83%) rename nemo/collections/nlp/utils/download_squad.py => examples/nlp/scripts/get_squad.py (79%) rename scripts/get_tatoeba_data.py => examples/nlp/scripts/get_tatoeba.py (89%) rename examples/nlp/{sentence_classification_with_bert.py => text_classification_with_bert.py} (72%) create mode 100644 nemo/collections/nlp/callbacks/__init__.py rename nemo/collections/nlp/{utils/callbacks/glue.py => callbacks/glue_benchmark_callback.py} (80%) rename nemo/collections/nlp/{utils/callbacks/joint_intent_slot.py => callbacks/joint_intent_slot_callback.py} (63%) rename nemo/collections/nlp/{utils/callbacks/bert_pretraining.py => callbacks/lm_bert_callback.py} (52%) create mode 100644 nemo/collections/nlp/callbacks/lm_transformer_callback.py rename nemo/collections/nlp/{utils/callbacks/translation.py => callbacks/machine_translation_callback.py} (64%) rename nemo/collections/nlp/{utils/callbacks/punctuation_capitalization.py => callbacks/punctuation_capitalization_callback.py} (70%) rename nemo/collections/nlp/{utils/callbacks/squad.py => callbacks/qa_squad_callback.py} (67%) create mode 100644 nemo/collections/nlp/callbacks/text_classification_callback.py rename nemo/collections/nlp/{utils/callbacks/token_classification.py => callbacks/token_classification_callback.py} (64%) delete mode 100644 nemo/collections/nlp/data/data_layers.py create mode 100644 nemo/collections/nlp/data/datasets/datasets_utils.py delete mode 100644 nemo/collections/nlp/data/datasets/glue.py create mode 100644 nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py rename nemo/collections/nlp/data/datasets/{joint_intent_slot.py => joint_intent_slot_dataset.py} (54%) delete mode 100644 nemo/collections/nlp/data/datasets/language_modeling.py rename nemo/collections/nlp/data/datasets/{bert_pretraining.py => lm_bert_dataset.py} (91%) create mode 100644 nemo/collections/nlp/data/datasets/lm_transformer_dataset.py rename nemo/collections/nlp/data/datasets/{translation.py => machine_translation_dataset.py} (77%) rename nemo/collections/nlp/data/datasets/{punctuation_capitalization.py => punctuation_capitalization_dataset.py} (91%) rename nemo/collections/nlp/data/datasets/{squad.py => qa_squad_dataset.py} (88%) rename nemo/collections/nlp/data/datasets/{sentence_classification.py => text_classification_dataset.py} (54%) rename nemo/collections/nlp/data/datasets/{token_classification.py => token_classification_dataset.py} (88%) delete mode 100644 nemo/collections/nlp/data/datasets/utils.py rename nemo/collections/nlp/{utils/metrics => data/tokenizers}/fairseq_tokenizer.py (79%) rename nemo/collections/nlp/data/tokenizers/{spc_tokenizer.py => sentencepiece_tokenizer.py} (79%) rename nemo/collections/nlp/data/tokenizers/{yttm_tokenizer.py => youtokentome_tokenizer.py} (58%) delete mode 100644 nemo/collections/nlp/data/utils.py delete mode 100644 nemo/collections/nlp/huggingface/__init__.py create mode 100644 nemo/collections/nlp/metrics/__init__.py rename nemo/collections/nlp/{utils => 
}/metrics/bleu.py (88%) rename nemo/collections/nlp/{utils => }/metrics/sacrebleu.py (90%) rename nemo/collections/nlp/{utils => }/metrics/squad_metrics.py (85%) delete mode 100644 nemo/collections/nlp/modules/__init__.py delete mode 100644 nemo/collections/nlp/modules/classifiers.py delete mode 100644 nemo/collections/nlp/modules/losses.py create mode 100644 nemo/collections/nlp/nm/__init__.py create mode 100644 nemo/collections/nlp/nm/data_layers/__init__.py create mode 100644 nemo/collections/nlp/nm/data_layers/glue_benchmark_datalayer.py create mode 100644 nemo/collections/nlp/nm/data_layers/joint_intent_slot_datalayer.py create mode 100644 nemo/collections/nlp/nm/data_layers/lm_bert_datalayer.py create mode 100644 nemo/collections/nlp/nm/data_layers/lm_transformer_datalayer.py create mode 100644 nemo/collections/nlp/nm/data_layers/machine_translation_datalayer.py create mode 100644 nemo/collections/nlp/nm/data_layers/punctuation_capitalization_datalayer.py create mode 100644 nemo/collections/nlp/nm/data_layers/qa_squad_datalayer.py create mode 100644 nemo/collections/nlp/nm/data_layers/text_classification_datalayer.py create mode 100644 nemo/collections/nlp/nm/data_layers/text_datalayer.py create mode 100644 nemo/collections/nlp/nm/data_layers/token_classification_datalayer.py create mode 100644 nemo/collections/nlp/nm/losses/__init__.py create mode 100644 nemo/collections/nlp/nm/losses/aggregator_loss.py create mode 100644 nemo/collections/nlp/nm/losses/joint_intent_slot_loss.py create mode 100644 nemo/collections/nlp/nm/losses/masked_language_modeling_loss.py create mode 100644 nemo/collections/nlp/nm/losses/padded_smoothed_cross_entropy_loss.py create mode 100644 nemo/collections/nlp/nm/losses/qa_squad_loss.py rename nemo/collections/nlp/{modules/pytorch_utils.py => nm/losses/smoothed_cross_entropy_loss.py} (72%) create mode 100644 nemo/collections/nlp/nm/losses/token_classification_loss.py create mode 100644 nemo/collections/nlp/nm/trainables/__init__.py create mode 100644 nemo/collections/nlp/nm/trainables/common/__init__.py create mode 100644 nemo/collections/nlp/nm/trainables/common/huggingface/__init__.py rename nemo/collections/nlp/{huggingface/bert.py => nm/trainables/common/huggingface/bert_nm.py} (84%) create mode 100644 nemo/collections/nlp/nm/trainables/common/sequence_classification_nm.py create mode 100644 nemo/collections/nlp/nm/trainables/common/sequence_regression_nm.py create mode 100644 nemo/collections/nlp/nm/trainables/common/token_classification_nm.py create mode 100644 nemo/collections/nlp/nm/trainables/common/transformer/__init__.py rename nemo/collections/nlp/{transformer/decoders.py => nm/trainables/common/transformer/transformer_decoders.py} (86%) rename nemo/collections/nlp/{transformer/encoders.py => nm/trainables/common/transformer/transformer_encoders.py} (91%) rename nemo/collections/nlp/{transformer/generators.py => nm/trainables/common/transformer/transformer_generators.py} (95%) rename nemo/collections/nlp/{transformer/modules.py => nm/trainables/common/transformer/transformer_modules.py} (92%) rename nemo/collections/nlp/{modules => nm/trainables/common/transformer}/transformer_nm.py (92%) rename nemo/collections/nlp/{transformer/utils.py => nm/trainables/common/transformer/transformer_utils.py} (100%) create mode 100644 nemo/collections/nlp/nm/trainables/joint_intent_slot/__init__.py create mode 100644 nemo/collections/nlp/nm/trainables/joint_intent_slot/joint_intent_slot_nm.py delete mode 100644 nemo/collections/nlp/transformer/__init__.py 
create mode 100644 nemo/collections/nlp/utils/callback_utils.py delete mode 100644 nemo/collections/nlp/utils/callbacks/__init__.py delete mode 100644 nemo/collections/nlp/utils/callbacks/language_modeling.py delete mode 100644 nemo/collections/nlp/utils/callbacks/sentence_classification.py create mode 100644 nemo/collections/nlp/utils/common_nlp_utils.py create mode 100644 nemo/collections/nlp/utils/loss_utils.py delete mode 100644 nemo/collections/nlp/utils/metrics/__init__.py delete mode 100644 nemo/collections/nlp/utils/nlp_utils.py diff --git a/CHANGELOG.md b/CHANGELOG.md index b0c3a818beb4..9936424681c0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -84,6 +84,10 @@ To release a new version, please update the changelog as followed: ([PR #286](https://github.com/NVIDIA/NeMo/pull/286)) - @stasbel - Major cleanup of Neural Module constructors (init), aiming at increasing the framework robustness: cleanup of NeuralModule initialization logic, refactor of trainer/actions (getting rid of local_params), fixes of several examples and unit tests, extraction and storing of intial parameters (init_params). ([PR #309](https://github.com/NVIDIA/NeMo/pull/309)) - @tkornuta-nvidia +- Refactoring of `nemo_nlp` collections: +([PR #316](https://github.com/NVIDIA/NeMo/pull/316)) - @VahidooX, @yzhang123, @ekmb + - renaming of files and restructuring of folder in `nemo_nlp` + - Updated licenses ### Dependencies Update diff --git a/Jenkinsfile b/Jenkinsfile index c43f67d59ca5..d0d2b0eaa5b1 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -60,7 +60,7 @@ pipeline { } stage ('NMT test') { steps { - sh 'cd examples/nlp && CUDA_VISIBLE_DEVICES=0 python nmt_tutorial.py' + sh 'cd examples/nlp && CUDA_VISIBLE_DEVICES=0 python machine_translation_tutorial.py' } } } diff --git a/examples/asr/notebooks/2_Online_ASR_Microphone_Demo.ipynb b/examples/asr/notebooks/2_Online_ASR_Microphone_Demo.ipynb index 4a842b3a4365..0a4a5842f0b8 100644 --- a/examples/asr/notebooks/2_Online_ASR_Microphone_Demo.ipynb +++ b/examples/asr/notebooks/2_Online_ASR_Microphone_Demo.ipynb @@ -173,7 +173,6 @@ "data_layer = AudioDataLayer()\n", "\n", "data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(\n", - " factory=neural_factory,\n", " **model_definition['AudioToMelSpectrogramPreprocessor'])\n", "\n", "jasper_encoder = nemo_asr.JasperEncoder(\n", diff --git a/examples/nlp/BERTPretrainingTutorial.ipynb b/examples/nlp/BERTPretrainingTutorial.ipynb index 6c62a495db50..f33887452dbc 100644 --- a/examples/nlp/BERTPretrainingTutorial.ipynb +++ b/examples/nlp/BERTPretrainingTutorial.ipynb @@ -58,8 +58,8 @@ "from nemo.utils.lr_policies import CosineAnnealing\n", "\n", "import nemo.collections.nlp as nemo_nlp\n", - "from nemo.collections.nlp import NemoBertTokenizer, SentencePieceTokenizer\n", - "from nemo.collections.nlp.utils.callbacks.bert_pretraining import eval_iter_callback, \\\n", + "from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer\n", + "from nemo.collections.nlp.callbacks.lm_bert_callback import eval_iter_callback, \\\n", " eval_epochs_done_callback\n", "\n", "BATCHES_PER_STEP = 1\n", @@ -126,7 +126,7 @@ "metadata": {}, "outputs": [], "source": [ - "bert_model = nemo_nlp.huggingface.BERT(\n", + "bert_model = nemo_nlp.nm.trainables.huggingface.BERT(\n", " vocab_size=tokenizer.vocab_size,\n", " num_hidden_layers=NUM_LAYERS,\n", " hidden_size=D_MODEL,\n", @@ -144,21 +144,21 @@ "outputs": [], "source": [ "# Masked Language Modeling Loss\n", - "mlm_classifier = 
nemo_nlp.BertTokenClassifier(D_MODEL,\n", + "mlm_classifier = nemo_nlp.nm.trainables.BertTokenClassifier(D_MODEL,\n", " num_classes=tokenizer.vocab_size,\n", " activation=HIDDEN_ACT,\n", " log_softmax=True)\n", - "mlm_loss = nemo_nlp.MaskedLanguageModelingLossNM()\n", + "mlm_loss = nemo_nlp.nm.losses.MaskedLanguageModelingLossNM()\n", "\n", "# Next Sentence Prediciton Loss\n", - "nsp_classifier = nemo_nlp.SequenceClassifier(D_MODEL,\n", + "nsp_classifier = nemo_nlp.nm.trainables.SequenceClassifier(D_MODEL,\n", " num_classes=2,\n", " num_layers=2,\n", " activation='tanh',\n", " log_softmax=False)\n", "nsp_loss = nemo.backends.pytorch.common.CrossEntropyLoss()\n", "\n", - "bert_loss = nemo_nlp.LossAggregatorNM(num_inputs=2)" + "bert_loss = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2)" ] }, { @@ -167,7 +167,8 @@ "metadata": {}, "outputs": [], "source": [ - "train_data_layer = nemo_nlp.BertPretrainingDataLayer(\n", + "import os\n", + "train_data_layer = nemo_nlp.nm.data_layers.BertPretrainingDataLayer(\n", " tokenizer=tokenizer,\n", " dataset=os.path.join(\"data/lm/wikitext-2\", \"train.txt\"),\n", " max_seq_length=MAX_SEQ_LENGTH,\n", @@ -175,7 +176,7 @@ " batch_size=BATCH_SIZE\n", ")\n", "\n", - "eval_data_layer = nemo_nlp.BertPretrainingDataLayer(\n", + "eval_data_layer = nemo_nlp.nm.data_layers.BertPretrainingDataLayer(\n", " tokenizer=tokenizer,\n", " dataset=os.path.join(\"data/lm/wikitext-2\", \"valid.txt\"),\n", " max_seq_length=MAX_SEQ_LENGTH,\n", @@ -282,6 +283,13 @@ " \"grad_norm_clip\": None\n", " })" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/examples/nlp/NERWithBERT.ipynb b/examples/nlp/NERWithBERT.ipynb index 19cf18f8389b..9d993fd4c3a8 100644 --- a/examples/nlp/NERWithBERT.ipynb +++ b/examples/nlp/NERWithBERT.ipynb @@ -13,16 +13,18 @@ "from nemo.utils.lr_policies import WarmupAnnealing\n", "\n", "import nemo.collections.nlp as nemo_nlp\n", - "from nemo.collections.nlp import NemoBertTokenizer, SentencePieceTokenizer\n", - "from nemo.collections.nlp.utils.callbacks.token_classification import \\\n", - " eval_iter_callback, eval_epochs_done_callback" + "from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer\n", + "from nemo.collections.nlp.callbacks.token_classification_callback import \\\n", + " eval_iter_callback, eval_epochs_done_callback\n", + "from nemo.collections.nlp.nm.losses import TokenClassificationLoss\n", + "from nemo.collections.nlp.nm.trainables import TokenClassifier" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "You can download data from [here](https://github.com/kyzhouhzau/BERT-NER/tree/master/data) and use [this](https://github.com/NVIDIA/NeMo/blob/master/scripts/convert_iob_format_to_token_classification_format.py) script to preprocess it." + "You can download data from [here](https://github.com/kyzhouhzau/BERT-NER/tree/master/data) and use [this](https://github.com/NVIDIA/NeMo/blob/master/nemo/collections/nlp/data/scripts/convert_iob_format_to_token_classification_format.py) script to preprocess it." ] }, { @@ -78,7 +80,7 @@ "# If you're using a standard BERT model, you should do it like this. 
To see the full\n", "# list of BERT model names, check out nemo_nlp.huggingface.BERT.list_pretrained_models()\n", "tokenizer = NemoBertTokenizer(pretrained_model=\"bert-base-cased\")\n", - "bert_model = nemo_nlp.huggingface.BERT(\n", + "bert_model = nemo_nlp.nm.trainables.huggingface.BERT(\n", " pretrained_model_name=\"bert-base-cased\")" ] }, @@ -89,7 +91,7 @@ "outputs": [], "source": [ "# Describe training DAG\n", - "train_data_layer = nemo_nlp.BertTokenClassificationDataLayer(\n", + "train_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationDataLayer(\n", " tokenizer=tokenizer,\n", " text_file=os.path.join(DATA_DIR, 'text_train.txt'),\n", " label_file=os.path.join(DATA_DIR, 'labels_train.txt'),\n", @@ -99,13 +101,12 @@ "label_ids = train_data_layer.dataset.label_ids\n", "num_classes = len(label_ids)\n", "\n", - "ner_classifier = nemo_nlp.TokenClassifier(hidden_size=bert_model.hidden_size,\n", + "hidden_size = bert_model.hidden_size\n", + "ner_classifier = TokenClassifier(hidden_size=hidden_size,\n", " num_classes=num_classes,\n", " dropout=CLASSIFICATION_DROPOUT)\n", "\n", - "ner_loss = nemo_nlp.TokenClassificationLoss(d_model=hidden_size,\n", - " num_classes=len(label_ids),\n", - " dropout=CLASSIFICATION_DROPOUT)\n", + "ner_loss = TokenClassificationLoss(num_classes=len(label_ids))\n", "\n", "input_ids, input_type_ids, input_mask, loss_mask, _, labels = train_data_layer()\n", "\n", @@ -124,7 +125,7 @@ "outputs": [], "source": [ "# Describe evaluation DAG\n", - "eval_data_layer = nemo_nlp.BertTokenClassificationDataLayer(\n", + "eval_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationDataLayer(\n", " tokenizer=tokenizer,\n", " text_file=os.path.join(DATA_DIR, 'text_dev.txt'),\n", " label_file=os.path.join(DATA_DIR, 'labels_dev.txt'),\n", @@ -203,9 +204,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.7.4 64-bit", + "display_name": "Python 3", "language": "python", - "name": "python37464bitc56e562f54084a24b5afed5459c99218" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/examples/nlp/PunctuationWithBERT.ipynb b/examples/nlp/PunctuationWithBERT.ipynb index 58d0f57f8edb..de8c77eec5bc 100644 --- a/examples/nlp/PunctuationWithBERT.ipynb +++ b/examples/nlp/PunctuationWithBERT.ipynb @@ -11,12 +11,15 @@ "import os\n", "\n", "import nemo\n", + "from nemo import logging\n", "from nemo.utils.lr_policies import WarmupAnnealing\n", "\n", "import nemo.collections.nlp as nemo_nlp\n", - "from nemo.collections.nlp import NemoBertTokenizer, TokenClassifier, TokenClassificationLoss\n", - "from nemo.collections.nlp.data.datasets import utils\n", - "from nemo.collections.nlp.utils.callbacks.punctuation_capitalization import eval_iter_callback, eval_epochs_done_callback\n", + "from nemo.collections.nlp.data import NemoBertTokenizer\n", + "from nemo.collections.nlp.nm.trainables import TokenClassifier\n", + "from nemo.collections.nlp.nm.losses import TokenClassificationLoss, LossAggregatorNM\n", + "from nemo.collections.nlp.callbacks.punctuation_capitalization_callback import eval_iter_callback, eval_epochs_done_callback\n", + "from nemo.collections.nlp.utils.common_nlp_utils import calc_class_weights\n", "\n", "DATA_DIR = \"PATH_TO_WHERE_THE_DATA_IS\"\n", "WORK_DIR = \"PATH_TO_WHERE_TO_STORE_CHECKPOINTS_AND_LOGS\"\n", @@ -47,7 +50,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In this notebook we're going to use a subset of English examples from the [Tatoeba collection of sentences](https://tatoeba.org/eng), set NUM_SAMPLES=-1 and 
consider including other datasets to improve the performance of the model. Use [NeMo/scripts/get_tatoeba_data.py](https://github.com/NVIDIA/NeMo/blob/master/scripts/get_tatoeba_data.py) to download and preprocess the Tatoeba data." + "In this notebook we're going to use a subset of English examples from the [Tatoeba collection of sentences](https://tatoeba.org/eng), set NUM_SAMPLES=-1 and consider including other datasets to improve the performance of the model. Use [NeMo/nemo/collections/nlp/data/scripts/get_tatoeba_data.py](https://github.com/NVIDIA/NeMo/blob/master/nemo/collections/nlp/data/scripts/get_tatoeba_data.py) to download and preprocess the Tatoeba data." ] }, { @@ -57,7 +60,8 @@ "outputs": [], "source": [ "# This should take about a minute since the data is already downloaded in the previous step\n", - "! python ../../scripts/get_tatoeba_data.py --data_dir $DATA_DIR --num_sample $NUM_SAMPLES" + "\n", + "! python ../../nemo/collections/nlp/data/scripts/get_tatoeba.py --data_dir $DATA_DIR --num_sample $NUM_SAMPLES" ] }, { @@ -116,7 +120,7 @@ "# list of BERT model names, check out nemo_nlp.huggingface.BERT.list_pretrained_models()\n", "\n", "tokenizer = NemoBertTokenizer(pretrained_model=PRETRAINED_BERT_MODEL)\n", - "bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=PRETRAINED_BERT_MODEL)" + "bert_model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=PRETRAINED_BERT_MODEL)" ] }, { @@ -132,7 +136,7 @@ "metadata": {}, "outputs": [], "source": [ - "train_data_layer = nemo_nlp.BertPunctuationCapitalizationDataLayer(\n", + "train_data_layer = nemo_nlp.nm.data_layers.PunctuationCapitalizationDataLayer(\n", " tokenizer=tokenizer,\n", " text_file=os.path.join(DATA_DIR, 'text_train.txt'),\n", " label_file=os.path.join(DATA_DIR, 'labels_train.txt'),\n", @@ -144,14 +148,14 @@ "\n", "\n", "# Define classifier for Punctuation and Capitalization tasks\n", - "punct_classifier = nemo_nlp.TokenClassifier(\n", + "punct_classifier = TokenClassifier(\n", " hidden_size=bert_model.hidden_size,\n", " num_classes=len(punct_label_ids),\n", " dropout=CLASSIFICATION_DROPOUT,\n", " num_layers=PUNCT_NUM_FC_LAYERS,\n", " name='Punctuation')\n", "\n", - "capit_classifier = nemo_nlp.TokenClassifier(\n", + "capit_classifier = TokenClassifier(\n", " hidden_size=bert_model.hidden_size,\n", " num_classes=len(capit_label_ids),\n", " dropout=CLASSIFICATION_DROPOUT,\n", @@ -160,14 +164,14 @@ "\n", "# If you don't want to use weighted loss for Punctuation task, use class_weights=None\n", "punct_label_freqs = train_data_layer.dataset.punct_label_frequencies\n", - "class_weights = utils.calc_class_weights(punct_label_freqs)\n", + "class_weights = calc_class_weights(punct_label_freqs)\n", "\n", "# define loss\n", - "punct_loss = nemo_nlp.TokenClassificationLoss(\n", + "punct_loss = TokenClassificationLoss(\n", " num_classes=len(punct_label_ids),\n", " class_weights=class_weights)\n", - "capit_loss = nemo_nlp.TokenClassificationLoss(num_classes=len(capit_label_ids))\n", - "task_loss = nemo_nlp.LossAggregatorNM(num_inputs=2)" + "capit_loss = TokenClassificationLoss(num_classes=len(capit_label_ids))\n", + "task_loss = LossAggregatorNM(num_inputs=2)" ] }, { @@ -218,7 +222,7 @@ "# during creation of the train_data_layer to make sure that the mapping is correct in case some of the labels from\n", "# the train set are missing in the dev set.\n", "\n", - "eval_data_layer = nemo_nlp.BertPunctuationCapitalizationDataLayer(\n", + "eval_data_layer = 
nemo_nlp.nm.data_layers.PunctuationCapitalizationDataLayer(\n", " tokenizer=tokenizer,\n", " text_file=os.path.join(DATA_DIR, 'text_dev.txt'),\n", " label_file=os.path.join(DATA_DIR, 'labels_dev.txt'),\n", @@ -361,7 +365,7 @@ "metadata": {}, "outputs": [], "source": [ - "infer_data_layer = nemo_nlp.BertTokenClassificationInferDataLayer(\n", + "infer_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationInferDataLayer(\n", " queries=queries,\n", " tokenizer=tokenizer,\n", " max_seq_length=MAX_SEQ_LENGTH,\n", @@ -399,7 +403,7 @@ "capit_preds = np.argmax(capit_logits, axis=2)\n", "\n", "for i, query in enumerate(queries):\n", - " nf.logger.info(f'Query: {query}')\n", + " logging(f'Query: {query}')\n", "\n", " punct_pred = punct_preds[i][subtokens_mask[i] > 0.5]\n", " capit_pred = capit_preds[i][subtokens_mask[i] > 0.5]\n", @@ -419,7 +423,7 @@ " if punct_label != 'O':\n", " output += punct_label\n", " output += ' '\n", - " nf.logger.info(f'Combined: {output.strip()}\\n')" + " logging(f'Combined: {output.strip()}\\n')" ] }, { diff --git a/examples/nlp/asr_postprocessor.py b/examples/nlp/asr_postprocessor.py index f65de6e8becc..483516621de8 100644 --- a/examples/nlp/asr_postprocessor.py +++ b/examples/nlp/asr_postprocessor.py @@ -1,13 +1,32 @@ -# Copyright (c) 2019 NVIDIA Corporation +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + import math import os import torch -import nemo import nemo.collections.nlp as nemo_nlp +import nemo.collections.nlp.nm.data_layers.machine_translation_datalayer +from nemo import logging +from nemo.collections.nlp.callbacks.machine_translation_callback import ( + eval_epochs_done_callback_wer, + eval_iter_callback, +) from nemo.collections.nlp.data.tokenizers.bert_tokenizer import NemoBertTokenizer -from nemo.collections.nlp.utils.callbacks.translation import eval_epochs_done_callback_wer, eval_iter_callback from nemo.core.callbacks import CheckpointCallback from nemo.utils.lr_policies import SquareAnnealing @@ -47,7 +66,7 @@ parser.add_argument("--beam_size", default=4, type=int) parser.add_argument("--len_pen", default=0.0, type=float) parser.add_argument( - "--restore_from", dest="restore_from", type=str, default="../../scripts/bert-base-uncased_decoder.pt", + "--restore_from", dest="restore_from", type=str, default="../../scripts/bert-base-uncased_decoder.pt" ) args = parser.parse_args() @@ -66,14 +85,14 @@ tokens_to_add = vocab_size - tokenizer.vocab_size zeros_transform = nemo.backends.pytorch.common.ZerosLikeNM() -encoder = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_model) +encoder = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_model) device = encoder.bert.embeddings.word_embeddings.weight.get_device() zeros = torch.zeros((tokens_to_add, args.d_model)).to(device=device) encoder.bert.embeddings.word_embeddings.weight.data = torch.cat( (encoder.bert.embeddings.word_embeddings.weight.data, zeros) ) -decoder = nemo_nlp.TransformerDecoderNM( +decoder = nemo_nlp.nm.trainables.TransformerDecoderNM( d_model=args.d_model, d_inner=args.d_inner, num_layers=args.num_layers, @@ -90,11 +109,13 @@ decoder.restore_from(args.restore_from, local_rank=args.local_rank) -t_log_softmax = nemo_nlp.TokenClassifier(args.d_model, num_classes=vocab_size, num_layers=1, log_softmax=True) +t_log_softmax = nemo_nlp.nm.trainables.TokenClassifier( + args.d_model, num_classes=vocab_size, num_layers=1, log_softmax=True +) -loss_fn = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(pad_id=tokenizer.pad_id(), label_smoothing=0.1) +loss_fn = nemo_nlp.nm.losses.PaddedSmoothedCrossEntropyLossNM(pad_id=tokenizer.pad_id(), label_smoothing=0.1) -beam_search = nemo_nlp.BeamSearchTranslatorNM( +beam_search = nemo_nlp.nm.trainables.BeamSearchTranslatorNM( decoder=decoder, log_softmax=t_log_softmax, max_seq_length=args.max_seq_length, @@ -114,7 +135,7 @@ def create_pipeline(dataset, tokens_in_batch, clean=False, training=True): dataset_src = os.path.join(args.data_dir, dataset + "." + args.src_lang) dataset_tgt = os.path.join(args.data_dir, dataset + "." 
+ args.tgt_lang) - data_layer = nemo_nlp.TranslationDataLayer( + data_layer = nemo_nlp.nm.data_layers.machine_translation_datalayer.TranslationDataLayer( tokenizer_src=tokenizer, tokenizer_tgt=tokenizer, dataset_src=dataset_src, @@ -126,7 +147,7 @@ def create_pipeline(dataset, tokens_in_batch, clean=False, training=True): input_type_ids = zeros_transform(input_type_ids=src) src_hiddens = encoder(input_ids=src, token_type_ids=input_type_ids, attention_mask=src_mask) tgt_hiddens = decoder( - input_ids_tgt=tgt, hidden_states_src=src_hiddens, input_mask_src=src_mask, input_mask_tgt=tgt_mask, + input_ids_tgt=tgt, hidden_states_src=src_hiddens, input_mask_src=src_mask, input_mask_tgt=tgt_mask ) log_softmax = t_log_softmax(hidden_states=tgt_hiddens) loss = loss_fn(logits=log_softmax, target_ids=labels) @@ -150,7 +171,7 @@ def create_pipeline(dataset, tokens_in_batch, clean=False, training=True): def print_loss(x): loss = x[0].item() - nemo.logging.info("Training loss: {:.4f}".format(loss)) + logging.info("Training loss: {:.4f}".format(loss)) # callbacks @@ -186,6 +207,6 @@ def print_loss(x): callbacks=callbacks, optimizer=args.optimizer, lr_policy=lr_policy, - optimization_params={"num_epochs": 300, "lr": args.lr, "weight_decay": args.weight_decay,}, + optimization_params={"num_epochs": 300, "lr": args.lr, "weight_decay": args.weight_decay}, batches_per_step=args.iter_per_step, ) diff --git a/examples/nlp/bert_pretraining.py b/examples/nlp/bert_pretraining.py index 2207fe5184fa..046814231296 100644 --- a/examples/nlp/bert_pretraining.py +++ b/examples/nlp/bert_pretraining.py @@ -1,5 +1,18 @@ -#!/usr/bin/env python3 -# Copyright (c) 2019 NVIDIA Corporation +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= """ @@ -62,14 +75,13 @@ import math import os -import torch -from pytorch_transformers import BertConfig +from transformers import BertConfig -import nemo +import nemo.backends.pytorch.common as nemo_common import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp.data.datasets.utils import BERTPretrainingDataDesc -from nemo.collections.nlp.transformer.utils import gelu -from nemo.collections.nlp.utils.callbacks.bert_pretraining import eval_epochs_done_callback, eval_iter_callback +import nemo.core as nemo_core +from nemo import logging +from nemo.collections.nlp.data.datasets.lm_bert_dataset import BERTPretrainingDataDesc from nemo.utils.lr_policies import get_lr_policy parser = argparse.ArgumentParser(description='BERT pretraining') @@ -86,9 +98,7 @@ parser.add_argument("--beta2", default=0.25, type=float) parser.add_argument("--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"]) parser.add_argument("--weight_decay", default=0.0, type=float) -parser.add_argument( - "--tokenizer", default="sentence-piece", type=str, choices=["sentence-piece", "nemo-bert"], -) +parser.add_argument("--tokenizer", default="sentence-piece", type=str, choices=["sentence-piece", "nemo-bert"]) parser.add_argument("--max_seq_length", default=128, type=int) parser.add_argument("--sample_size", default=1e7, type=int) parser.add_argument("--mask_probability", default=0.15, type=float) @@ -108,14 +118,10 @@ ) parser.add_argument("--data_dir", default="data/lm/wikitext-2", type=str) parser.add_argument( - "--preprocessed_data", action="store_true", default=False, help="specify if using preprocessed data", -) -parser.add_argument( - "--gradient_predivide", action="store_true", default=False, help="use gradient predivide", -) -parser.add_argument( - "--only_mlm_loss", action="store_true", default=False, help="use only masked language model loss", + "--preprocessed_data", action="store_true", default=False, help="specify if using preprocessed data" ) +parser.add_argument("--gradient_predivide", action="store_true", default=False, help="use gradient predivide") +parser.add_argument("--only_mlm_loss", action="store_true", default=False, help="use only masked language model loss") parser.add_argument( "--max_steps", default=-1, @@ -125,9 +131,7 @@ ) parser.add_argument("--dataset_name", default="wikitext-2", type=str) parser.add_argument("--load_dir", default=None, type=str) -parser.add_argument( - "--bert_checkpoint", default=None, type=str, help="specify path to pretrained BERT weights", -) +parser.add_argument("--bert_checkpoint", default=None, type=str, help="specify path to pretrained BERT weights") parser.add_argument("--work_dir", default="outputs/bert_lm", type=str) parser.add_argument("--save_epoch_freq", default=1, type=int) parser.add_argument("--save_step_freq", default=100, type=int) @@ -135,8 +139,8 @@ parser.add_argument("--config_file", default=None, type=str, help="The BERT model config") args = parser.parse_args() -nf = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, +nf = nemo_core.NeuralModuleFactory( + backend=nemo_core.Backend.PyTorch, local_rank=args.local_rank, optimization_level=args.amp_opt_level, log_dir=args.work_dir, @@ -158,23 +162,23 @@ if not args.preprocessed_data: special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'] data_desc = BERTPretrainingDataDesc( - args.dataset_name, args.data_dir, args.vocab_size, args.sample_size, special_tokens, 'train.txt', + 
args.dataset_name, args.data_dir, args.vocab_size, args.sample_size, special_tokens, 'train.txt' ) if args.tokenizer == "sentence-piece": - nemo.logging.info("To use SentencePieceTokenizer.") - tokenizer = nemo_nlp.SentencePieceTokenizer(model_path=data_desc.tokenizer_model) + logging.info("To use SentencePieceTokenizer.") + tokenizer = nemo_nlp.data.SentencePieceTokenizer(model_path=data_desc.tokenizer_model) tokenizer.add_special_tokens(special_tokens) elif args.tokenizer == "nemo-bert": - nemo.logging.info("To use NemoBertTokenizer.") + logging.info("To use NemoBertTokenizer.") vocab_file = os.path.join(args.data_dir, 'vocab.txt') # To train on a Chinese dataset, use NemoBertTokenizer - tokenizer = nemo_nlp.NemoBertTokenizer(vocab_file=vocab_file) + tokenizer = nemo_nlp.data.NemoBertTokenizer(vocab_file=vocab_file) else: raise ValueError("Please add your tokenizer " "or use sentence-piece or nemo-bert.") args.vocab_size = tokenizer.vocab_size print(vars(args)) -bert_model = nemo_nlp.huggingface.BERT( +bert_model = nemo_nlp.nm.trainables.huggingface.BERT( vocab_size=args.vocab_size, num_hidden_layers=args.num_hidden_layers, hidden_size=args.hidden_size, @@ -191,17 +195,17 @@ data layers, BERT encoder, and MLM and NSP loss functions """ -mlm_classifier = nemo_nlp.BertTokenClassifier( - args.hidden_size, num_classes=args.vocab_size, activation=args.hidden_act, log_softmax=True, +mlm_classifier = nemo_nlp.nm.trainables.token_classification_nm.BertTokenClassifier( + args.hidden_size, num_classes=args.vocab_size, activation=args.hidden_act, log_softmax=True ) -mlm_loss_fn = nemo_nlp.MaskedLanguageModelingLossNM() +mlm_loss_fn = nemo_nlp.nm.losses.MaskedLanguageModelingLossNM() if not args.only_mlm_loss: - nsp_classifier = nemo_nlp.SequenceClassifier( - args.hidden_size, num_classes=2, num_layers=2, activation='tanh', log_softmax=False, + nsp_classifier = nemo_nlp.nm.trainables.sequence_classification_nm.SequenceClassifier( + args.hidden_size, num_classes=2, num_layers=2, activation='tanh', log_softmax=False ) - nsp_loss_fn = nemo.backends.pytorch.common.CrossEntropyLoss() + nsp_loss_fn = nemo_common.CrossEntropyLoss() - bert_loss = nemo_nlp.LossAggregatorNM(num_inputs=2) + bert_loss = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2) # tie weights of MLM softmax layer and embedding layer of the encoder if mlm_classifier.mlp.last_linear_layer.weight.shape != bert_model.bert.embeddings.word_embeddings.weight.shape: @@ -209,31 +213,26 @@ mlm_classifier.mlp.last_linear_layer.weight = bert_model.bert.embeddings.word_embeddings.weight -def create_pipeline( - data_file, batch_size, preprocessed_data=False, batches_per_step=1, **kwargs, -): +def create_pipeline(data_file, batch_size, preprocessed_data=False, batches_per_step=1, **kwargs): if not preprocessed_data: max_seq_length, mask_probability, short_seq_prob = ( kwargs['max_seq_length'], kwargs['mask_probability'], kwargs['short_seq_prob'], ) - data_layer = nemo_nlp.BertPretrainingDataLayer( - tokenizer, data_file, max_seq_length, mask_probability, short_seq_prob, batch_size=batch_size, + data_layer = nemo_nlp.nm.data_layers.lm_bert_datalayer.BertPretrainingDataLayer( + tokenizer, data_file, max_seq_length, mask_probability, short_seq_prob, batch_size=batch_size ) else: - training, max_predictions_per_seq = ( - kwargs['training'], - kwargs['max_predictions_per_seq'], - ) - data_layer = nemo_nlp.BertPretrainingPreprocessedDataLayer( - data_file, max_predictions_per_seq, batch_size=batch_size, training=training, + training, max_predictions_per_seq 
= (kwargs['training'], kwargs['max_predictions_per_seq']) + data_layer = nemo_nlp.nm.data_layers.lm_bert_datalayer.BertPretrainingPreprocessedDataLayer( + data_file, max_predictions_per_seq, batch_size=batch_size, training=training ) steps_per_epoch = math.ceil(len(data_layer) / (batch_size * args.num_gpus * batches_per_step)) - (input_ids, input_type_ids, input_mask, output_ids, output_mask, nsp_labels,) = data_layer() - hidden_states = bert_model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) + (input_ids, input_type_ids, input_mask, output_ids, output_mask, nsp_labels) = data_layer() + hidden_states = bert_model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) mlm_logits = mlm_classifier(hidden_states=hidden_states) mlm_loss = mlm_loss_fn(logits=mlm_logits, output_ids=output_ids, output_mask=output_mask) if not args.only_mlm_loss: @@ -275,15 +274,15 @@ def create_pipeline( else: log_tensors = [train_loss] print_msg = "Loss: {:.3f}" -train_callback = nemo.core.SimpleLossLoggerCallback( +train_callback = nemo_core.SimpleLossLoggerCallback( tensors=log_tensors, step_freq=args.print_step_freq, - print_func=lambda x: nemo.logging.info(print_msg.format(*[y.item() for y in x])), + print_func=lambda x: logging.info(print_msg.format(*[y.item() for y in x])), get_tb_values=lambda x: [["loss", x[0]]], tb_writer=nf.tb_writer, ) -ckpt_callback = nemo.core.CheckpointCallback( +ckpt_callback = nemo_core.CheckpointCallback( folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, load_from_folder=args.load_dir, @@ -294,11 +293,11 @@ def create_pipeline( if args.lr_policy is not None: if args.max_steps < 0: lr_policy_fn = get_lr_policy( - args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion, + args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion ) else: lr_policy_fn = get_lr_policy( - args.lr_policy, total_steps=args.max_steps, warmup_ratio=args.lr_warmup_proportion, + args.lr_policy, total_steps=args.max_steps, warmup_ratio=args.lr_warmup_proportion ) else: lr_policy_fn = None diff --git a/examples/nlp/glue_with_BERT.py b/examples/nlp/glue_benchmark_with_bert.py similarity index 72% rename from examples/nlp/glue_with_BERT.py rename to examples/nlp/glue_benchmark_with_bert.py index d7dcc8bc87b7..a7d909d93247 100644 --- a/examples/nlp/glue_with_BERT.py +++ b/examples/nlp/glue_benchmark_with_bert.py @@ -24,14 +24,14 @@ https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e To run this example on 1 GPU: -python glue_with_BERT.py \ +python glue_benchmark_with_bert.py \ --data_dir /path_to_data_dir/MRPC \ --task_name mrpc \ --work_dir /path_to_output_folder \ To run this example on 4 GPUs with mixed precision: python -m torch.distributed.launch \ ---nproc_per_node=4 glue_with_BERT.py \ +--nproc_per_node=4 glue_benchmark_with_bert.py \ --data_dir=/path_to_data/MNLI \ --task_name mnli \ --work_dir /path_to_output_folder \ @@ -64,17 +64,15 @@ import json import os -import nemo import nemo.collections.nlp as nemo_nlp +import nemo.core as nemo_core +from nemo import logging from nemo.backends.pytorch.common import CrossEntropyLoss, MSELoss -from nemo.collections.nlp import ( - GlueDataLayerClassification, - GlueDataLayerRegression, - NemoBertTokenizer, - SentencePieceTokenizer, -) -from nemo.collections.nlp.data.datasets.utils import output_modes, processors -from nemo.collections.nlp.utils.callbacks.glue import eval_epochs_done_callback, 
eval_iter_callback +from nemo.collections.nlp.callbacks.glue_benchmark_callback import eval_epochs_done_callback, eval_iter_callback +from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer +from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import output_modes, processors +from nemo.collections.nlp.nm.data_layers import GlueClassificationDataLayer, GlueRegressionDataLayer +from nemo.collections.nlp.nm.trainables import SequenceClassifier, SequenceRegression from nemo.utils.lr_policies import get_lr_policy parser = argparse.ArgumentParser(description="GLUE_with_pretrained_BERT") @@ -85,94 +83,71 @@ default='COLA', type=str, required=True, - help="The input data dir. Should contain the .tsv \ - files (or other data files) for the task.", + help="The input data dir. Should contain the .tsv files (or other data files) for the task.", ) parser.add_argument( "--task_name", default="CoLA", type=str, required=True, - choices=['cola', 'sst-2', 'mrpc', 'sts-b', 'qqp', 'mnli', 'qnli', 'rte', 'wnli',], - help="GLUE task name, MNLI includes both matched and \ - mismatched tasks", -) -parser.add_argument( - "--dataset_type", default="GLUEDataset", type=str, help='Type of dataset to create datalayers', -) -parser.add_argument( - "--pretrained_bert_model", default="bert-base-cased", type=str, help="Name of the pre-trained model", -) -parser.add_argument( - "--bert_checkpoint", default=None, type=str, help="Path to model checkpoint", + choices=['cola', 'sst-2', 'mrpc', 'sts-b', 'qqp', 'mnli', 'qnli', 'rte', 'wnli'], + help="GLUE task name, MNLI includes both matched and mismatched tasks", ) parser.add_argument( - "--bert_config", default=None, type=str, help="Path to bert config file in json format", + "--pretrained_bert_model", default="bert-base-cased", type=str, help="Name of the pre-trained model" ) +parser.add_argument("--bert_checkpoint", default=None, type=str, help="Path to model checkpoint") +parser.add_argument("--bert_config", default=None, type=str, help="Path to bert config file in json format") parser.add_argument( "--tokenizer_model", default="tokenizer.model", type=str, - help="Path to pretrained tokenizer model, \ - only used if --tokenizer is sentencepiece", + help="Path to pretrained tokenizer model, only used if --tokenizer is sentencepiece", ) parser.add_argument( "--tokenizer", default="nemobert", type=str, choices=["nemobert", "sentencepiece"], - help="tokenizer to use, \ - only relevant when using custom pretrained checkpoint.", + help="tokenizer to use, only relevant when using custom pretrained checkpoint.", ) parser.add_argument( "--max_seq_length", default=128, type=int, choices=range(1, 513), - help="The maximum total input sequence length after \ - tokenization.Sequences longer than this will be \ + help="The maximum total input sequence length after tokenization.Sequences longer than this will be \ truncated, sequences shorter will be padded.", ) parser.add_argument("--optimizer_kind", default="adam", type=str, help="Optimizer kind") parser.add_argument("--lr_policy", default="WarmupAnnealing", type=str) parser.add_argument("--lr", default=5e-5, type=float, help="The initial learning rate.") parser.add_argument("--lr_warmup_proportion", default=0.1, type=float) -parser.add_argument( - "--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.", -) -parser.add_argument( - "--num_epochs", default=3, type=int, help="Total number of training epochs to perform.", -) -parser.add_argument( - "--batch_size", default=8, 
type=int, help="Batch size per GPU/CPU for training/evaluation.", -) +parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") +parser.add_argument("--num_epochs", default=3, type=int, help="Total number of training epochs to perform.") +parser.add_argument("--batch_size", default=8, type=int, help="Batch size per GPU/CPU for training/evaluation.") parser.add_argument("--num_gpus", default=1, type=int, help="Number of GPUs") parser.add_argument( - "--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"], help="01/02 to enable mixed precision", -) -parser.add_argument( - "--local_rank", type=int, default=None, help="For distributed training: local_rank", + "--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"], help="01/02 to enable mixed precision" ) +parser.add_argument("--local_rank", type=int, default=None, help="For distributed training: local_rank") parser.add_argument( "--work_dir", default='output_glue', type=str, - help="The output directory where the model predictions \ - and checkpoints will be written.", + help="The output directory where the model predictions and checkpoints will be written.", ) parser.add_argument( "--save_epoch_freq", default=1, type=int, - help="Frequency of saving checkpoint \ - '-1' - epoch checkpoint won't be saved", + help="Frequency of saving checkpoint '-1' - epoch checkpoint won't be saved", ) parser.add_argument( "--save_step_freq", default=-1, type=int, - help="Frequency of saving checkpoint \ - '-1' - step checkpoint won't be saved", + help="Frequency of saving checkpoint '-1' - step checkpoint won't be saved", ) parser.add_argument("--loss_step_freq", default=25, type=int, help="Frequency of printing loss") @@ -181,8 +156,7 @@ if not os.path.exists(args.data_dir): raise FileNotFoundError( "GLUE datasets not found. Datasets can be " - "obtained at https://gist.github.com/W4ngatang/ \ - 60c2bdb54d156a41194446737ce03e2e" + "obtained at https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e" ) args.work_dir = f'{args.work_dir}/{args.task_name.upper()}' @@ -203,8 +177,8 @@ output_mode = output_modes[args.task_name] # Instantiate neural factory with supported backend -nf = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, +nf = nemo_core.NeuralModuleFactory( + backend=nemo_core.Backend.PyTorch, local_rank=args.local_rank, optimization_level=args.amp_opt_level, log_dir=args.work_dir, @@ -216,10 +190,10 @@ if args.bert_checkpoint is None: """ Use this if you're using a standard BERT model. To see the list of pretrained models, call: - nemo_nlp.huggingface.BERT.list_pretrained_models() + nemo_nlp.nm.trainables.huggingface.BERT.list_pretrained_models() """ tokenizer = NemoBertTokenizer(args.pretrained_bert_model) - model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) + model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) else: """ Use this if you're using a BERT model that you pre-trained yourself. Replace BERT-STEP-150000.pt with the path to your checkpoint. 
@@ -234,9 +208,9 @@ if args.bert_config is not None: with open(args.bert_config) as json_file: config = json.load(json_file) - model = nemo_nlp.huggingface.BERT(**config) + model = nemo_nlp.nm.trainables.huggingface.BERT(**config) else: - model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) + model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) model.restore_from(args.bert_checkpoint) @@ -244,10 +218,10 @@ # uses [CLS] token for classification (the first token) if args.task_name == 'sts-b': - pooler = nemo_nlp.SequenceRegression(hidden_size=hidden_size) + pooler = SequenceRegression(hidden_size=hidden_size) glue_loss = MSELoss() else: - pooler = nemo_nlp.SequenceClassifier(hidden_size=hidden_size, num_classes=num_labels, log_softmax=False) + pooler = SequenceClassifier(hidden_size=hidden_size, num_classes=num_labels, log_softmax=False) glue_loss = CrossEntropyLoss() @@ -259,12 +233,11 @@ def create_pipeline( evaluate=False, processor=task_processors[0], ): - data_layer = GlueDataLayerClassification + data_layer = GlueClassificationDataLayer if output_mode == 'regression': - data_layer = GlueDataLayerRegression + data_layer = GlueRegressionDataLayer data_layer = data_layer( - dataset_type=args.dataset_type, processor=processor, evaluate=evaluate, batch_size=batch_size, @@ -278,7 +251,7 @@ def create_pipeline( input_ids, input_type_ids, input_mask, labels = data_layer() - hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) + hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) """ For STS-B (regressiont tast), the pooler_output represents a is single @@ -296,18 +269,13 @@ def create_pipeline( return loss, steps_per_epoch, data_layer, [pooler_output, labels] -token_params = { - 'bos_token': None, - 'eos_token': '[SEP]', - 'pad_token': '[PAD]', - 'cls_token': '[CLS]', -} +token_params = {'bos_token': None, 'eos_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]'} train_loss, steps_per_epoch, _, _ = create_pipeline() _, _, eval_data_layer, eval_tensors = create_pipeline(evaluate=True) callbacks_eval = [ - nemo.core.EvaluatorCallback( + nemo_core.EvaluatorCallback( eval_tensors=eval_tensors, user_iter_callback=lambda x, y: eval_iter_callback(x, y), user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, args.work_dir, eval_task_names[0]), @@ -323,7 +291,7 @@ def create_pipeline( if args.task_name == 'mnli': _, _, eval_data_layer_mm, eval_tensors_mm = create_pipeline(evaluate=True, processor=task_processors[1]) callbacks_eval.append( - nemo.core.EvaluatorCallback( + nemo_core.EvaluatorCallback( eval_tensors=eval_tensors_mm, user_iter_callback=lambda x, y: eval_iter_callback(x, y), user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, args.work_dir, eval_task_names[1]), @@ -332,8 +300,8 @@ def create_pipeline( ) ) -nemo.logging.info(f"steps_per_epoch = {steps_per_epoch}") -callback_train = nemo.core.SimpleLossLoggerCallback( +logging.info(f"steps_per_epoch = {steps_per_epoch}") +callback_train = nemo_core.SimpleLossLoggerCallback( tensors=[train_loss], print_func=lambda x: print("Loss: {:.3f}".format(x[0].item())), get_tb_values=lambda x: [["loss", x[0]]], @@ -341,12 +309,12 @@ def create_pipeline( tb_writer=nf.tb_writer, ) -ckpt_callback = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, +ckpt_callback = 
nemo_core.CheckpointCallback( + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq ) lr_policy_fn = get_lr_policy( - args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion, + args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion ) nf.train( diff --git a/examples/nlp/joint_intent_slot_infer.py b/examples/nlp/joint_intent_slot_infer.py index d2f3efaf8c68..942d1c98bd0d 100644 --- a/examples/nlp/joint_intent_slot_infer.py +++ b/examples/nlp/joint_intent_slot_infer.py @@ -1,13 +1,29 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + import argparse import os import numpy as np -from sklearn.metrics import classification_report, confusion_matrix +from sklearn.metrics import classification_report from transformers import BertTokenizer -import nemo -import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp.data.datasets.utils import JointIntentSlotDataDesc +import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm +from nemo import logging +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc # Parsing arguments parser = argparse.ArgumentParser(description='Joint-intent BERT') @@ -28,22 +44,24 @@ raise ValueError(f'Data not found at {args.data_dir}') nf = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, local_rank=args.local_rank, optimization_level=args.amp_opt_level, log_dir=None, + backend=nemo.core.Backend.PyTorch, local_rank=args.local_rank, optimization_level=args.amp_opt_level, log_dir=None ) """ Load the pretrained BERT parameters See the list of pretrained models, call: nemo_nlp.huggingface.BERT.list_pretrained_models() """ -pretrained_bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) +pretrained_bert_model = nemo.collections.nlp.nm.trainables.common.huggingface.BERT( + pretrained_model_name=args.pretrained_bert_model +) hidden_size = pretrained_bert_model.hidden_size tokenizer = BertTokenizer.from_pretrained(args.pretrained_bert_model) data_desc = JointIntentSlotDataDesc(args.data_dir, args.do_lower_case, args.dataset_name) # Evaluation pipeline -nemo.logging.info("Loading eval data...") -data_layer = nemo_nlp.BertJointIntentSlotDataLayer( +logging.info("Loading eval data...") +data_layer = nemo.collections.nlp.nm.data_layers.joint_intent_slot_datalayer.BertJointIntentSlotDataLayer( input_file=f'{data_desc.data_dir}/{args.eval_file_prefix}.tsv', slot_file=f'{data_desc.data_dir}/{args.eval_file_prefix}_slots.tsv', pad_label=data_desc.pad_label, @@ -51,15 +69,13 @@ max_seq_length=args.max_seq_length, shuffle=False, batch_size=args.batch_size, - # num_workers=0, - # local_rank=args.local_rank, ) 
-classifier = nemo_nlp.JointIntentSlotClassifier( - hidden_size=hidden_size, num_intents=data_desc.num_intents, num_slots=data_desc.num_slots, +classifier = nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm.JointIntentSlotClassifier( + hidden_size=hidden_size, num_intents=data_desc.num_intents, num_slots=data_desc.num_slots ) -(ids, type_ids, input_mask, loss_mask, subtokens_mask, intents, slots,) = data_layer() +(ids, type_ids, input_mask, loss_mask, subtokens_mask, intents, slots) = data_layer() hidden_states = pretrained_bert_model(input_ids=ids, token_type_ids=type_ids, attention_mask=input_mask) intent_logits, slot_logits = classifier(hidden_states=hidden_states) @@ -69,7 +85,7 @@ # Instantiate an optimizer to perform `infer` action evaluated_tensors = nf.infer( - tensors=[intent_logits, slot_logits, loss_mask, subtokens_mask, intents, slots,], checkpoint_dir=args.work_dir, + tensors=[intent_logits, slot_logits, loss_mask, subtokens_mask, intents, slots], checkpoint_dir=args.work_dir ) @@ -86,13 +102,13 @@ def get_preds(logits): ] pred_intents = np.argmax(intent_logits, 1) -nemo.logging.info('Intent prediction results') +logging.info('Intent prediction results') intents = np.asarray(intents) pred_intents = np.asarray(pred_intents) intent_accuracy = sum(intents == pred_intents) / len(pred_intents) -nemo.logging.info(f'Intent accuracy: {intent_accuracy}') -nemo.logging.info(classification_report(intents, pred_intents)) +logging.info(f'Intent accuracy: {intent_accuracy}') +logging.info(classification_report(intents, pred_intents)) slot_preds = np.argmax(slot_logits, axis=2) slot_preds_list, slot_labels_list = [], [] @@ -101,9 +117,9 @@ def get_preds(logits): slot_preds_list.extend(list(slot_preds[i][subtokens_mask[i]])) slot_labels_list.extend(list(slot_labels[i][subtokens_mask[i]])) -nemo.logging.info('Slot prediction results') +logging.info('Slot prediction results') slot_labels_list = np.asarray(slot_labels_list) slot_preds_list = np.asarray(slot_preds_list) slot_accuracy = sum(slot_labels_list == slot_preds_list) / len(slot_labels_list) -nemo.logging.info(f'Slot accuracy: {slot_accuracy}') -nemo.logging.info(classification_report(slot_labels_list, slot_preds_list)) +logging.info(f'Slot accuracy: {slot_accuracy}') +logging.info(classification_report(slot_labels_list, slot_preds_list)) diff --git a/examples/nlp/joint_intent_slot_infer_b1.py b/examples/nlp/joint_intent_slot_infer_b1.py index 089a2c06820e..55c467f3f5ea 100644 --- a/examples/nlp/joint_intent_slot_infer_b1.py +++ b/examples/nlp/joint_intent_slot_infer_b1.py @@ -1,12 +1,28 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + import argparse import numpy as np from transformers import BertTokenizer -import nemo import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp.data.datasets.utils import JointIntentSlotDataDesc -from nemo.collections.nlp.utils.nlp_utils import read_intent_slot_outputs +import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc +from nemo.collections.nlp.utils.common_nlp_utils import read_intent_slot_outputs # Parsing arguments parser = argparse.ArgumentParser(description='Joint-intent BERT') @@ -23,14 +39,14 @@ args = parser.parse_args() nf = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, optimization_level=args.amp_opt_level, log_dir=None, + backend=nemo.core.Backend.PyTorch, optimization_level=args.amp_opt_level, log_dir=None ) """ Load the pretrained BERT parameters See the list of pretrained models, call: -nemo_nlp.huggingface.BERT.list_pretrained_models() +nemo_nlp.BERT.list_pretrained_models() """ -pretrained_bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) +pretrained_bert_model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) tokenizer = BertTokenizer.from_pretrained(args.pretrained_bert_model) hidden_size = pretrained_bert_model.hidden_size @@ -40,13 +56,13 @@ if args.do_lower_case: query = query.lower() -data_layer = nemo_nlp.BertJointIntentSlotInferDataLayer( - queries=[query], tokenizer=tokenizer, max_seq_length=args.max_seq_length, batch_size=1, +data_layer = nemo.collections.nlp.nm.data_layers.joint_intent_slot_datalayer.BertJointIntentSlotInferDataLayer( + queries=[query], tokenizer=tokenizer, max_seq_length=args.max_seq_length, batch_size=1 ) # Create sentence classification loss on top -classifier = nemo_nlp.JointIntentSlotClassifier( - hidden_size=hidden_size, num_intents=data_desc.num_intents, num_slots=data_desc.num_slots, dropout=args.fc_dropout, +classifier = nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm.JointIntentSlotClassifier( + hidden_size=hidden_size, num_intents=data_desc.num_intents, num_slots=data_desc.num_slots, dropout=args.fc_dropout ) ids, type_ids, input_mask, loss_mask, subtokens_mask = data_layer() @@ -58,7 +74,7 @@ ########################################################################### -evaluated_tensors = nf.infer(tensors=[intent_logits, slot_logits, subtokens_mask], checkpoint_dir=args.work_dir,) +evaluated_tensors = nf.infer(tensors=[intent_logits, slot_logits, subtokens_mask], checkpoint_dir=args.work_dir) def concatenate(lists): @@ -68,5 +84,5 @@ def concatenate(lists): intent_logits, slot_logits, subtokens_mask = [concatenate(tensors) for tensors in evaluated_tensors] read_intent_slot_outputs( - [query], data_desc.intent_dict_file, data_desc.slot_dict_file, intent_logits, slot_logits, subtokens_mask, + [query], data_desc.intent_dict_file, data_desc.slot_dict_file, intent_logits, slot_logits, subtokens_mask ) diff --git a/examples/nlp/joint_intent_slot_with_bert.py b/examples/nlp/joint_intent_slot_with_bert.py index 8e0d5874f226..f700a21f7943 100644 --- a/examples/nlp/joint_intent_slot_with_bert.py +++ b/examples/nlp/joint_intent_slot_with_bert.py @@ -1,3 +1,19 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + import argparse import math import os @@ -5,10 +21,12 @@ import numpy as np from transformers import BertTokenizer -import nemo import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp.data.datasets.utils import JointIntentSlotDataDesc -from nemo.collections.nlp.utils.callbacks.joint_intent_slot import eval_epochs_done_callback, eval_iter_callback +import nemo.collections.nlp.nm.data_layers.joint_intent_slot_datalayer +import nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm +from nemo import logging +from nemo.collections.nlp.callbacks.joint_intent_slot_callback import eval_epochs_done_callback, eval_iter_callback +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import JointIntentSlotDataDesc from nemo.utils.lr_policies import get_lr_policy # Parsing arguments @@ -44,9 +62,7 @@ parser.add_argument("--do_lower_case", action='store_true') parser.add_argument("--shuffle_data", action='store_true') parser.add_argument("--intent_loss_weight", default=0.6, type=float) -parser.add_argument( - "--class_balancing", default="regular", type=str, choices=["regular", "weighted_loss"], -) +parser.add_argument("--class_balancing", default="regular", type=str, choices=["regular", "weighted_loss"]) args = parser.parse_args() @@ -71,43 +87,47 @@ nemo_nlp.huggingface.BERT.list_pretrained_models() """ if args.bert_checkpoint and args.bert_config: - pretrained_bert_model = nemo_nlp.huggingface.BERT(config_filename=args.bert_config, factory=nf) + pretrained_bert_model = nemo.collections.nlp.nm.trainables.common.huggingface.BERT( + config_filename=args.bert_config + ) pretrained_bert_model.restore_from(args.bert_checkpoint) else: - pretrained_bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model, factory=nf) + pretrained_bert_model = nemo.collections.nlp.nm.trainables.common.huggingface.BERT( + pretrained_model_name=args.pretrained_bert_model + ) hidden_size = pretrained_bert_model.hidden_size data_desc = JointIntentSlotDataDesc( - args.data_dir, args.do_lower_case, args.dataset_name, args.none_slot_label, args.pad_label, + args.data_dir, args.do_lower_case, args.dataset_name, args.none_slot_label, args.pad_label ) # Create sentence classification loss on top -classifier = nemo_nlp.JointIntentSlotClassifier( - hidden_size=hidden_size, num_intents=data_desc.num_intents, num_slots=data_desc.num_slots, dropout=args.fc_dropout, +classifier = nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm.JointIntentSlotClassifier( + hidden_size=hidden_size, num_intents=data_desc.num_intents, num_slots=data_desc.num_slots, dropout=args.fc_dropout ) if args.class_balancing == 'weighted_loss': # Using weighted loss will enable weighted loss for both intents and slots # Use the intent_loss_weight hyperparameter to adjust intent loss to # prevent overfitting or underfitting. 
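# Editor's note, not part of the original patch: a hedged sketch of what the weighted
# intent/slot loss configured just below amounts to. The real weights come from the data
# description (data_desc.intent_weights / data_desc.slot_weights); the inverse-frequency
# weighting and the simple blending shown here are illustrative assumptions only.
from collections import Counter

def toy_inverse_freq_weights(labels):
    """Hypothetical per-class weights: rarer classes receive larger weights."""
    counts = Counter(labels)
    total = sum(counts.values())
    return {cls: total / (len(counts) * n) for cls, n in counts.items()}

intent_w = 0.6                              # plays the role of --intent_loss_weight
toy_intent_loss, toy_slot_loss = 0.8, 1.4   # hypothetical per-task loss values
toy_joint_loss = intent_w * toy_intent_loss + (1 - intent_w) * toy_slot_loss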
- loss_fn = nemo_nlp.JointIntentSlotLoss( + loss_fn = nemo_nlp.nm.losses.JointIntentSlotLoss( num_slots=data_desc.num_slots, slot_classes_loss_weights=data_desc.slot_weights, intent_classes_loss_weights=data_desc.intent_weights, intent_loss_weight=args.intent_loss_weight, ) else: - loss_fn = nemo_nlp.JointIntentSlotLoss(num_slots=data_desc.num_slots) + loss_fn = nemo_nlp.nm.losses.JointIntentSlotLoss(num_slots=data_desc.num_slots) def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mode='train'): - nemo.logging.info(f"Loading {mode} data...") + logging.info(f"Loading {mode} data...") data_file = f'{data_desc.data_dir}/{mode}.tsv' slot_file = f'{data_desc.data_dir}/{mode}_slots.tsv' shuffle = args.shuffle_data if mode == 'train' else False - data_layer = nemo_nlp.BertJointIntentSlotDataLayer( + data_layer = nemo.collections.nlp.nm.data_layers.joint_intent_slot_datalayer.BertJointIntentSlotDataLayer( input_file=data_file, slot_file=slot_file, pad_label=data_desc.pad_label, @@ -116,43 +136,35 @@ def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mod num_samples=num_samples, shuffle=shuffle, batch_size=batch_size, - num_workers=0, - local_rank=local_rank, ignore_extra_tokens=args.ignore_extra_tokens, ignore_start_end=args.ignore_start_end, ) - (ids, type_ids, input_mask, loss_mask, subtokens_mask, intents, slots,) = data_layer() + (ids, type_ids, input_mask, loss_mask, subtokens_mask, intents, slots) = data_layer() data_size = len(data_layer) print(f'The length of data layer is {data_size}') if data_size < batch_size: - nemo.logging.warning("Batch_size is larger than the dataset size") - nemo.logging.warning("Reducing batch_size to dataset size") + logging.warning("Batch_size is larger than the dataset size") + logging.warning("Reducing batch_size to dataset size") batch_size = data_size steps_per_epoch = math.ceil(data_size / (batch_size * num_gpus)) - nemo.logging.info(f"Steps_per_epoch = {steps_per_epoch}") + logging.info(f"Steps_per_epoch = {steps_per_epoch}") hidden_states = pretrained_bert_model(input_ids=ids, token_type_ids=type_ids, attention_mask=input_mask) intent_logits, slot_logits = classifier(hidden_states=hidden_states) loss = loss_fn( - intent_logits=intent_logits, slot_logits=slot_logits, loss_mask=loss_mask, intents=intents, slots=slots, + intent_logits=intent_logits, slot_logits=slot_logits, loss_mask=loss_mask, intents=intents, slots=slots ) if mode == 'train': tensors_to_evaluate = [loss, intent_logits, slot_logits] else: - tensors_to_evaluate = [ - intent_logits, - slot_logits, - intents, - slots, - subtokens_mask, - ] + tensors_to_evaluate = [intent_logits, slot_logits, intents, slots, subtokens_mask] return tensors_to_evaluate, loss, steps_per_epoch, data_layer @@ -191,11 +203,11 @@ def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mod # Create callback to save checkpoints ckpt_callback = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq ) lr_policy_fn = get_lr_policy( - args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion, + args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion ) nf.train( @@ -203,5 +215,5 @@ def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mod callbacks=[train_callback, eval_callback, 
ckpt_callback], lr_policy=lr_policy_fn, optimizer=args.optimizer_kind, - optimization_params={"num_epochs": args.num_epochs, "lr": args.lr, "weight_decay": args.weight_decay,}, + optimization_params={"num_epochs": args.num_epochs, "lr": args.lr, "weight_decay": args.weight_decay}, ) diff --git a/examples/nlp/transformer_lm.py b/examples/nlp/language_modeling_transformer.py similarity index 74% rename from examples/nlp/transformer_lm.py rename to examples/nlp/language_modeling_transformer.py index 41ca2e960ffb..9d2b08be9080 100644 --- a/examples/nlp/transformer_lm.py +++ b/examples/nlp/language_modeling_transformer.py @@ -1,10 +1,27 @@ -# Copyright (c) 2019 NVIDIA Corporation +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + import math import nemo import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp.data.datasets.utils import LanguageModelDataDesc -from nemo.collections.nlp.utils.callbacks.language_modeling import eval_epochs_done_callback, eval_iter_callback +import nemo.collections.nlp.nm.data_layers.lm_transformer_datalayer +import nemo.collections.nlp.nm.trainables.common.token_classification_nm +from nemo.collections.nlp.callbacks.lm_transformer_callback import eval_epochs_done_callback, eval_iter_callback +from nemo.collections.nlp.data.datasets.lm_transformer_dataset import LanguageModelDataDesc from nemo.utils.lr_policies import CosineAnnealing parser = nemo.utils.NemoArgParser(description='LM Transformer') @@ -67,14 +84,14 @@ # define tokenizer, in this example we use word-level tokenizer # we also adjust the vocabulary size to make it multiple of 8 to accelerate # training in fp16 mode with the use of Tensor Cores -tokenizer = nemo_nlp.WordTokenizer(f"{args.data_dir}/{args.tokenizer_model}") +tokenizer = nemo_nlp.data.WordTokenizer(f"{args.data_dir}/{args.tokenizer_model}") vocab_size = 8 * math.ceil(tokenizer.vocab_size / 8) # instantiate necessary modules for the whole translation pipeline, namely # data layers, encoder, decoder, output log_softmax, beam_search_translator # and loss function -encoder = nemo_nlp.TransformerEncoderNM( +encoder = nemo_nlp.nm.trainables.TransformerEncoderNM( d_model=args.d_model, d_inner=args.d_inner, num_layers=args.num_layers, @@ -88,19 +105,23 @@ max_seq_length=args.max_seq_length, ) -log_softmax = nemo_nlp.TokenClassifier(args.d_model, num_classes=vocab_size, num_layers=1, log_softmax=True) +log_softmax = nemo.collections.nlp.nm.trainables.common.token_classification_nm.TokenClassifier( + args.d_model, num_classes=vocab_size, num_layers=1, log_softmax=True +) -loss = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(pad_id=tokenizer.pad_id(), label_smoothing=args.label_smoothing) +loss = nemo_nlp.nm.losses.PaddedSmoothedCrossEntropyLossNM( + pad_id=tokenizer.pad_id(), label_smoothing=args.label_smoothing +) 
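# Editor's note, not part of the original patch: a minimal PyTorch sketch of what a padded,
# label-smoothed cross entropy such as the PaddedSmoothedCrossEntropyLossNM defined just
# above computes conceptually. This is an illustrative approximation, not the module's
# actual implementation.
import torch

def toy_smoothed_ce(log_probs, targets, pad_id, smoothing=0.1):
    """log_probs: [batch, seq, vocab] log-softmax outputs; targets: [batch, seq] token ids."""
    nll = -log_probs.gather(-1, targets.unsqueeze(-1)).squeeze(-1)  # standard NLL term
    uniform = -log_probs.mean(dim=-1)                               # uniform-target (smoothing) term
    loss = (1.0 - smoothing) * nll + smoothing * uniform
    mask = (targets != pad_id).float()                              # ignore padded positions
    return (loss * mask).sum() / mask.sum().clamp(min=1.0)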
# tie weight of embedding and log_softmax layers log_softmax.mlp.last_linear_layer.weight = encoder.embedding_layer.token_embedding.weight def create_pipeline( - dataset, max_seq_length=args.max_seq_length, batch_step=args.max_seq_length, batch_size=args.batch_size, + dataset, max_seq_length=args.max_seq_length, batch_step=args.max_seq_length, batch_size=args.batch_size ): - data_layer = nemo_nlp.LanguageModelingDataLayer( - dataset, tokenizer, max_seq_length, batch_step, batch_size=batch_size + data_layer = nemo.collections.nlp.nm.data_layers.lm_transformer_datalayer.LanguageModelingDataLayer( + dataset, tokenizer, max_seq_length, batch_size, batch_step ) src, src_mask, labels = data_layer() src_hiddens = encoder(input_ids=src, input_mask_src=src_mask) @@ -141,7 +162,7 @@ def create_pipeline( # callback which saves checkpoints once in a while callback_ckpt = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, checkpoints_to_keep=-1, + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, checkpoints_to_keep=-1 ) # define learning rate decay policy diff --git a/examples/nlp/nmt_tutorial.py b/examples/nlp/machine_translation_tutorial.py similarity index 84% rename from examples/nlp/nmt_tutorial.py rename to examples/nlp/machine_translation_tutorial.py index 49775c187ce3..5ca3cc4a3ca5 100644 --- a/examples/nlp/nmt_tutorial.py +++ b/examples/nlp/machine_translation_tutorial.py @@ -1,4 +1,20 @@ -""" Copyright (c) 2019 NVIDIA Corporation +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +""" See the tutorial and download the data here: https://nvidia.github.io/NeMo/nlp/ neural-machine-translation.html#translation-with-pretrained-model @@ -7,7 +23,7 @@ import nemo import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp.utils.callbacks.translation import eval_epochs_done_callback, eval_iter_callback +from nemo.collections.nlp.callbacks.machine_translation_callback import eval_epochs_done_callback, eval_iter_callback from nemo.utils.lr_policies import get_lr_policy parser = nemo.utils.NemoArgParser(description='Transformer for Neural Machine Translation') @@ -76,14 +92,14 @@ We use YouTokenToMe tokenizer trained on joint English & German data for both source and target languages. 
""" - src_tokenizer = nemo_nlp.YouTokenToMeTokenizer(model_path=f"{args.data_dir}/{args.src_tokenizer_model}") + src_tokenizer = nemo_nlp.data.YouTokenToMeTokenizer(model_path=f"{args.data_dir}/{args.src_tokenizer_model}") src_vocab_size = src_tokenizer.vocab_size if args.src_tokenizer_model == args.tgt_tokenizer_model: tgt_tokenizer = src_tokenizer # source and target use the same tokenizer, set tie_weight to True tie_weight = True else: - tgt_tokenizer = nemo_nlp.YouTokenToMeTokenizer(model_path=f"{args.data_dir}/{args.tgt_tokenizer_model}") + tgt_tokenizer = nemo_nlp.data.YouTokenToMeTokenizer(model_path=f"{args.data_dir}/{args.tgt_tokenizer_model}") # source and target use different tokenizers, set tie_weight to False tie_weight = False tgt_vocab_size = tgt_tokenizer.vocab_size @@ -92,9 +108,9 @@ We use YouTokenToMeTokenizer for src since the src contains English words and CharTokenizer for tgt since the tgt contains Chinese characters. """ - src_tokenizer = nemo_nlp.YouTokenToMeTokenizer(model_path=f"{args.data_dir}/{args.src_tokenizer_model}") + src_tokenizer = nemo_nlp.data.YouTokenToMeTokenizer(model_path=f"{args.data_dir}/{args.src_tokenizer_model}") src_vocab_size = src_tokenizer.vocab_size - tgt_tokenizer = nemo_nlp.CharTokenizer(vocab_path=f"{args.data_dir}/{args.tgt_tokenizer_model}") + tgt_tokenizer = nemo_nlp.data.CharTokenizer(vocab_path=f"{args.data_dir}/{args.tgt_tokenizer_model}") tgt_vocab_size = tgt_tokenizer.vocab_size # source and target use different tokenizers, set tie_weight to False tie_weight = False @@ -104,7 +120,7 @@ # instantiate necessary modules for the whole translation pipeline, namely # data layers, encoder, decoder, output log_softmax, beam_search_translator # and loss function -encoder = nemo_nlp.TransformerEncoderNM( +encoder = nemo_nlp.nm.trainables.TransformerEncoderNM( d_model=args.d_model, d_inner=args.d_inner, num_layers=args.num_layers, @@ -117,7 +133,7 @@ max_seq_length=args.max_seq_length, ) -decoder = nemo_nlp.TransformerDecoderNM( +decoder = nemo_nlp.nm.trainables.TransformerDecoderNM( d_model=args.d_model, d_inner=args.d_inner, num_layers=args.num_layers, @@ -130,11 +146,11 @@ max_seq_length=args.max_seq_length, ) -log_softmax = nemo_nlp.TokenClassifier( - args.d_model, num_classes=tgt_tokenizer.vocab_size, num_layers=1, log_softmax=True, +log_softmax = nemo_nlp.nm.trainables.token_classification_nm.TokenClassifier( + args.d_model, num_classes=tgt_tokenizer.vocab_size, num_layers=1, log_softmax=True ) -beam_search = nemo_nlp.BeamSearchTranslatorNM( +beam_search = nemo_nlp.nm.trainables.BeamSearchTranslatorNM( decoder=decoder, log_softmax=log_softmax, max_seq_length=args.max_seq_length, @@ -144,7 +160,7 @@ eos_token=tgt_tokenizer.eos_id(), ) -loss_fn = nemo_nlp.PaddedSmoothedCrossEntropyLossNM( +loss_fn = nemo_nlp.nm.losses.PaddedSmoothedCrossEntropyLossNM( pad_id=tgt_tokenizer.pad_id(), label_smoothing=args.label_smoothing ) @@ -154,7 +170,7 @@ def create_pipeline(dataset_src, dataset_tgt, tokens_in_batch, clean=False, training=True): - data_layer = nemo_nlp.TranslationDataLayer( + data_layer = nemo_nlp.nm.data_layers.machine_translation_datalayer.TranslationDataLayer( tokenizer_src=src_tokenizer, tokenizer_tgt=tgt_tokenizer, dataset_src=dataset_src, @@ -165,7 +181,7 @@ def create_pipeline(dataset_src, dataset_tgt, tokens_in_batch, clean=False, trai src, src_mask, tgt, tgt_mask, labels, sent_ids = data_layer() src_hiddens = encoder(input_ids=src, input_mask_src=src_mask) tgt_hiddens = decoder( - input_ids_tgt=tgt, 
hidden_states_src=src_hiddens, input_mask_src=src_mask, input_mask_tgt=tgt_mask, + input_ids_tgt=tgt, hidden_states_src=src_hiddens, input_mask_src=src_mask, input_mask_tgt=tgt_mask ) logits = log_softmax(hidden_states=tgt_hiddens) loss = loss_fn(logits=logits, target_ids=labels) @@ -207,7 +223,7 @@ def create_pipeline(dataset_src, dataset_tgt, tokens_in_batch, clean=False, trai # callback which saves checkpoints once in a while ckpt_dir = nf.checkpoint_dir if not args.interactive else args.restore_checkpoint_from ckpt_callback = nemo.core.CheckpointCallback( - folder=ckpt_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, checkpoints_to_keep=1, + folder=ckpt_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, checkpoints_to_keep=1 ) # define learning rate decay policy @@ -228,7 +244,7 @@ def create_pipeline(dataset_src, dataset_tgt, tokens_in_batch, clean=False, trai callbacks=[train_callback, eval_callback, ckpt_callback], optimizer=args.optimizer, lr_policy=lr_policy_fn, - optimization_params={**stop_training_condition, "lr": args.lr, "weight_decay": args.weight_decay,}, + optimization_params={**stop_training_condition, "lr": args.lr, "weight_decay": args.weight_decay}, batches_per_step=args.iter_per_step, ) else: diff --git a/examples/nlp/punctuation_capitalization.py b/examples/nlp/punctuation_capitalization.py index cf2a2d20cda6..abd67203ffd4 100644 --- a/examples/nlp/punctuation_capitalization.py +++ b/examples/nlp/punctuation_capitalization.py @@ -1,18 +1,34 @@ -# pylint: disable=invalid-name +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= import argparse import json import os -import sys -import nemo import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp import NemoBertTokenizer, SentencePieceTokenizer, TokenClassificationLoss, TokenClassifier -from nemo.collections.nlp.data.datasets import utils -from nemo.collections.nlp.utils.callbacks.punctuation_capitalization import ( +import nemo.collections.nlp.utils.common_nlp_utils +from nemo import logging +from nemo.collections.nlp.callbacks.punctuation_capitalization_callback import ( eval_epochs_done_callback, eval_iter_callback, ) +from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer +from nemo.collections.nlp.nm.data_layers import PunctuationCapitalizationDataLayer +from nemo.collections.nlp.nm.losses.token_classification_loss import TokenClassificationLoss +from nemo.collections.nlp.nm.trainables import TokenClassifier from nemo.utils.lr_policies import get_lr_policy # Parsing arguments @@ -40,9 +56,7 @@ parser.add_argument("--shuffle_data", action='store_true') parser.add_argument("--pretrained_bert_model", default="bert-base-uncased", type=str) parser.add_argument("--bert_checkpoint", default=None, type=str) -parser.add_argument( - "--bert_config", default=None, type=str, help="Path to bert config file in json format", -) +parser.add_argument("--bert_config", default=None, type=str, help="Path to bert config file in json format") parser.add_argument("--punct_classifier_checkpoint", default=None, type=str) parser.add_argument("--capit_classifier_checkpoint", default=None, type=str) parser.add_argument( @@ -67,9 +81,7 @@ help="The output directory where the model prediction\ and checkpoints will be written.", ) -parser.add_argument( - "--use_cache", action='store_true', help="Whether to cache preprocessed data", -) +parser.add_argument("--use_cache", action='store_true', help="Whether to cache preprocessed data") parser.add_argument( "--save_epoch_freq", default=1, @@ -84,9 +96,7 @@ help="Frequency of saving checkpoint \ '-1' - step checkpoint won't be saved", ) -parser.add_argument( - "--loss_step_freq", default=250, type=int, help="Frequency of printing loss", -) +parser.add_argument("--loss_step_freq", default=250, type=int, help="Frequency of printing loss") parser.add_argument( "--use_weighted_loss_punct", action='store_true', @@ -109,7 +119,7 @@ add_time_to_log_dir=True, ) -nemo.logging.info(args) +logging.info(args) output_file = f'{nf.work_dir}/output.txt' @@ -119,7 +129,7 @@ nemo_nlp.huggingface.BERT.list_pretrained_models() """ tokenizer = NemoBertTokenizer(args.pretrained_bert_model) - model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) + model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) else: """ Use this if you're using a BERT model that you pre-trained yourself. 
""" @@ -133,29 +143,20 @@ if args.bert_config is not None: with open(args.bert_config) as json_file: config = json.load(json_file) - model = nemo_nlp.huggingface.BERT(**config) + model = nemo_nlp.nm.trainables.huggingface.BERT(**config) else: - model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) + model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) model.restore_from(args.bert_checkpoint) - nemo.logging.info(f"Model restored from {args.bert_checkpoint}") + logging.info(f"Model restored from {args.bert_checkpoint}") hidden_size = model.hidden_size -punct_classifier = "TokenClassifier" -punct_loss = "TokenClassificationLoss" - -capit_classifier = "TokenClassifier" -capit_loss = "TokenClassificationLoss" -task_loss = None - def create_pipeline( - num_samples=-1, pad_label=args.none_label, max_seq_length=args.max_seq_length, batch_size=args.batch_size, - local_rank=args.local_rank, num_gpus=args.num_gpus, mode='train', punct_label_ids=None, @@ -165,10 +166,11 @@ def create_pipeline( use_cache=args.use_cache, dropout=args.fc_dropout, punct_num_layers=args.punct_num_fc_layers, + punct_classifier=TokenClassifier, + capit_classifier=TokenClassifier, ): - global punct_classifier, punct_loss, capit_classifier, capit_loss, task_loss - nemo.logging.info(f"Loading {mode} data...") + logging.info(f"Loading {mode} data...") shuffle = args.shuffle_data if mode == 'train' else False text_file = f'{args.data_dir}/text_{mode}.txt' @@ -187,7 +189,7 @@ def create_pipeline( [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).' ) - data_layer = nemo_nlp.BertPunctuationCapitalizationDataLayer( + data_layer = PunctuationCapitalizationDataLayer( tokenizer=tokenizer, text_file=text_file, label_file=label_file, @@ -196,15 +198,13 @@ def create_pipeline( capit_label_ids=capit_label_ids, max_seq_length=max_seq_length, batch_size=batch_size, - num_workers=0, - local_rank=local_rank, shuffle=shuffle, ignore_extra_tokens=ignore_extra_tokens, ignore_start_end=ignore_start_end, use_cache=use_cache, ) - (input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, punct_labels, capit_labels,) = data_layer() + (input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, punct_labels, capit_labels) = data_layer() if mode == 'train': punct_label_ids = data_layer.dataset.punct_label_ids @@ -212,12 +212,11 @@ def create_pipeline( class_weights = None if args.use_weighted_loss_punct: - nemo.logging.info(f"Using weighted loss for punctuation task") + logging.info(f"Using weighted loss for punctuation task") punct_label_freqs = data_layer.dataset.punct_label_frequencies - class_weights = utils.calc_class_weights(punct_label_freqs) + class_weights = nemo.collections.nlp.utils.common_nlp_utils.calc_class_weights(punct_label_freqs) # Initialize punctuation loss - punct_classifier = getattr(sys.modules[__name__], punct_classifier) punct_classifier = punct_classifier( hidden_size=hidden_size, num_classes=len(punct_label_ids), @@ -226,20 +225,17 @@ def create_pipeline( name='Punctuation', ) - punct_loss = getattr(sys.modules[__name__], punct_loss) - punct_loss = punct_loss(num_classes=len(punct_label_ids), class_weights=class_weights) + punct_loss = TokenClassificationLoss(num_classes=len(punct_label_ids), class_weights=class_weights) # Initialize capitalization loss - capit_classifier = getattr(sys.modules[__name__], capit_classifier) capit_classifier = capit_classifier( - hidden_size=hidden_size, num_classes=len(capit_label_ids), 
dropout=dropout, name='Capitalization', + hidden_size=hidden_size, num_classes=len(capit_label_ids), dropout=dropout, name='Capitalization' ) - capit_loss = getattr(sys.modules[__name__], capit_loss) - capit_loss = capit_loss(num_classes=len(capit_label_ids)) + capit_loss = TokenClassificationLoss(num_classes=len(capit_label_ids)) - task_loss = nemo_nlp.LossAggregatorNM(num_inputs=2) + task_loss = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2) - hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) + hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) punct_logits = punct_classifier(hidden_states=hidden_states) capit_logits = capit_classifier(hidden_states=hidden_states) @@ -253,31 +249,31 @@ def create_pipeline( losses = [task_loss, punct_loss, capit_loss] logits = [punct_logits, capit_logits] - return ( - losses, - logits, - steps_per_epoch, - punct_label_ids, - capit_label_ids, - ) + return losses, logits, steps_per_epoch, punct_label_ids, capit_label_ids, punct_classifier, capit_classifier else: - tensors_to_evaluate = [ - punct_logits, - capit_logits, - punct_labels, - capit_labels, - subtokens_mask, - ] + tensors_to_evaluate = [punct_logits, capit_logits, punct_labels, capit_labels, subtokens_mask] return tensors_to_evaluate, data_layer -(losses, train_logits, steps_per_epoch, punct_label_ids, capit_label_ids,) = create_pipeline() +( + losses, + train_logits, + steps_per_epoch, + punct_label_ids, + capit_label_ids, + punct_classifier, + capit_classifier, +) = create_pipeline() eval_tensors, data_layer = create_pipeline( - mode='dev', punct_label_ids=punct_label_ids, capit_label_ids=capit_label_ids, + mode='dev', + punct_label_ids=punct_label_ids, + capit_label_ids=capit_label_ids, + punct_classifier=punct_classifier, + capit_classifier=capit_classifier, ) -nemo.logging.info(f"steps_per_epoch = {steps_per_epoch}") +logging.info(f"steps_per_epoch = {steps_per_epoch}") # Create trainer and execute training action train_callback = nemo.core.SimpleLossLoggerCallback( @@ -298,11 +294,11 @@ def create_pipeline( ) ckpt_callback = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq ) lr_policy_fn = get_lr_policy( - args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion, + args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion ) nf.train( diff --git a/examples/nlp/punctuation_capitalization_infer.py b/examples/nlp/punctuation_capitalization_infer.py index 2456e64408f2..2d18fcda82fd 100644 --- a/examples/nlp/punctuation_capitalization_infer.py +++ b/examples/nlp/punctuation_capitalization_infer.py @@ -1,16 +1,33 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + import argparse import os import numpy as np -from sklearn.metrics import classification_report import nemo import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp import NemoBertTokenizer -from nemo.collections.nlp.utils.nlp_utils import get_vocab +from nemo import logging +from nemo.collections.nlp.data import NemoBertTokenizer +from nemo.collections.nlp.nm.data_layers import BertTokenClassificationInferDataLayer +from nemo.collections.nlp.utils.common_nlp_utils import get_vocab # Parsing arguments -parser = argparse.ArgumentParser(description='NER with pretrained BERT') +parser = argparse.ArgumentParser(description='Punctuation and capitalization detection inference') parser.add_argument("--max_seq_length", default=128, type=int) parser.add_argument("--fc_dropout", default=0, type=float) parser.add_argument("--punct_num_fc_layers", default=3, type=int) @@ -26,8 +43,7 @@ 'how are you', 'how\'s the weather today', 'okay', - 'we bought four shirts one mug and ten ' - + 'thousand titan rtx graphics cards the more ' + 'we bought four shirts one mug and ten thousand titan rtx graphics cards the more ' + 'you buy the more you save', ], help="Example: --queries 'san francisco' --queries 'la'", @@ -66,7 +82,7 @@ ) nf = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, optimization_level=args.amp_opt_level, log_dir=None, + backend=nemo.core.Backend.PyTorch, optimization_level=args.amp_opt_level, log_dir=None ) punct_labels_dict = get_vocab(args.punct_labels_dict) @@ -75,17 +91,17 @@ """ Load the pretrained BERT parameters See the list of pretrained models, call: -nemo_nlp.huggingface.BERT.list_pretrained_models() +nemo.collections.nlp.BERT.list_pretrained_models() """ -pretrained_bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) +pretrained_bert_model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) hidden_size = pretrained_bert_model.hidden_size tokenizer = NemoBertTokenizer(args.pretrained_bert_model) -data_layer = nemo_nlp.BertTokenClassificationInferDataLayer( - queries=args.queries, tokenizer=tokenizer, max_seq_length=args.max_seq_length, batch_size=1, +data_layer = BertTokenClassificationInferDataLayer( + queries=args.queries, tokenizer=tokenizer, max_seq_length=args.max_seq_length, batch_size=1 ) -punct_classifier = nemo_nlp.TokenClassifier( +punct_classifier = nemo_nlp.nm.trainables.TokenClassifier( hidden_size=hidden_size, num_classes=len(punct_labels_dict), dropout=args.fc_dropout, @@ -93,13 +109,13 @@ name='Punctuation', ) -capit_classifier = nemo_nlp.TokenClassifier( - hidden_size=hidden_size, num_classes=len(capit_labels_dict), dropout=args.fc_dropout, name='Capitalization', +capit_classifier = nemo_nlp.nm.trainables.TokenClassifier( + hidden_size=hidden_size, num_classes=len(capit_labels_dict), dropout=args.fc_dropout, name='Capitalization' ) input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask = data_layer() -hidden_states = pretrained_bert_model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) +hidden_states = pretrained_bert_model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) punct_logits = punct_classifier(hidden_states=hidden_states) capit_logits = capit_classifier(hidden_states=hidden_states) @@ -107,26 
+123,20 @@ ########################################################################### # Instantiate an optimizer to perform `infer` action -evaluated_tensors = nf.infer( - tensors=[punct_logits, capit_logits, subtokens_mask], checkpoint_dir=args.checkpoints_dir, -) +evaluated_tensors = nf.infer(tensors=[punct_logits, capit_logits, subtokens_mask], checkpoint_dir=args.checkpoints_dir) def concatenate(lists): return np.concatenate([t.cpu() for t in lists]) -def get_preds(logits): - return np.argmax(logits, 1) - - punct_logits, capit_logits, subtokens_mask = [concatenate(tensors) for tensors in evaluated_tensors] punct_preds = np.argmax(punct_logits, axis=2) capit_preds = np.argmax(capit_logits, axis=2) for i, query in enumerate(args.queries): - nemo.logging.info(f'Query: {query}') + logging.info(f'Query: {query}') punct_pred = punct_preds[i][subtokens_mask[i] > 0.5] capit_pred = capit_preds[i][subtokens_mask[i] > 0.5] @@ -145,4 +155,4 @@ def get_preds(logits): if punct_label != args.none_label: output += punct_label output += ' ' - nemo.logging.info(f'Combined: {output.strip()}\n') + logging.info(f'Combined: {output.strip()}\n') diff --git a/examples/nlp/squad.py b/examples/nlp/question_answering_squad.py similarity index 80% rename from examples/nlp/squad.py rename to examples/nlp/question_answering_squad.py index 627b8bd00300..44b737d57cab 100755 --- a/examples/nlp/squad.py +++ b/examples/nlp/question_answering_squad.py @@ -16,12 +16,14 @@ Some transformer of this code were adapted from the HuggingFace library at https://github.com/huggingface/transformers +""" +""" Download the Squad data by running the script: -examples/nlp/scripts/download_squad.py +examples/nlp/scripts/get_squad.py To finetune Squad v1.1 on pretrained BERT large uncased on 1 GPU: -python squad.py +python question_answering_squad.py --data_dir /path_to_data_dir/squad/v1.1 --work_dir /path_to_output_folder --bert_checkpoint /path_to_bert_checkpoint @@ -39,7 +41,7 @@ Huggingface pretrained checkpoints. 
To finetune Squad v1.1 on pretrained BERT large uncased on 8 GPU: -python -m torch.distributed.launch --nproc_per_node=8 squad.py +python -m torch.distributed.launch --nproc_per_node=8 question_answering_squad.py --amp_opt_level "O1" --data_dir /path_to_data_dir/squad/v1.1 --bert_checkpoint /path_to_bert_checkpoint @@ -62,9 +64,10 @@ import json import os -import nemo import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp.utils.callbacks.squad import eval_epochs_done_callback, eval_iter_callback +import nemo.core as nemo_core +from nemo import logging +from nemo.collections.nlp.callbacks.qa_squad_callback import eval_epochs_done_callback, eval_iter_callback from nemo.utils.lr_policies import get_lr_policy @@ -79,17 +82,13 @@ def parse_args(): "(or other data files) for the task.", ) parser.add_argument( - "--pretrained_bert_model", default="bert-base-uncased", type=str, help="Name of the pre-trained model", - ) - parser.add_argument( - "--checkpoint_dir", default=None, type=str, help="Checkpoint directory for inference.", - ) - parser.add_argument( - "--bert_checkpoint", default=None, type=str, help="Path to BERT model checkpoint for finetuning.", + "--pretrained_bert_model", default="bert-base-uncased", type=str, help="Name of the pre-trained model" ) + parser.add_argument("--checkpoint_dir", default=None, type=str, help="Checkpoint directory for inference.") parser.add_argument( - "--bert_config", default=None, type=str, help="Path to bert config file in json format", + "--bert_checkpoint", default=None, type=str, help="Path to BERT model checkpoint for finetuning." ) + parser.add_argument("--bert_config", default=None, type=str, help="Path to bert config file in json format") parser.add_argument( "--tokenizer_model", default="tokenizer.model", @@ -107,23 +106,15 @@ def parse_args(): parser.add_argument("--lr_policy", default="WarmupAnnealing", type=str) parser.add_argument("--lr", default=3e-5, type=float, help="The initial learning rate.") parser.add_argument("--lr_warmup_proportion", default=0.0, type=float) - parser.add_argument( - "--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.", - ) - parser.add_argument( - "--num_epochs", default=2, type=int, help="Total number of training epochs to perform.", - ) - parser.add_argument( - "--batch_size", default=8, type=int, help="Batch size per GPU/CPU for training/evaluation.", - ) + parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") + parser.add_argument("--num_epochs", default=2, type=int, help="Total number of training epochs to perform.") + parser.add_argument("--batch_size", default=8, type=int, help="Batch size per GPU/CPU for training/evaluation.") parser.add_argument( "--do_lower_case", action='store_true', help="Whether to lower case the input text. 
" "True for uncased models, False for cased models.", ) - parser.add_argument( - "--evaluation_only", action='store_true', help="Whether to only do evaluation.", - ) + parser.add_argument("--evaluation_only", action='store_true', help="Whether to only do evaluation.") parser.add_argument( "--doc_stride", default=128, @@ -149,11 +140,9 @@ def parse_args(): ) parser.add_argument("--num_gpus", default=1, type=int, help="Number of GPUs") parser.add_argument( - "--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"], help="01/02 to enable mixed precision", - ) - parser.add_argument( - "--local_rank", type=int, default=None, help="For distributed training: local_rank", + "--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"], help="01/02 to enable mixed precision" ) + parser.add_argument("--local_rank", type=int, default=None, help="For distributed training: local_rank") parser.add_argument( "--work_dir", default='output_squad', @@ -172,12 +161,8 @@ def parse_args(): type=int, help="Frequency of saving checkpoint " "'-1' - step checkpoint won't be saved", ) - parser.add_argument( - "--loss_step_freq", default=100, type=int, help="Frequency of printing loss", - ) - parser.add_argument( - "--eval_step_freq", default=500, type=int, help="Frequency of evaluation on dev data", - ) + parser.add_argument("--loss_step_freq", default=100, type=int, help="Frequency of printing loss") + parser.add_argument("--eval_step_freq", default=500, type=int, help="Frequency of evaluation on dev data") parser.add_argument( "--version_2_with_negative", action="store_true", @@ -195,9 +180,7 @@ def parse_args(): type=int, help="The total number of n-best predictions to " "generate in the nbest_predictions.json output file.", ) - parser.add_argument( - "--batches_per_step", default=1, type=int, help="Number of iterations per step.", - ) + parser.add_argument("--batches_per_step", default=1, type=int, help="Number of iterations per step.") parser.add_argument( "--max_answer_length", default=30, @@ -232,7 +215,7 @@ def create_pipeline( batches_per_step=1, mode="train", ): - data_layer = nemo_nlp.BertQuestionAnsweringDataLayer( + data_layer = nemo_nlp.nm.data_layers.qa_squad_datalayer.BertQuestionAnsweringDataLayer( mode=mode, version_2_with_negative=version_2_with_negative, batch_size=batch_size, @@ -246,19 +229,19 @@ def create_pipeline( input_data = data_layer() hidden_states = model( - input_ids=input_data.input_ids, token_type_ids=input_data.input_type_ids, attention_mask=input_data.input_mask, + input_ids=input_data.input_ids, token_type_ids=input_data.input_type_ids, attention_mask=input_data.input_mask ) qa_output = head(hidden_states=hidden_states) loss_output = loss_fn( - logits=qa_output, start_positions=input_data.start_positions, end_positions=input_data.end_positions, + logits=qa_output, start_positions=input_data.start_positions, end_positions=input_data.end_positions ) steps_per_epoch = len(data_layer) // (batch_size * num_gpus * batches_per_step) return ( loss_output.loss, steps_per_epoch, - [loss_output.start_logits, loss_output.end_logits, input_data.unique_ids,], + [loss_output.start_logits, loss_output.end_logits, input_data.unique_ids], data_layer, ) @@ -266,9 +249,7 @@ def create_pipeline( if __name__ == "__main__": args = parse_args() if not os.path.exists(args.data_dir): - raise FileNotFoundError( - "SQUAD datasets not found. Datasets can be " "obtained using scripts/download_squad.py" - ) + raise FileNotFoundError("SQUAD datasets not found. 
Datasets can be " "obtained using scripts/get_squad.py") if not args.version_2_with_negative: args.work_dir = f'{args.work_dir}/squad1.1' @@ -276,8 +257,8 @@ def create_pipeline( args.work_dir = f'{args.work_dir}/squad2.0' # Instantiate neural factory with supported backend - nf = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, + nf = nemo_core.NeuralModuleFactory( + backend=nemo_core.Backend.PyTorch, local_rank=args.local_rank, optimization_level=args.amp_opt_level, log_dir=args.work_dir, @@ -288,7 +269,7 @@ def create_pipeline( if args.tokenizer == "sentencepiece": try: - tokenizer = nemo_nlp.SentencePieceTokenizer(model_path=args.tokenizer_model) + tokenizer = nemo_nlp.data.utilsSentencePieceTokenizer(model_path=args.tokenizer_model) except Exception: raise ValueError( "Using --tokenizer=sentencepiece \ @@ -296,25 +277,27 @@ def create_pipeline( ) tokenizer.add_special_tokens(["[CLS]", "[SEP]"]) elif args.tokenizer == "nemobert": - tokenizer = nemo_nlp.NemoBertTokenizer(args.pretrained_bert_model) + tokenizer = nemo_nlp.data.NemoBertTokenizer(args.pretrained_bert_model) else: raise ValueError(f"received unexpected tokenizer '{args.tokenizer}'") if args.bert_config is not None: with open(args.bert_config) as json_file: config = json.load(json_file) - model = nemo_nlp.huggingface.BERT(**config) + model = nemo_nlp.nm.trainables.huggingface.BERT(**config) else: """ Use this if you're using a standard BERT model. To see the list of pretrained models, call: nemo_nlp.huggingface.BERT.list_pretrained_models() """ - model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) + model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) hidden_size = model.hidden_size - qa_head = nemo_nlp.TokenClassifier(hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False) - squad_loss = nemo_nlp.QuestionAnsweringLoss() + qa_head = nemo_nlp.nm.trainables.token_classification_nm.TokenClassifier( + hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False + ) + squad_loss = nemo_nlp.nm.losses.QuestionAnsweringLoss() if args.bert_checkpoint is not None: model.restore_from(args.bert_checkpoint) @@ -349,8 +332,8 @@ def create_pipeline( ) if not args.evaluation_only: - nemo.logging.info(f"steps_per_epoch = {train_steps_per_epoch}") - callback_train = nemo.core.SimpleLossLoggerCallback( + logging.info(f"steps_per_epoch = {train_steps_per_epoch}") + callback_train = nemo_core.SimpleLossLoggerCallback( tensors=[train_loss], print_func=lambda x: print("Loss: {:.3f}".format(x[0].item())), get_tb_values=lambda x: [["loss", x[0]]], @@ -358,10 +341,10 @@ def create_pipeline( tb_writer=nf.tb_writer, ) - ckpt_callback = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, + ckpt_callback = nemo_core.CheckpointCallback( + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq ) - callbacks_eval = nemo.core.EvaluatorCallback( + callbacks_eval = nemo_core.EvaluatorCallback( eval_tensors=eval_output, user_iter_callback=lambda x, y: eval_iter_callback(x, y), user_epochs_done_callback=lambda x: eval_epochs_done_callback( @@ -378,9 +361,7 @@ def create_pipeline( ) lr_policy_fn = get_lr_policy( - args.lr_policy, - total_steps=args.num_epochs * train_steps_per_epoch, - warmup_ratio=args.lr_warmup_proportion, + args.lr_policy, total_steps=args.num_epochs * train_steps_per_epoch, warmup_ratio=args.lr_warmup_proportion ) 
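# Editor's note, not part of the original patch: a hedged sketch of the kind of schedule
# get_lr_policy builds here. The first warmup_ratio * total_steps steps ramp the learning
# rate up linearly and the remainder anneals it back toward zero; the exact annealing
# curve depends on the chosen policy, so the linear decay below is only an illustrative
# assumption (3e-5 is this script's default --lr).
def toy_warmup_anneal_lr(step, total_steps, base_lr=3e-5, warmup_ratio=0.0):
    warmup_steps = int(warmup_ratio * total_steps)
    if warmup_steps and step < warmup_steps:
        return base_lr * step / warmup_steps          # linear warmup
    remaining = max(total_steps - warmup_steps, 1)
    progress = (step - warmup_steps) / remaining
    return base_lr * max(0.0, 1.0 - progress)         # illustrative linear annealing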
nf.train( @@ -416,7 +397,7 @@ def create_pipeline( null_score_diff_threshold=args.null_score_diff_threshold, do_lower_case=args.do_lower_case, ) - nemo.logging.info(f"exact_match: {exact_match}, f1: {f1}") + logging.info(f"exact_match: {exact_match}, f1: {f1}") if args.output_prediction_file is not None: with open(args.output_prediction_file, "w") as writer: writer.write(json.dumps(all_predictions, indent=4) + "\n") diff --git a/scripts/convert_iob_format_to_token_classification_format.py b/examples/nlp/scripts/convert_iob_format_to_token_classification_format.py similarity index 83% rename from scripts/convert_iob_format_to_token_classification_format.py rename to examples/nlp/scripts/convert_iob_format_to_token_classification_format.py index e30345e547d8..0e95f62aa186 100644 --- a/scripts/convert_iob_format_to_token_classification_format.py +++ b/examples/nlp/scripts/convert_iob_format_to_token_classification_format.py @@ -1,21 +1,24 @@ -# Copyright (C) NVIDIA CORPORATION. All Rights Reserved. +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. # -# Licensed under the Apache License, Version 2.0 (the “License”); +# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, +# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License.**** +# limitations under the License. +# ============================================================================= import argparse -import logging import os +from nemo import logging + def __convert_data(in_file, out_text, out_labels): """ diff --git a/nemo/collections/nlp/utils/download_squad.py b/examples/nlp/scripts/get_squad.py similarity index 79% rename from nemo/collections/nlp/utils/download_squad.py rename to examples/nlp/scripts/get_squad.py index 80c4739e7b62..037d1b3d3fbb 100755 --- a/nemo/collections/nlp/utils/download_squad.py +++ b/examples/nlp/scripts/get_squad.py @@ -1,4 +1,6 @@ -# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -10,11 +12,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+# ============================================================================= import argparse import os import urllib.request +from nemo import logging + class SquadDownloader: def __init__(self, save_path): @@ -32,12 +37,8 @@ def __init__(self, save_path): self.download_urls = { 'https://rajpurkar.github.io/SQuAD-explorer' '/dataset/train-v1.1.json': 'v1.1/train-v1.1.json', 'https://rajpurkar.github.io/SQuAD-explorer' '/dataset/dev-v1.1.json': 'v1.1/dev-v1.1.json', - 'https://worksheets.codalab.org/rest/bundles' - '/0xbcd57bee090b421c982906709c8c27e1/contents/blob/': 'v1.1/evaluate-v1.1.py', 'https://rajpurkar.github.io/SQuAD-explorer' '/dataset/train-v2.0.json': 'v2.0/train-v2.0.json', 'https://rajpurkar.github.io/SQuAD-explorer' '/dataset/dev-v2.0.json': 'v2.0/dev-v2.0.json', - 'https://worksheets.codalab.org/rest/bundles' - '/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/': 'v2.0/evaluate-v2.0.py', } def download(self): @@ -45,9 +46,9 @@ def download(self): url = item file = self.download_urls[item] - print('Downloading:', url) + logging.info('Downloading:', url) if os.path.isfile(self.save_path + '/' + file): - print('** Download file already exists, skipping download') + logging.info('** Download file already exists, skipping download') else: response = urllib.request.urlopen(url) with open(self.save_path + '/' + file, "wb") as handle: @@ -61,8 +62,9 @@ def download(self): type=str, required=False, help='directory to store data', - default=os.path.split(os.path.abspath(__file__))[0] + '/../data/lm', + default=os.path.split(os.path.abspath(__file__))[0] + '../../../../../../examples/data/lm', ) args = parser.parse_args() + logging.info(args.destDir) squad_dl = SquadDownloader(args.destDir) squad_dl.download() diff --git a/scripts/get_tatoeba_data.py b/examples/nlp/scripts/get_tatoeba.py similarity index 89% rename from scripts/get_tatoeba_data.py rename to examples/nlp/scripts/get_tatoeba.py index 47cb09791b72..0da3137e54ee 100644 --- a/scripts/get_tatoeba_data.py +++ b/examples/nlp/scripts/get_tatoeba.py @@ -1,16 +1,18 @@ -# Copyright (C) NVIDIA CORPORATION. All Rights Reserved. +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. # -# Licensed under the Apache License, Version 2.0 (the “License”); +# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an “AS IS” BASIS, +# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License.**** +# limitations under the License. 
+# ============================================================================= import argparse import logging @@ -19,7 +21,8 @@ import re import string import urllib.request -from collections import Counter + +from nemo import logging URL = {'tatoeba': 'https://downloads.tatoeba.org/exports/sentences.csv'} @@ -182,18 +185,12 @@ def __delete_file(file_to_del): parser = argparse.ArgumentParser(description='Prepare tatoeba dataset') parser.add_argument("--data_dir", required=True, type=str) parser.add_argument("--dataset", default='tatoeba', type=str) + parser.add_argument("--num_samples", default=-1, type=int, help='-1 to use the whole dataset') + parser.add_argument("--percent_to_cut", default=0, type=float, help='Percent of sentences to cut in the middle') parser.add_argument( - "--num_samples", default=-1, type=int, help='-1 to use the whole dataset', - ) - parser.add_argument( - "--percent_to_cut", default=0, type=float, help='Percent of sentences to cut in the middle', - ) - parser.add_argument( - "--num_lines_to_combine", default=1, type=int, help='Number of lines to combine into single example', - ) - parser.add_argument( - "--percent_dev", default=0.2, type=float, help='Size of the dev set, float', + "--num_lines_to_combine", default=1, type=int, help='Number of lines to combine into single example' ) + parser.add_argument("--percent_dev", default=0.2, type=float, help='Size of the dev set, float') parser.add_argument("--clean_dir", action='store_true') args = parser.parse_args() @@ -210,7 +207,7 @@ def __delete_file(file_to_del): logging.info(f'Processing English sentences...') clean_eng_sentences = os.path.join(args.data_dir, 'clean_eng_sentences.txt') __process_english_sentences( - tatoeba_dataset, clean_eng_sentences, args.percent_to_cut, args.num_lines_to_combine, args.num_samples, + tatoeba_dataset, clean_eng_sentences, args.percent_to_cut, args.num_lines_to_combine, args.num_samples ) train_file = os.path.join(args.data_dir, 'train.txt') diff --git a/examples/nlp/scripts/process_wiki_zh.py b/examples/nlp/scripts/process_wiki_zh.py index a7f195fbb9c0..58d944a5c727 100755 --- a/examples/nlp/scripts/process_wiki_zh.py +++ b/examples/nlp/scripts/process_wiki_zh.py @@ -1,6 +1,7 @@ #!/usr/bin/env python + # ============================================================================= -# Copyright 2019 NVIDIA Corporation. All Rights Reserved. +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -23,9 +24,7 @@ from functools import partial -def create_vocab( - lines, vocab_file, min_frequency=3, special_symbols=["[PAD]", "[SEP]", "[CLS]", "[MASK]", "[UNK]"], -): +def create_vocab(lines, vocab_file, min_frequency=3, special_symbols=["[PAD]", "[SEP]", "[CLS]", "[MASK]", "[UNK]"]): """Create vocabulary from lines""" # Count word occurency vocab = {} @@ -140,11 +139,9 @@ def process(data_dir, output_dir=None, min_frequency=3, max_files=-1): parser.add_argument("--data_dir", default="/raid/data/wiki_zh", type=str) parser.add_argument("--output_dir", default="./", type=str) parser.add_argument( - "--min_frequency", default=0, type=int, help="Characters occuring less frequently " "will be filtered out", - ) - parser.add_argument( - "--max_files", default=-1, type=int, help="Max number of dirs to process", + "--min_frequency", default=0, type=int, help="Characters occuring less frequently " "will be filtered out" ) + parser.add_argument("--max_files", default=-1, type=int, help="Max number of dirs to process") args = parser.parse_args() process(args.data_dir, args.output_dir, args.min_frequency, args.max_files) diff --git a/examples/nlp/sentence_classification_with_bert.py b/examples/nlp/text_classification_with_bert.py similarity index 72% rename from examples/nlp/sentence_classification_with_bert.py rename to examples/nlp/text_classification_with_bert.py index 2cd622e65ac3..4dd8535e2347 100644 --- a/examples/nlp/sentence_classification_with_bert.py +++ b/examples/nlp/text_classification_with_bert.py @@ -1,15 +1,30 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + import argparse import math import numpy as np -import torch -from torch import nn from transformers import BertTokenizer -import nemo -import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp.data.datasets.utils import SentenceClassificationDataDesc -from nemo.collections.nlp.utils.callbacks.sentence_classification import eval_epochs_done_callback, eval_iter_callback +import nemo.collections.nlp.nm.data_layers.text_classification_datalayer +import nemo.collections.nlp.nm.trainables.common.sequence_classification_nm +from nemo import logging +from nemo.collections.nlp.callbacks.text_classification_callback import eval_epochs_done_callback, eval_iter_callback +from nemo.collections.nlp.data.datasets.text_classification_dataset import SentenceClassificationDataDesc from nemo.utils.lr_policies import get_lr_policy # Parsing arguments @@ -45,9 +60,7 @@ parser.add_argument("--amp_opt_level", default="O0", type=str, choices=["O0", "O1", "O2"]) parser.add_argument("--do_lower_case", action='store_true') parser.add_argument("--shuffle_data", action='store_true') -parser.add_argument( - "--class_balancing", default="None", type=str, choices=["None", "weighted_loss"], -) +parser.add_argument("--class_balancing", default="None", type=str, choices=["None", "weighted_loss"]) args = parser.parse_args() @@ -68,10 +81,14 @@ """ if args.bert_checkpoint and args.bert_config: - pretrained_bert_model = nemo_nlp.huggingface.BERT(config_filename=args.bert_config) + pretrained_bert_model = nemo.collections.nlp.nm.trainables.common.huggingface.BERT( + config_filename=args.bert_config + ) pretrained_bert_model.restore_from(args.bert_checkpoint) else: - pretrained_bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) + pretrained_bert_model = nemo.collections.nlp.nm.trainables.common.huggingface.BERT( + pretrained_model_name=args.pretrained_bert_model + ) hidden_size = pretrained_bert_model.hidden_size tokenizer = BertTokenizer.from_pretrained(args.pretrained_bert_model) @@ -79,8 +96,8 @@ data_desc = SentenceClassificationDataDesc(args.dataset_name, args.data_dir, args.do_lower_case) # Create sentence classification loss on top -classifier = nemo_nlp.SequenceClassifier( - hidden_size=hidden_size, num_classes=data_desc.num_labels, dropout=args.fc_dropout, +classifier = nemo.collections.nlp.nm.trainables.common.sequence_classification_nm.SequenceClassifier( + hidden_size=hidden_size, num_classes=data_desc.num_labels, dropout=args.fc_dropout ) if args.class_balancing == 'weighted_loss': @@ -91,31 +108,29 @@ def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mode='train'): - nemo.logging.info(f"Loading {mode} data...") + logging.info(f"Loading {mode} data...") data_file = f'{data_desc.data_dir}/{mode}.tsv' shuffle = args.shuffle_data if mode == 'train' else False - data_layer = nemo_nlp.BertSentenceClassificationDataLayer( + data_layer = nemo.collections.nlp.nm.data_layers.text_classification_datalayer.BertSentenceClassificationDataLayer( input_file=data_file, tokenizer=tokenizer, max_seq_length=args.max_seq_length, num_samples=num_samples, shuffle=shuffle, batch_size=batch_size, - # num_workers=0, - # local_rank=local_rank, ) ids, type_ids, input_mask, labels = data_layer() data_size = len(data_layer) if data_size < batch_size: - nemo.logging.warning("Batch_size is larger than the dataset size") - nemo.logging.warning("Reducing batch_size to dataset size") + 
logging.warning("Batch_size is larger than the dataset size") + logging.warning("Reducing batch_size to dataset size") batch_size = data_size steps_per_epoch = math.ceil(data_size / (batch_size * num_gpus)) - nemo.logging.info(f"Steps_per_epoch = {steps_per_epoch}") + logging.info(f"Steps_per_epoch = {steps_per_epoch}") hidden_states = pretrained_bert_model(input_ids=ids, token_type_ids=type_ids, attention_mask=input_mask) @@ -164,11 +179,11 @@ def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mod # Create callback to save checkpoints ckpt_callback = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq ) lr_policy_fn = get_lr_policy( - args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion, + args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion ) nf.train( @@ -176,5 +191,5 @@ def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mod callbacks=[train_callback, eval_callback, ckpt_callback], lr_policy=lr_policy_fn, optimizer=args.optimizer_kind, - optimization_params={"num_epochs": args.num_epochs, "lr": args.lr, "weight_decay": args.weight_decay,}, + optimization_params={"num_epochs": args.num_epochs, "lr": args.lr, "weight_decay": args.weight_decay}, ) diff --git a/examples/nlp/token_classification.py b/examples/nlp/token_classification.py index 43749c299e05..3a88bad1a958 100644 --- a/examples/nlp/token_classification.py +++ b/examples/nlp/token_classification.py @@ -1,22 +1,35 @@ -# pylint: disable=invalid-name +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= import argparse import json import os -import sys -import nemo import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp import NemoBertTokenizer, SentencePieceTokenizer, TokenClassificationLoss, TokenClassifier -from nemo.collections.nlp.data.datasets import utils -from nemo.collections.nlp.utils.callbacks.token_classification import eval_epochs_done_callback, eval_iter_callback +import nemo.collections.nlp.utils.common_nlp_utils +from nemo import logging +from nemo.collections.nlp.callbacks.token_classification_callback import eval_epochs_done_callback, eval_iter_callback +from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer +from nemo.collections.nlp.nm.data_layers import BertTokenClassificationDataLayer +from nemo.collections.nlp.nm.losses import TokenClassificationLoss +from nemo.collections.nlp.nm.trainables import TokenClassifier from nemo.utils.lr_policies import get_lr_policy # Parsing arguments -parser = argparse.ArgumentParser( - description="Token classification\ - with pretrained BERT" -) +parser = argparse.ArgumentParser(description="Token classification with pretrained BERT") parser.add_argument("--local_rank", default=None, type=int) parser.add_argument("--batch_size", default=8, type=int) parser.add_argument("--max_seq_length", default=128, type=int) @@ -37,54 +50,41 @@ parser.add_argument("--shuffle_data", action='store_false') parser.add_argument("--pretrained_bert_model", default="bert-base-cased", type=str) parser.add_argument("--bert_checkpoint", default=None, type=str) -parser.add_argument( - "--bert_config", default=None, type=str, help="Path to bert config file in json format", -) +parser.add_argument("--bert_config", default=None, type=str, help="Path to bert config file in json format") parser.add_argument( "--tokenizer_model", default="tokenizer.model", type=str, - help="Path to pretrained tokenizer model, \ - only used if --tokenizer is sentencepiece", + help="Path to pretrained tokenizer model, only used if --tokenizer is sentencepiece", ) parser.add_argument( "--tokenizer", default="nemobert", type=str, choices=["nemobert", "sentencepiece"], - help="tokenizer to use, \ - only relevant when using custom pretrained checkpoint.", + help="tokenizer to use, only relevant when using custom pretrained checkpoint.", ) parser.add_argument( "--work_dir", default='output', type=str, - help="The output directory where the model prediction\ - and checkpoints will be written.", -) -parser.add_argument( - "--use_cache", action='store_true', help="Whether to cache preprocessed data", + help="The output directory where the model prediction and checkpoints will be written.", ) +parser.add_argument("--use_cache", action='store_true', help="Whether to cache preprocessed data") parser.add_argument( "--save_epoch_freq", default=1, type=int, - help="Frequency of saving checkpoint\ - '-1' - step checkpoint won't be saved", + help="Frequency of saving checkpoint '-1' - step checkpoint won't be saved", ) parser.add_argument( "--save_step_freq", default=-1, type=int, - help="Frequency of saving checkpoint \ - '-1' - step checkpoint won't be saved", -) -parser.add_argument( - "--loss_step_freq", default=250, type=int, help="Frequency of printing loss", -) -parser.add_argument( - "--use_weighted_loss", action='store_true', help="Flag to indicate whether to use weighted loss", + help="Frequency of saving checkpoint '-1' - step checkpoint won't be saved", ) 
+parser.add_argument("--loss_step_freq", default=250, type=int, help="Frequency of printing loss") +parser.add_argument("--use_weighted_loss", action='store_true', help="Flag to indicate whether to use weighted loss") args = parser.parse_args() @@ -106,17 +106,17 @@ add_time_to_log_dir=True, ) -nemo.logging.info(args) +logging.info(args) output_file = f'{nf.work_dir}/output.txt' if args.bert_checkpoint is None: """ Use this if you're using a standard BERT model. To see the list of pretrained models, call: - nemo_nlp.huggingface.BERT.list_pretrained_models() + nemo_nlp.nm.trainables.huggingface.BERT.list_pretrained_models() """ tokenizer = NemoBertTokenizer(args.pretrained_bert_model) - model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) + model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) else: """ Use this if you're using a BERT model that you pre-trained yourself. """ @@ -130,25 +130,20 @@ if args.bert_config is not None: with open(args.bert_config) as json_file: config = json.load(json_file) - model = nemo_nlp.huggingface.BERT(**config) + model = nemo_nlp.nm.trainables.huggingface.BERT(**config) else: - model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) + model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) model.restore_from(args.bert_checkpoint) - nemo.logging.info(f"Model restored from {args.bert_checkpoint}") + logging.info(f"Model restored from {args.bert_checkpoint}") hidden_size = model.hidden_size -classifier = "TokenClassifier" -task_loss = "TokenClassificationLoss" - def create_pipeline( - num_samples=-1, pad_label=args.none_label, max_seq_length=args.max_seq_length, batch_size=args.batch_size, - local_rank=args.local_rank, num_gpus=args.num_gpus, mode='train', label_ids=None, @@ -157,10 +152,10 @@ def create_pipeline( use_cache=args.use_cache, dropout=args.fc_dropout, num_layers=args.num_fc_layers, + classifier=TokenClassifier, ): - global classifier, task_loss - nemo.logging.info(f"Loading {mode} data...") + logging.info(f"Loading {mode} data...") shuffle = args.shuffle_data if mode == 'train' else False text_file = f'{args.data_dir}/text_{mode}.txt' @@ -179,7 +174,7 @@ def create_pipeline( [LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).' 
) - data_layer = nemo_nlp.BertTokenClassificationDataLayer( + data_layer = BertTokenClassificationDataLayer( tokenizer=tokenizer, text_file=text_file, label_file=label_file, @@ -187,54 +182,49 @@ def create_pipeline( label_ids=label_ids, max_seq_length=max_seq_length, batch_size=batch_size, - num_workers=0, - local_rank=local_rank, shuffle=shuffle, ignore_extra_tokens=ignore_extra_tokens, ignore_start_end=ignore_start_end, use_cache=use_cache, ) - (input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, labels,) = data_layer() + (input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, labels) = data_layer() if mode == 'train': label_ids = data_layer.dataset.label_ids class_weights = None if args.use_weighted_loss: - nemo.logging.info(f"Using weighted loss") + logging.info(f"Using weighted loss") label_freqs = data_layer.dataset.label_frequencies - class_weights = utils.calc_class_weights(label_freqs) + class_weights = nemo.collections.nlp.utils.common_nlp_utils.calc_class_weights(label_freqs) - nemo.logging.info(f"class_weights: {class_weights}") + logging.info(f"class_weights: {class_weights}") - classifier = getattr(sys.modules[__name__], classifier) classifier = classifier( - hidden_size=hidden_size, num_classes=len(label_ids), dropout=dropout, num_layers=num_layers, + hidden_size=hidden_size, num_classes=len(label_ids), dropout=dropout, num_layers=num_layers ) - task_loss = getattr(sys.modules[__name__], task_loss) - task_loss = task_loss(num_classes=len(label_ids), class_weights=class_weights) - - hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) + task_loss = TokenClassificationLoss(num_classes=len(label_ids), class_weights=class_weights) + hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) logits = classifier(hidden_states=hidden_states) - loss = task_loss(logits=logits, labels=labels, loss_mask=loss_mask) - - steps_per_epoch = len(data_layer) // (batch_size * num_gpus) if mode == 'train': + loss = task_loss(logits=logits, labels=labels, loss_mask=loss_mask) + steps_per_epoch = len(data_layer) // (batch_size * num_gpus) tensors_to_evaluate = [loss, logits] + return tensors_to_evaluate, loss, steps_per_epoch, label_ids, classifier else: tensors_to_evaluate = [logits, labels, subtokens_mask] - return tensors_to_evaluate, loss, steps_per_epoch, label_ids, data_layer + return tensors_to_evaluate, data_layer -train_tensors, train_loss, steps_per_epoch, label_ids, _ = create_pipeline() +train_tensors, train_loss, steps_per_epoch, label_ids, classifier = create_pipeline() -eval_tensors, _, _, _, data_layer = create_pipeline(mode='dev', label_ids=label_ids) +eval_tensors, data_layer = create_pipeline(mode='dev', label_ids=label_ids, classifier=classifier) -nemo.logging.info(f"steps_per_epoch = {steps_per_epoch}") +logging.info(f"steps_per_epoch = {steps_per_epoch}") # Create trainer and execute training action train_callback = nemo.core.SimpleLossLoggerCallback( @@ -253,11 +243,11 @@ def create_pipeline( ) ckpt_callback = nemo.core.CheckpointCallback( - folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq, + folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq ) lr_policy_fn = get_lr_policy( - args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion, + args.lr_policy, total_steps=args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion ) 
nf.train( diff --git a/examples/nlp/token_classification_infer.py b/examples/nlp/token_classification_infer.py index ae272f86d210..642be6d149b1 100644 --- a/examples/nlp/token_classification_infer.py +++ b/examples/nlp/token_classification_infer.py @@ -1,13 +1,30 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + import argparse import os import numpy as np -from sklearn.metrics import classification_report import nemo import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp import NemoBertTokenizer -from nemo.collections.nlp.utils.nlp_utils import get_vocab +from nemo import logging +from nemo.collections.nlp.data import NemoBertTokenizer +from nemo.collections.nlp.nm.trainables import TokenClassifier +from nemo.collections.nlp.utils.common_nlp_utils import get_vocab # Parsing arguments parser = argparse.ArgumentParser(description='NER with pretrained BERT') @@ -46,7 +63,7 @@ raise ValueError(f'Dictionary with ids to labels not found at {args.labels_dict}') nf = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, optimization_level=args.amp_opt_level, log_dir=None, + backend=nemo.core.Backend.PyTorch, optimization_level=args.amp_opt_level, log_dir=None ) labels_dict = get_vocab(args.labels_dict) @@ -55,35 +72,31 @@ See the list of pretrained models, call: nemo_nlp.huggingface.BERT.list_pretrained_models() """ -pretrained_bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) +pretrained_bert_model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=args.pretrained_bert_model) hidden_size = pretrained_bert_model.hidden_size tokenizer = NemoBertTokenizer(args.pretrained_bert_model) -data_layer = nemo_nlp.BertTokenClassificationInferDataLayer( - queries=args.queries, tokenizer=tokenizer, max_seq_length=args.max_seq_length, batch_size=1, +data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationInferDataLayer( + queries=args.queries, tokenizer=tokenizer, max_seq_length=args.max_seq_length, batch_size=1 ) -classifier = nemo_nlp.TokenClassifier(hidden_size=hidden_size, num_classes=len(labels_dict), dropout=args.fc_dropout,) +classifier = TokenClassifier(hidden_size=hidden_size, num_classes=len(labels_dict), dropout=args.fc_dropout) input_ids, input_type_ids, input_mask, _, subtokens_mask = data_layer() -hidden_states = pretrained_bert_model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) +hidden_states = pretrained_bert_model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) logits = classifier(hidden_states=hidden_states) ########################################################################### # Instantiate an optimizer to perform `infer` action -evaluated_tensors = nf.infer(tensors=[logits, subtokens_mask], 
checkpoint_dir=args.work_dir,) +evaluated_tensors = nf.infer(tensors=[logits, subtokens_mask], checkpoint_dir=args.work_dir) def concatenate(lists): return np.concatenate([t.cpu() for t in lists]) -def get_preds(logits): - return np.argmax(logits, 1) - - def add_brackets(text, add=args.add_brackets): return '[' + text + ']' if add else text @@ -93,7 +106,7 @@ def add_brackets(text, add=args.add_brackets): preds = np.argmax(logits, axis=2) for i, query in enumerate(args.queries): - nemo.logging.info(f'Query: {query}') + logging.info(f'Query: {query}') pred = preds[i][subtokens_mask[i] > 0.5] words = query.strip().split() @@ -108,4 +121,4 @@ def add_brackets(text, add=args.add_brackets): label = add_brackets(label) output += label output += ' ' - nemo.logging.info(f'Combined: {output.strip()}') + logging.info(f'Combined: {output.strip()}') diff --git a/examples/tts/tacotron2.py b/examples/tts/tacotron2.py index 2980ddf3e701..332da22e0be5 100644 --- a/examples/tts/tacotron2.py +++ b/examples/tts/tacotron2.py @@ -164,7 +164,7 @@ def create_train_dag( # Callbacks needed to print info to console and Tensorboard train_callback = nemo.core.SimpleLossLoggerCallback( - tensors=[loss_t, spec_target, mel_postnet, gate, gate_target, alignments,], + tensors=[loss_t, spec_target, mel_postnet, gate, gate_target, alignments], print_func=lambda x: nemo.logging.info(f"Loss: {x[0].data}"), log_to_tb_func=partial(tacotron2_log_to_tb_func, log_images=True, log_images_freq=log_freq), tb_writer=neural_factory.tb_writer, diff --git a/nemo/backends/pytorch/nm.py b/nemo/backends/pytorch/nm.py index 0a92cfe5cdc9..e759035f6a9d 100644 --- a/nemo/backends/pytorch/nm.py +++ b/nemo/backends/pytorch/nm.py @@ -205,7 +205,7 @@ def __init__(self): # (when the time for that will come;)) self._batch_size = 1 self._num_workers = os.cpu_count() # Use all CPUs by default. - self._shuffle = True # Shuffle by default. + self._shuffle = False # Don't shuffle by default. @property def input_ports(self): diff --git a/nemo/collections/nlp/__init__.py b/nemo/collections/nlp/__init__.py index 33c4a8aea2b2..06f6cd875da6 100644 --- a/nemo/collections/nlp/__init__.py +++ b/nemo/collections/nlp/__init__.py @@ -1,4 +1,5 @@ -# Copyright 2019 NVIDIA. All Rights Reserved. +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,10 +14,9 @@ # limitations under the License. # ============================================================================= -import nemo -from .data import * -from .huggingface import * -from .modules import * -from .transformer import * +import nemo.collections.nlp.callbacks +import nemo.collections.nlp.data +import nemo.collections.nlp.nm +import nemo.collections.nlp.utils backend = nemo.core.Backend.PyTorch diff --git a/nemo/collections/nlp/callbacks/__init__.py b/nemo/collections/nlp/callbacks/__init__.py new file mode 100644 index 000000000000..ada8ad45abe2 --- /dev/null +++ b/nemo/collections/nlp/callbacks/__init__.py @@ -0,0 +1,25 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.callbacks.glue_benchmark_callback import * +from nemo.collections.nlp.callbacks.joint_intent_slot_callback import * +from nemo.collections.nlp.callbacks.lm_bert_callback import * +from nemo.collections.nlp.callbacks.lm_transformer_callback import * +from nemo.collections.nlp.callbacks.machine_translation_callback import * +from nemo.collections.nlp.callbacks.punctuation_capitalization_callback import * +from nemo.collections.nlp.callbacks.qa_squad_callback import * +from nemo.collections.nlp.callbacks.text_classification_callback import * +from nemo.collections.nlp.callbacks.token_classification_callback import * diff --git a/nemo/collections/nlp/utils/callbacks/glue.py b/nemo/collections/nlp/callbacks/glue_benchmark_callback.py similarity index 80% rename from nemo/collections/nlp/utils/callbacks/glue.py rename to nemo/collections/nlp/callbacks/glue_benchmark_callback.py index 3edb95fe6ea9..1368284d66fd 100644 --- a/nemo/collections/nlp/utils/callbacks/glue.py +++ b/nemo/collections/nlp/callbacks/glue_benchmark_callback.py @@ -19,8 +19,6 @@ Some transformer of this code were adapted from the HuggingFace library at https://github.com/huggingface/transformers """ -__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] - import os import random @@ -28,7 +26,10 @@ from scipy.stats import pearsonr, spearmanr from sklearn.metrics import f1_score, matthews_corrcoef -import nemo +from nemo import logging +from nemo.collections.nlp.utils.callback_utils import list2str, tensor2list + +__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] def eval_iter_callback(tensors, global_vars): @@ -46,16 +47,16 @@ def eval_iter_callback(tensors, global_vars): if 'logits' in kv: for v_tensor in v: for logit_tensor in v_tensor: - logits_lists.append(logit_tensor.detach().cpu().tolist()) + logits_lists.append(tensor2list(logit_tensor)) # for GLUE STS-B task (regression) elif 'preds' in kv: for v_tensor in v: for pred_tensor in v_tensor: - preds_lists.append(pred_tensor.detach().cpu().tolist()) + preds_lists.append(tensor2list(pred_tensor)) if 'labels' in kv: for v_tensor in v: for label_tensor in v_tensor: - labels_lists.append(label_tensor.detach().cpu().tolist()) + labels_lists.append(tensor2list(label_tensor)) if len(logits_lists) > 0: preds = list(np.argmax(np.asarray(logits_lists), 1)) @@ -66,21 +67,19 @@ def eval_iter_callback(tensors, global_vars): global_vars["all_labels"].extend(labels_lists) -def list2str(l): - return ' '.join([str(j) for j in l]) - - def eval_epochs_done_callback(global_vars, output_dir, task_name): labels = np.asarray(global_vars['all_labels']) preds = np.asarray(global_vars['all_preds']) + # print predictions and labels for a small random subset of data + sample_size = 20 i = 0 - if preds.shape[0] > 21: - i = random.randint(0, preds.shape[0] - 21) + if preds.shape[0] > sample_size + 1: + i = random.randint(0, preds.shape[0] - sample_size - 1) - nemo.logging.info("Task name: %s" % task_name.upper()) - nemo.logging.info("Sampled preds: [%s]" % 
list2str(preds[i : i + 20])) - nemo.logging.info("Sampled labels: [%s]" % list2str(labels[i : i + 20])) + logging.info("Task name: %s" % task_name.upper()) + logging.info("Sampled preds: [%s]" % list2str(preds[i : i + sample_size])) + logging.info("Sampled labels: [%s]" % list2str(labels[i : i + sample_size])) results = compute_metrics(task_name, preds, labels) @@ -89,7 +88,7 @@ def eval_epochs_done_callback(global_vars, output_dir, task_name): f.write('labels\t' + list2str(labels) + '\n') f.write('preds\t' + list2str(preds) + '\n') - nemo.logging.info(results) + logging.info(results) return results @@ -111,11 +110,7 @@ def mcc(preds, labels): def pearson_and_spearman(preds, labels): pearson_corr = pearsonr(preds, labels)[0] spearman_corr = spearmanr(preds, labels)[0] - return { - "pearson": pearson_corr, - "spearmanr": spearman_corr, - "corr": (pearson_corr + spearman_corr) / 2, - } + return {"pearson": pearson_corr, "spearmanr": spearman_corr, "corr": (pearson_corr + spearman_corr) / 2} def compute_metrics(task_name, preds, labels): diff --git a/nemo/collections/nlp/utils/callbacks/joint_intent_slot.py b/nemo/collections/nlp/callbacks/joint_intent_slot_callback.py similarity index 63% rename from nemo/collections/nlp/utils/callbacks/joint_intent_slot.py rename to nemo/collections/nlp/callbacks/joint_intent_slot_callback.py index 79db8a709f20..b3f49c5e33fb 100644 --- a/nemo/collections/nlp/utils/callbacks/joint_intent_slot.py +++ b/nemo/collections/nlp/callbacks/joint_intent_slot_callback.py @@ -1,23 +1,30 @@ -# Copyright (c) 2019 NVIDIA Corporation +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= -import os import random -import time -import matplotlib import numpy as np -from matplotlib import pyplot as plt -from sklearn.metrics import classification_report, confusion_matrix +from sklearn.metrics import classification_report -import nemo +from nemo import logging +from nemo.collections.nlp.utils.callback_utils import list2str, plot_confusion_matrix, tensor2list __all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] -def tensor2list(tensor): - return tensor.detach().cpu().tolist() - - def eval_iter_callback(tensors, global_vars, eval_data_layer): if "all_intent_preds" not in global_vars.keys(): global_vars["all_intent_preds"] = [] @@ -68,10 +75,6 @@ def eval_iter_callback(tensors, global_vars, eval_data_layer): global_vars["all_subtokens_mask"].extend(all_subtokens_mask) -def list2str(l): - return ' '.join([str(j) for j in l]) - - def eval_epochs_done_callback(global_vars, graph_fold): intent_labels = np.asarray(global_vars['all_intent_labels']) intent_preds = np.asarray(global_vars['all_intent_preds']) @@ -83,38 +86,31 @@ def eval_epochs_done_callback(global_vars, graph_fold): slot_labels = slot_labels[subtokens_mask] slot_preds = slot_preds[subtokens_mask] + # print predictions and labels for a small random subset of data + sample_size = 20 i = 0 - if intent_preds.shape[0] > 21: - i = random.randint(0, intent_preds.shape[0] - 21) - nemo.logging.info("Sampled i_preds: [%s]" % list2str(intent_preds[i : i + 20])) - nemo.logging.info("Sampled intents: [%s]" % list2str(intent_labels[i : i + 20])) - nemo.logging.info("Sampled s_preds: [%s]" % list2str(slot_preds[i : i + 20])) - nemo.logging.info("Sampled slots: [%s]" % list2str(slot_labels[i : i + 20])) - cm = confusion_matrix(intent_labels, intent_preds) - nemo.logging.info(f'Confusion matrix:\n{cm}') - fig = plt.figure() - ax = fig.add_subplot(111) - cax = ax.matshow(cm) - plt.title('Confusion matrix of the classifier') - fig.colorbar(cax) - plt.xlabel('Predicted') - plt.ylabel('True') - os.makedirs(graph_fold, exist_ok=True) - plt.savefig(os.path.join(graph_fold, time.strftime('%Y%m%d-%H%M%S'))) - - nemo.logging.info('Intent prediction results') + if intent_preds.shape[0] > sample_size + 1: + i = random.randint(0, intent_preds.shape[0] - sample_size - 1) + logging.info("Sampled i_preds: [%s]" % list2str(intent_preds[i : i + sample_size])) + logging.info("Sampled intents: [%s]" % list2str(intent_labels[i : i + sample_size])) + logging.info("Sampled s_preds: [%s]" % list2str(slot_preds[i : i + sample_size])) + logging.info("Sampled slots: [%s]" % list2str(slot_labels[i : i + sample_size])) + + plot_confusion_matrix(intent_labels, intent_preds, graph_fold) + + logging.info('Intent prediction results') correct_preds = sum(intent_labels == intent_preds) intent_accuracy = correct_preds / intent_labels.shape[0] - nemo.logging.info(f'Intent accuracy: {intent_accuracy}') - nemo.logging.info( + logging.info(f'Intent accuracy: {intent_accuracy}') + logging.info( f'Classification report:\n \ {classification_report(intent_labels, intent_preds)}' ) - nemo.logging.info('Slot prediction results') + logging.info('Slot prediction results') slot_accuracy = sum(slot_labels == slot_preds) / slot_labels.shape[0] - nemo.logging.info(f'Slot accuracy: {slot_accuracy}') - nemo.logging.info( + logging.info(f'Slot accuracy: {slot_accuracy}') + logging.info( f'Classification report:\n \ {classification_report(slot_labels[:-2], slot_preds[:-2])}' ) diff --git 
a/nemo/collections/nlp/utils/callbacks/bert_pretraining.py b/nemo/collections/nlp/callbacks/lm_bert_callback.py similarity index 52% rename from nemo/collections/nlp/utils/callbacks/bert_pretraining.py rename to nemo/collections/nlp/callbacks/lm_bert_callback.py index baeaabe2d701..e31f964a22da 100644 --- a/nemo/collections/nlp/utils/callbacks/bert_pretraining.py +++ b/nemo/collections/nlp/callbacks/lm_bert_callback.py @@ -1,9 +1,24 @@ -# Copyright (c) 2019 NVIDIA Corporation -__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= import numpy as np -import nemo +from nemo import logging + +__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] def eval_iter_callback(tensors, global_vars): @@ -24,14 +39,14 @@ def eval_iter_callback(tensors, global_vars): def eval_epochs_done_callback(global_vars): if 'dev_mlm_loss' in global_vars: mlm_loss = np.mean(global_vars["dev_mlm_loss"]) - nemo.logging.info("Dev MLM perplexity: {0}".format(np.round(np.exp(mlm_loss), 3))) + logging.info("Dev MLM perplexity: {0}".format(np.round(np.exp(mlm_loss), 3))) global_vars["dev_mlm_loss"] = [] else: mlm_loss = -123.0 if 'dev_nsp_loss' in global_vars: nsp_loss = np.mean(global_vars["dev_nsp_loss"]) - nemo.logging.info("Dev NSP perplexity: {0}".format(np.round(np.exp(nsp_loss), 3))) + logging.info("Dev NSP perplexity: {0}".format(np.round(np.exp(nsp_loss), 3))) global_vars["dev_nsp_loss"] = [] else: nsp_loss = -123.0 diff --git a/nemo/collections/nlp/callbacks/lm_transformer_callback.py b/nemo/collections/nlp/callbacks/lm_transformer_callback.py new file mode 100644 index 000000000000..344873c216d0 --- /dev/null +++ b/nemo/collections/nlp/callbacks/lm_transformer_callback.py @@ -0,0 +1,46 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +import numpy as np + +from nemo import logging + +__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] +GLOBAL_KEYS = ["eval_loss", "sys"] + + +def eval_iter_callback(tensors, global_vars): + for key in GLOBAL_KEYS: + if key not in global_vars.keys(): + global_vars[key] = [] + + for kv, v in tensors.items(): + if "loss" in kv: + for eval_loss in v: + global_vars["eval_loss"].append(eval_loss.item()) + + +def eval_epochs_done_callback(global_vars): + eval_loss = np.mean(global_vars["eval_loss"]) + eval_ppl = np.exp(eval_loss) + + logging.info("------------------------------------------------------") + logging.info("Eval loss: {0}".format(np.round(eval_loss, 3))) + logging.info("Eval ppl: {0}".format(np.round(eval_ppl, 3))) + logging.info("------------------------------------------------------") + for key in GLOBAL_KEYS: + global_vars[key] = [] + return dict({"Eval_loss": eval_loss, "Eval_ppl": eval_ppl}) diff --git a/nemo/collections/nlp/utils/callbacks/translation.py b/nemo/collections/nlp/callbacks/machine_translation_callback.py similarity index 64% rename from nemo/collections/nlp/utils/callbacks/translation.py rename to nemo/collections/nlp/callbacks/machine_translation_callback.py index 02f168de00c1..e0a885f3bf4c 100644 --- a/nemo/collections/nlp/utils/callbacks/translation.py +++ b/nemo/collections/nlp/callbacks/machine_translation_callback.py @@ -1,10 +1,26 @@ -# Copyright (c) 2019 NVIDIA Corporation -__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= import numpy as np -from ..metrics.sacrebleu import corpus_bleu +from nemo import logging from nemo.collections.asr.metrics import word_error_rate +from nemo.collections.nlp.metrics.sacrebleu import corpus_bleu + +__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] GLOBAL_KEYS = ["eval_loss", "ref", "sys", "sent_ids", "nonpad_tokens"] @@ -64,19 +80,19 @@ def eval_epochs_done_callback(global_vars, validation_dataset=None): for i in range(3): sent_id = np.random.randint(len(all_sys)) - print("Ground truth: {0}\n".format(all_ref[0][sent_id])) - print("Translation: {0}\n".format(all_sys[sent_id])) + logging.info("Ground truth: {0}\n".format(all_ref[0][sent_id])) + logging.info("Translation: {0}\n".format(all_sys[sent_id])) - print("------------------------------------------------------------") - print("Validation loss: {0}".format(np.round(eval_loss, 3))) - print("TokenBLEU: {0}".format(np.round(token_bleu, 2))) - print("SacreBLEU: {0}".format(np.round(sacre_bleu, 2))) - print("------------------------------------------------------------") + logging.info("------------------------------------------------------------") + logging.info("Validation loss: {0}".format(np.round(eval_loss, 3))) + logging.info("TokenBLEU: {0}".format(np.round(token_bleu, 2))) + logging.info("SacreBLEU: {0}".format(np.round(sacre_bleu, 2))) + logging.info("------------------------------------------------------------") for key in GLOBAL_KEYS: global_vars[key] = [] - metrics = dict({"eval_loss": eval_loss, "token_bleu": token_bleu, "sacre_bleu": sacre_bleu,}) + metrics = dict({"eval_loss": eval_loss, "token_bleu": token_bleu, "sacre_bleu": sacre_bleu}) return metrics @@ -94,11 +110,11 @@ def eval_epochs_done_callback_wer(global_vars): eval_wer = word_error_rate(ref, sys) for i in range(3): sent_id = np.random.randint(len(sys)) - print("Ground truth: {0}\n".format(ref[sent_id])) - print("Translation: {0}\n".format(sys[sent_id])) + logging.info("Ground truth: {0}\n".format(ref[sent_id])) + logging.info("Translation: {0}\n".format(sys[sent_id])) - print("Validation loss: {0}".format(np.round(eval_loss, 3))) - print("Validation WER: {0}".format(eval_wer)) + logging.info("Validation loss: {0}".format(np.round(eval_loss, 3))) + logging.info("Validation WER: {0}".format(eval_wer)) global_vars["eval_loss"] = [] global_vars["ref"] = [] global_vars["sys"] = [] diff --git a/nemo/collections/nlp/utils/callbacks/punctuation_capitalization.py b/nemo/collections/nlp/callbacks/punctuation_capitalization_callback.py similarity index 70% rename from nemo/collections/nlp/utils/callbacks/punctuation_capitalization.py rename to nemo/collections/nlp/callbacks/punctuation_capitalization_callback.py index a3f8d01add15..dc76015d7363 100644 --- a/nemo/collections/nlp/utils/callbacks/punctuation_capitalization.py +++ b/nemo/collections/nlp/callbacks/punctuation_capitalization_callback.py @@ -1,14 +1,28 @@ -# Copyright (c) 2019 NVIDIA Corporation -__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= import random import numpy as np from sklearn.metrics import classification_report -import nemo -from nemo.collections.nlp.data.datasets.utils import list2str, tensor2list -from nemo.collections.nlp.utils.nlp_utils import plot_confusion_matrix +from nemo import logging +from nemo.collections.nlp.utils.callback_utils import list2str, plot_confusion_matrix, tensor2list + +__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] def eval_iter_callback(tensors, global_vars): @@ -64,9 +78,7 @@ def eval_iter_callback(tensors, global_vars): global_vars["all_subtokens_mask"].extend(all_subtokens_mask) -def eval_epochs_done_callback( - global_vars, punct_label_ids, capit_label_ids, graph_fold=None, normalize_cm=True, -): +def eval_epochs_done_callback(global_vars, punct_label_ids, capit_label_ids, graph_fold=None, normalize_cm=True): ''' Args: graph_fold (str): path to output folder @@ -78,10 +90,7 @@ def eval_epochs_done_callback( capit_accuracy = _eval_epochs_done_callback('capit', global_vars, capit_label_ids, graph_fold, normalize_cm) - return { - "Punctuation_task_accuracy": punct_accuracy, - "Capitalization_task_accuracy": capit_accuracy, - } + return {"Punctuation_task_accuracy": punct_accuracy, "Capitalization_task_accuracy": capit_accuracy} def _eval_epochs_done_callback(task_name, global_vars, label_ids, graph_fold=None, normalize_cm=True): @@ -93,25 +102,23 @@ def _eval_epochs_done_callback(task_name, global_vars, label_ids, graph_fold=Non preds = preds[subtokens_mask] accuracy = sum(labels == preds) / labels.shape[0] - nemo.logging.info(f'Accuracy for task {task_name}: {accuracy}') + logging.info(f'Accuracy for task {task_name}: {accuracy}') # print predictions and labels for a small random subset of data sample_size = 20 i = 0 if preds.shape[0] > sample_size + 1: i = random.randint(0, preds.shape[0] - sample_size - 1) - nemo.logging.info("Sampled preds: [%s]" % list2str(preds[i : i + sample_size])) - nemo.logging.info("Sampled labels: [%s]" % list2str(labels[i : i + sample_size])) + logging.info("Sampled preds: [%s]" % list2str(preds[i : i + sample_size])) + logging.info("Sampled labels: [%s]" % list2str(labels[i : i + sample_size])) # remove labels from label_ids that don't appear in the dev set used_labels = set(labels) | set(preds) label_ids = {k: label_ids[k] for k, v in label_ids.items() if v in used_labels} - nemo.logging.info(classification_report(labels, preds, target_names=label_ids)) + logging.info(classification_report(labels, preds, target_names=label_ids)) # calculate and plot confusion_matrix if graph_fold: - plot_confusion_matrix( - label_ids, labels, preds, graph_fold, normalize=normalize_cm, prefix=task_name, - ) + plot_confusion_matrix(labels, preds, graph_fold, label_ids, normalize=normalize_cm, prefix=task_name) return accuracy diff --git a/nemo/collections/nlp/utils/callbacks/squad.py b/nemo/collections/nlp/callbacks/qa_squad_callback.py similarity index 67% rename from nemo/collections/nlp/utils/callbacks/squad.py rename to 
nemo/collections/nlp/callbacks/qa_squad_callback.py index 5f87132bc7e4..321999d902ba 100644 --- a/nemo/collections/nlp/utils/callbacks/squad.py +++ b/nemo/collections/nlp/callbacks/qa_squad_callback.py @@ -1,18 +1,22 @@ -""" -Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at +from nemo import logging - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -""" +__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] def eval_iter_callback(tensors, global_vars): @@ -63,7 +67,7 @@ def eval_epochs_done_callback( do_lower_case=do_lower_case, ) - print(f"Exact_match = {exact_match}, f1 = {f1}") + logging.info(f"Exact_match = {exact_match}, f1 = {f1}") global_vars["eval_unique_ids"] = [] global_vars["eval_start_logits"] = [] diff --git a/nemo/collections/nlp/callbacks/text_classification_callback.py b/nemo/collections/nlp/callbacks/text_classification_callback.py new file mode 100644 index 000000000000..14b89d8e57e7 --- /dev/null +++ b/nemo/collections/nlp/callbacks/text_classification_callback.py @@ -0,0 +1,68 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +import random + +import numpy as np +from sklearn.metrics import classification_report + +from nemo import logging +from nemo.collections.nlp.utils.callback_utils import list2str, plot_confusion_matrix, tensor2list + +__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] + + +def eval_iter_callback(tensors, global_vars, eval_data_layer): + if "all_preds" not in global_vars.keys(): + global_vars["all_preds"] = [] + if "all_labels" not in global_vars.keys(): + global_vars["all_labels"] = [] + + logits_lists = [] + labels_lists = [] + + for kv, v in tensors.items(): + if 'logits' in kv: + for v_tensor in v: + for logit_tensor in v_tensor: + logits_lists.append(tensor2list(logit_tensor)) + + if 'labels' in kv: + for v_tensor in v: + for label_tensor in v_tensor: + labels_lists.append(tensor2list(label_tensor)) + + preds = list(np.argmax(np.asarray(logits_lists), 1)) + global_vars["all_preds"].extend(preds) + global_vars["all_labels"].extend(labels_lists) + + +def eval_epochs_done_callback(global_vars, graph_fold): + labels = np.asarray(global_vars['all_labels']) + preds = np.asarray(global_vars['all_preds']) + accuracy = sum(labels == preds) / labels.shape[0] + logging.info(f'Accuracy: {accuracy}') + + # print predictions and labels for a small random subset of data + sample_size = 20 + i = 0 + if preds.shape[0] > sample_size + 1: + i = random.randint(0, preds.shape[0] - sample_size - 1) + logging.info("Sampled preds: [%s]" % list2str(preds[i : i + sample_size])) + logging.info("Sampled labels: [%s]" % list2str(labels[i : i + sample_size])) + plot_confusion_matrix(labels, preds, graph_fold) + logging.info(classification_report(labels, preds)) + return dict({"accuracy": accuracy}) diff --git a/nemo/collections/nlp/utils/callbacks/token_classification.py b/nemo/collections/nlp/callbacks/token_classification_callback.py similarity index 64% rename from nemo/collections/nlp/utils/callbacks/token_classification.py rename to nemo/collections/nlp/callbacks/token_classification_callback.py index 20d3036118f1..0f4d3c545622 100644 --- a/nemo/collections/nlp/utils/callbacks/token_classification.py +++ b/nemo/collections/nlp/callbacks/token_classification_callback.py @@ -1,14 +1,28 @@ -# Copyright (c) 2019 NVIDIA Corporation -__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= import random import numpy as np from sklearn.metrics import classification_report -import nemo -from nemo.collections.nlp.data.datasets.utils import list2str, tensor2list -from nemo.collections.nlp.utils.nlp_utils import plot_confusion_matrix +from nemo import logging +from nemo.collections.nlp.utils.callback_utils import list2str, plot_confusion_matrix, tensor2list + +__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] def eval_iter_callback(tensors, global_vars): @@ -52,24 +66,24 @@ def eval_epochs_done_callback(global_vars, label_ids, graph_fold=None, none_labe preds = preds[subtokens_mask] accuracy = sum(labels == preds) / labels.shape[0] - nemo.logging.info(f'Accuracy: {accuracy}') + logging.info(f'Accuracy: {accuracy}') # print predictions and labels for a small random subset of data sample_size = 20 i = 0 if preds.shape[0] > sample_size + 1: i = random.randint(0, preds.shape[0] - sample_size - 1) - nemo.logging.info("Sampled preds: [%s]" % list2str(preds[i : i + sample_size])) - nemo.logging.info("Sampled labels: [%s]" % list2str(labels[i : i + sample_size])) + logging.info("Sampled preds: [%s]" % list2str(preds[i : i + sample_size])) + logging.info("Sampled labels: [%s]" % list2str(labels[i : i + sample_size])) # remove labels from label_ids that don't appear in the dev set used_labels = set(labels) | set(preds) label_ids = {k: label_ids[k] for k, v in label_ids.items() if v in used_labels} - nemo.logging.info(classification_report(labels, preds, target_names=label_ids)) + logging.info(classification_report(labels, preds, target_names=label_ids)) # calculate and plot confusion_matrix if graph_fold: - plot_confusion_matrix(label_ids, labels, preds, graph_fold, normalize=normalize_cm) + plot_confusion_matrix(labels, preds, graph_fold, label_ids, normalize=normalize_cm) return dict({'Accuracy': accuracy}) diff --git a/nemo/collections/nlp/data/__init__.py b/nemo/collections/nlp/data/__init__.py index 6e6bf8956b48..87a10d8803c8 100644 --- a/nemo/collections/nlp/data/__init__.py +++ b/nemo/collections/nlp/data/__init__.py @@ -1,3 +1,18 @@ -from .data_layers import * -from .datasets import * -from .tokenizers import * +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +from nemo.collections.nlp.data.datasets import * +from nemo.collections.nlp.data.tokenizers import * diff --git a/nemo/collections/nlp/data/data_layers.py b/nemo/collections/nlp/data/data_layers.py deleted file mode 100644 index 36dac97ec98d..000000000000 --- a/nemo/collections/nlp/data/data_layers.py +++ /dev/null @@ -1,1128 +0,0 @@ -# Copyright (c) 2019 NVIDIA Corporation - -# If you want to add your own data layer, you should put its name in -# __all__ so that it can be imported with 'from text_data_layers import *' - - -__all__ = [ - 'GlueDataLayerClassification', - 'GlueDataLayerRegression', - 'BertJointIntentSlotDataLayer', - 'BertJointIntentSlotInferDataLayer', - 'BertPunctuationCapitalizationDataLayer', - 'BertPunctuationCapitalizationInferDataLayer', - 'BertPretrainingDataLayer', - 'BertPretrainingPreprocessedDataLayer', - 'BertSentenceClassificationDataLayer', - 'BertTokenClassificationDataLayer', - 'BertTokenClassificationInferDataLayer', - 'BertQuestionAnsweringDataLayer', - 'LanguageModelingDataLayer', - 'TextDataLayer', - 'TranslationDataLayer', -] - -import os -import random -import sys - -import h5py -import numpy as np -import torch -from torch.utils import data as pt_data - -import nemo -from .datasets import * -from nemo.backends.pytorch.nm import DataLayerNM -from nemo.core.neural_types import * - - -class TextDataLayer(DataLayerNM): - """ - Generic Text Data Layer NM which wraps PyTorch's dataset - - Args: - dataset_type: type of dataset used for this datalayer - dataset_params (dict): all the params for the dataset - batch_size: size of batch - """ - - def __init__(self, dataset_type, dataset_params, batch_size): - super().__init__() - if isinstance(dataset_type, str): - dataset_type = getattr(sys.modules[__name__], dataset_type) - self._dataset = dataset_type(**dataset_params) - self._batch_size = batch_size - - def __len__(self): - return len(self._dataset) - - @property - def dataset(self): - return self._dataset - - @property - def data_iterator(self): - return None - - -class BertSentenceClassificationDataLayer(TextDataLayer): - """ - Creates the data layer to use for the task of sentence classification - with pretrained model. - - All the data processing is done BertSentenceClassificationDataset. - - Args: - dataset (BertSentenceClassificationDataset): - the dataset that needs to be converted to DataLayerNM - """ - - @property - def output_ports(self): - """Returns definitions of module output ports. 
- - input_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - labels: - 0: AxisType(BatchTag) - - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "labels": NeuralType({0: AxisType(BatchTag),}), - } - - def __init__( - self, - input_file, - tokenizer, - max_seq_length, - num_samples=-1, - shuffle=False, - batch_size=64, - dataset_type=BertSentenceClassificationDataset, - ): - dataset_params = { - 'input_file': input_file, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length, - 'num_samples': num_samples, - 'shuffle': shuffle, - } - super().__init__(dataset_type, dataset_params, batch_size) - - -class BertJointIntentSlotDataLayer(TextDataLayer): - """ - Creates the data layer to use for the task of joint intent - and slot classification with pretrained model. - - All the data processing is done in BertJointIntentSlotDataset. - - input_mask: used to ignore some of the input tokens like paddings - - loss_mask: used to mask and ignore tokens in the loss function - - subtokens_mask: used to ignore the outputs of unwanted tokens in - the inference and evaluation like the start and end tokens - - Args: - dataset (BertJointIntentSlotDataset): - the dataset that needs to be converted to DataLayerNM - """ - - @property - def output_ports(self): - """Returns definitions of module output ports. - - input_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - loss_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - subtokens_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - intents: - 0: AxisType(BatchTag) - - slots: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "intents": NeuralType({0: AxisType(BatchTag),}), - "slots": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - def __init__( - self, - input_file, - slot_file, - pad_label, - tokenizer, - max_seq_length, - num_samples=-1, - shuffle=False, - batch_size=64, - ignore_extra_tokens=False, - ignore_start_end=False, - dataset_type=BertJointIntentSlotDataset, - ): - dataset_params = { - 'input_file': input_file, - 'slot_file': slot_file, - 'pad_label': pad_label, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length, - 'num_samples': num_samples, - 'shuffle': shuffle, - 'ignore_extra_tokens': ignore_extra_tokens, - 'ignore_start_end': ignore_start_end, - } - super().__init__(dataset_type, dataset_params, batch_size) - - -class BertJointIntentSlotInferDataLayer(TextDataLayer): - """ - Creates the data layer to use for the task of joint intent - and slot classification with pretrained model. This is for - - All the data processing is done in BertJointIntentSlotInferDataset. 
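Illustrative sketch (not part of this diff): every data layer in this removed file follows the same pattern, collecting its constructor arguments into a dataset_params dict and letting the generic TextDataLayer instantiate dataset_type. A standalone version of that pattern with simplified, assumed class names (not the NeMo API):

class GenericTextLayer:
    """Stand-in for TextDataLayer: builds the wrapped dataset from params."""

    def __init__(self, dataset_type, dataset_params, batch_size):
        self._dataset = dataset_type(**dataset_params)
        self._batch_size = batch_size


class ToyDataset:
    def __init__(self, input_file, max_seq_length):
        self.input_file = input_file
        self.max_seq_length = max_seq_length


layer = GenericTextLayer(ToyDataset, {'input_file': 'train.tsv', 'max_seq_length': 128}, batch_size=64)
print(layer._dataset.max_seq_length)  # 128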
- - input_mask: used to ignore some of the input tokens like paddings - - loss_mask: used to mask and ignore tokens in the loss function - - subtokens_mask: used to ignore the outputs of unwanted tokens in - the inference and evaluation like the start and end tokens - - Args: - dataset (BertJointIntentSlotInferDataset): - the dataset that needs to be converted to DataLayerNM - """ - - @property - def output_ports(self): - """Returns definitions of module output ports. - - input_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - loss_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - subtokens_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - def __init__(self, queries, tokenizer, max_seq_length, batch_size=1, dataset_type=BertJointIntentSlotInferDataset): - dataset_params = { - 'queries': queries, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length, - } - super().__init__(dataset_type, dataset_params, batch_size) - - -class LanguageModelingDataLayer(TextDataLayer): - """ - Data layer for standard language modeling task. - - Args: - dataset (str): path to text document with data - tokenizer (TokenizerSpec): tokenizer - max_seq_length (int): maximum allowed length of the text segments - batch_step (int): how many tokens to skip between two successive - segments of text when constructing batches - """ - - @property - def output_ports(self): - """Returns definitions of module output ports. - - input_ids: indices of tokens which constitute batches of text segments - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: bool tensor with 0s in place of tokens to be masked - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - labels: indices of tokens which should be predicted from each of the - corresponding tokens in input_ids; for left-to-right language - modeling equals to input_ids shifted by 1 to the right - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - def __init__( - self, dataset, tokenizer, max_seq_length, batch_size, batch_step=128, dataset_type=LanguageModelingDataset - ): - dataset_params = { - 'dataset': dataset, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length, - 'batch_step': batch_step, - } - super().__init__(dataset_type, dataset_params, batch_size) - - -class BertTokenClassificationDataLayer(TextDataLayer): - @property - def output_ports(self): - """Returns definitions of module output ports. 
- - input_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - loss_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - subtokens_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - labels: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - def __init__( - self, - text_file, - label_file, - tokenizer, - max_seq_length, - pad_label='O', - label_ids=None, - num_samples=-1, - shuffle=False, - batch_size=64, - ignore_extra_tokens=False, - ignore_start_end=False, - use_cache=False, - dataset_type=BertTokenClassificationDataset, - ): - dataset_params = { - 'text_file': text_file, - 'label_file': label_file, - 'max_seq_length': max_seq_length, - 'tokenizer': tokenizer, - 'num_samples': num_samples, - 'shuffle': shuffle, - 'pad_label': pad_label, - 'label_ids': label_ids, - 'ignore_extra_tokens': ignore_extra_tokens, - 'ignore_start_end': ignore_start_end, - 'use_cache': use_cache, - } - super().__init__(dataset_type, dataset_params, batch_size) - - -class BertTokenClassificationInferDataLayer(TextDataLayer): - @property - def output_ports(self): - """Returns definitions of module output ports. - - input_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - loss_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - subtokens_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - def __init__( - self, queries, tokenizer, max_seq_length, batch_size=1, dataset_type=BertTokenClassificationInferDataset - ): - dataset_params = { - 'queries': queries, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length, - } - super().__init__(dataset_type, dataset_params, batch_size) - - -class BertPunctuationCapitalizationDataLayer(TextDataLayer): - @property - def output_ports(self): - """Returns definitions of module output ports. 
- - input_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - loss_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - subtokens_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - punct_labels: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - capit_labels: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "punct_labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "capit_labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - def __init__( - self, - text_file, - label_file, - tokenizer, - max_seq_length, - pad_label='O', - punct_label_ids=None, - capit_label_ids=None, - num_samples=-1, - shuffle=False, - batch_size=64, - ignore_extra_tokens=False, - ignore_start_end=False, - use_cache=False, - dataset_type=BertPunctuationCapitalizationDataset, - ): - dataset_params = { - 'text_file': text_file, - 'label_file': label_file, - 'max_seq_length': max_seq_length, - 'tokenizer': tokenizer, - 'num_samples': num_samples, - 'shuffle': shuffle, - 'pad_label': pad_label, - 'punct_label_ids': punct_label_ids, - 'capit_label_ids': capit_label_ids, - 'ignore_extra_tokens': ignore_extra_tokens, - 'ignore_start_end': ignore_start_end, - 'use_cache': use_cache, - } - super().__init__(dataset_type, dataset_params, batch_size) - - -class BertPunctuationCapitalizationInferDataLayer(TextDataLayer): - @property - def output_ports(self): - """Returns definitions of module output ports. - - input_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - loss_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - subtokens_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - def __init__( - self, queries, tokenizer, max_seq_length, batch_size=1, dataset_type=BertTokenClassificationInferDataset, - ): - dataset_params = { - 'queries': queries, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length, - } - super().__init__(dataset_type, dataset_params, batch_size) - - -class BertQuestionAnsweringDataLayer(TextDataLayer): - """ - Creates the data layer to use for Question Answering classification task. - - Args: - data_dir (str): Directory that contains train.*.json and dev.*.json. - tokenizer (obj): Tokenizer object, e.g. NemoBertTokenizer. - version_2_with_negative (bool): True if training should allow - unanswerable questions. - doc_stride (int): When splitting up a long document into chunks, - how much stride to take between chunks. 
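Illustrative sketch (not part of this diff): to make doc_stride concrete, the sliding-window chunking it controls looks roughly like the snippet below; token ids and sizes are made up, and the real SquadDataset logic differs in detail.

def make_chunks(tokens, max_len, doc_stride):
    """Split a long token list into overlapping windows, advancing by doc_stride."""
    chunks, start = [], 0
    while start < len(tokens):
        chunks.append(tokens[start:start + max_len])
        if start + max_len >= len(tokens):
            break
        start += doc_stride
    return chunks

print(make_chunks(list(range(10)), max_len=4, doc_stride=2))
# [[0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7], [6, 7, 8, 9]]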
- max_query_length (iny): All training files which have a duration less - than min_duration are dropped. Can't be used if the `utt2dur` file - does not exist. Defaults to None. - max_seq_length (int): All training files which have a duration more - than max_duration are dropped. Can't be used if the `utt2dur` file - does not exist. Defaults to None. - mode (str): Use "train" or "dev" to define between - training and evaluation. - batch_size (int): Batch size. Defaults to 64. - dataset_type (class): Question Answering class. - Defaults to SquadDataset. - """ - - @property - def output_ports(self): - """Returns definitions of module output ports. - - input_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - start_positions: - 0: AxisType(BatchTag) - - end_positions: - 0: AxisType(BatchTag) - - unique_ids: - 0: AxisType(BatchTag) - - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "start_positions": NeuralType({0: AxisType(BatchTag)}), - "end_positions": NeuralType({0: AxisType(BatchTag)}), - "unique_ids": NeuralType({0: AxisType(BatchTag)}), - } - - def __init__( - self, - data_dir, - tokenizer, - version_2_with_negative, - doc_stride, - max_query_length, - max_seq_length, - mode="train", - batch_size=64, - dataset_type=SquadDataset, - ): - dataset_params = { - 'data_dir': data_dir, - 'mode': mode, - 'tokenizer': tokenizer, - 'version_2_with_negative': version_2_with_negative, - 'max_query_length': max_query_length, - 'max_seq_length': max_seq_length, - 'doc_stride': doc_stride, - } - - super().__init__(dataset_type, dataset_params, batch_size) - - -class BertPretrainingDataLayer(TextDataLayer): - """ - Data layer for masked language modeling task. - - Args: - tokenizer (TokenizerSpec): tokenizer - dataset (str): directory or a single file with dataset documents - max_seq_length (int): maximum allowed length of the text segments - mask_probability (float): probability of masking input sequence tokens - batch_size (int): batch size in segments - short_seeq_prob (float): Probability of creating sequences which are - shorter than the maximum length. - Defualts to 0.1. - """ - - @property - def output_ports(self): - """Returns definitions of module output ports. 
- - input_ids: indices of tokens which constitute batches of text segments - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: indices of token types (e.g., sentences A & B in BERT) - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: bool tensor with 0s in place of tokens to be masked - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - output_ids: indices of output tokens which should be predicted - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - output_mask: bool tensor with 0s in place of tokens to be excluded - from loss calculation - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - labels: indices of classes to be predicted from [CLS] token of text - segments (e.g, 0 or 1 in next sentence prediction task) - 0: AxisType(BatchTag) - - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "labels": NeuralType({0: AxisType(BatchTag)}), - } - - def __init__(self, tokenizer, dataset, max_seq_length, mask_probability, short_seq_prob=0.1, batch_size=64): - dataset_params = { - 'tokenizer': tokenizer, - 'dataset': dataset, - 'max_seq_length': max_seq_length, - 'mask_probability': mask_probability, - 'short_seq_prob': short_seq_prob, - } - super().__init__(BertPretrainingDataset, dataset_params, batch_size) - - -class BertPretrainingPreprocessedDataLayer(DataLayerNM): - """ - Data layer for masked language modeling task. - - Args: - tokenizer (TokenizerSpec): tokenizer - dataset (str): directory or a single file with dataset documents - max_seq_length (int): maximum allowed length of the text segments - mask_probability (float): probability of masking input sequence tokens - batch_size (int): batch size in segments - short_seeq_prob (float): Probability of creating sequences which are - shorter than the maximum length. - Defualts to 0.1. - """ - - @property - def output_ports(self): - """Returns definitions of module output ports. 
- - input_ids: indices of tokens which constitute batches of text segments - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: indices of token types (e.g., sentences A & B in BERT) - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: bool tensor with 0s in place of tokens to be masked - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - output_ids: indices of output tokens which should be predicted - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - output_mask: bool tensor with 0s in place of tokens to be excluded - from loss calculation - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - labels: indices of classes to be predicted from [CLS] token of text - segments (e.g, 0 or 1 in next sentence prediction task) - 0: AxisType(BatchTag) - - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "labels": NeuralType({0: AxisType(BatchTag)}), - } - - def __init__(self, dataset, max_pred_length, batch_size=64, training=True): - super().__init__() - self._batch_size = batch_size - - if os.path.isdir(dataset): - self.files = [ - os.path.join(dataset, f) for f in os.listdir(dataset) if os.path.isfile(os.path.join(dataset, f)) - ] - else: - self.files = [dataset] - self.files.sort() - self.num_files = len(self.files) - self.max_pred_length = max_pred_length - self.training = training - total_length = 0 - for f in self.files: - fp = h5py.File(f, 'r') - total_length += len(fp['input_ids']) - fp.close() - self.total_length = total_length - - def _collate_fn(self, x): - num_components = len(x[0]) - components = [[] for _ in range(num_components)] - batch_size = len(x) - for i in range(batch_size): - for j in range(num_components): - components[j].append(x[i][j]) - src_ids, src_segment_ids, src_mask, tgt_ids, tgt_mask, sent_ids = [np.stack(x, axis=0) for x in components] - src_ids = torch.Tensor(src_ids).long().to(self._device) - src_segment_ids = torch.Tensor(src_segment_ids).long().to(self._device) - src_mask = torch.Tensor(src_mask).long().to(self._device) - tgt_ids = torch.Tensor(tgt_ids).long().to(self._device) - tgt_mask = torch.Tensor(tgt_mask).long().to(self._device) - sent_ids = torch.Tensor(sent_ids).long().to(self._device) - return src_ids, src_segment_ids, src_mask, tgt_ids, tgt_mask, sent_ids - - def __len__(self): - return self.total_length - - @property - def dataset(self): - return None - - @property - def data_iterator(self): - while True: - if self.training: - random.shuffle(self.files) - for f_id in range(self.num_files): - data_file = self.files[f_id] - train_data = BertPretrainingPreprocessedDataset( - input_file=data_file, max_pred_length=self.max_pred_length - ) - train_sampler = pt_data.RandomSampler(train_data) - train_dataloader = pt_data.DataLoader( - dataset=train_data, - batch_size=self.batch_size, - collate_fn=self._collate_fn, - shuffle=train_sampler is None, - sampler=train_sampler, - ) - for x in train_dataloader: - yield x - - -class TranslationDataLayer(TextDataLayer): - """ - Data layer for neural machine translation from source (src) language to - target (tgt) language. 
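Illustrative sketch (not part of this diff): the _collate_fn above is essentially a transpose of a list of per-example tuples followed by np.stack on each component; a compact equivalent with toy arrays:

import numpy as np

batch = [(np.zeros(4), np.ones(4)), (np.zeros(4), np.ones(4))]   # 2 examples, 2 components each
components = list(zip(*batch))                                    # transpose: one list per component
stacked = [np.stack(c, axis=0) for c in components]
print([a.shape for a in stacked])                                 # [(2, 4), (2, 4)]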
- - Args: - tokenizer_src (TokenizerSpec): source language tokenizer - tokenizer_tgt (TokenizerSpec): target language tokenizer - dataset_src (str): path to source data - dataset_tgt (str): path to target data - tokens_in_batch (int): maximum allowed number of tokens in batches, - batches will be constructed to minimize the use of tokens - clean (bool): whether to use parallel data cleaning such as removing - pairs with big difference in sentences length, removing pairs with - the same tokens in src and tgt, etc; useful for training data layer - and should not be used in evaluation data layer - """ - - @property - def output_ports(self): - """Returns definitions of module output ports. - - src_ids: indices of tokens which correspond to source sentences - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - src_mask: bool tensor with 0s in place of source tokens to be masked - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - tgt_ids: indices of tokens which correspond to target sentences - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - tgt_mask: bool tensor with 0s in place of target tokens to be masked - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - labels: indices of tokens which should be predicted from each of the - corresponding target tokens in tgt_ids; for standard neural - machine translation equals to tgt_ids shifted by 1 to the right - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - sent_ids: indices of the sentences in a batch; important for - evaluation with external metrics, such as SacreBLEU - 0: AxisType(BatchTag) - - """ - return { - "src_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "src_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "tgt_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "tgt_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "sent_ids": NeuralType({0: AxisType(BatchTag)}), - } - - def __init__( - self, - tokenizer_src, - tokenizer_tgt, - dataset_src, - dataset_tgt, - batch_size=64, - tokens_in_batch=1024, - clean=False, - dataset_type=TranslationDataset, - ): - dataset_params = { - 'tokenizer_src': tokenizer_src, - 'tokenizer_tgt': tokenizer_tgt, - 'dataset_src': dataset_src, - 'dataset_tgt': dataset_tgt, - 'tokens_in_batch': tokens_in_batch, - 'clean': clean, - } - super().__init__(dataset_type, dataset_params, batch_size) - - if self._placement == nemo.core.DeviceType.AllGpu: - sampler = pt_data.distributed.DistributedSampler(self._dataset) - else: - sampler = None - - self._dataloader = pt_data.DataLoader( - dataset=self._dataset, batch_size=1, collate_fn=self._collate_fn, shuffle=sampler is None, sampler=sampler, - ) - - def _collate_fn(self, x): - src_ids, src_mask, tgt_ids, tgt_mask, labels, sent_ids = x[0] - src_ids = torch.Tensor(src_ids).long().to(self._device) - src_mask = torch.Tensor(src_mask).float().to(self._device) - tgt_ids = torch.Tensor(tgt_ids).long().to(self._device) - tgt_mask = torch.Tensor(tgt_mask).float().to(self._device) - labels = torch.Tensor(labels).long().to(self._device) - sent_ids = torch.Tensor(sent_ids).long().to(self._device) - return src_ids, src_mask, tgt_ids, tgt_mask, labels, sent_ids - - @property - def dataset(self): - return None - - @property - def data_iterator(self): - return self._dataloader - - -class GlueDataLayerClassification(TextDataLayer): - """ - Creates the data layer to use for the GLUE classification tasks, - more details here: 
https://gluebenchmark.com/tasks - - All the data processing is done in GLUEDataset. - - Args: - dataset_type (GLUEDataset): - the dataset that needs to be converted to DataLayerNM - """ - - @property - def output_ports(self): - """Returns definitions of module output ports. - - input_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - labels: - 0: AxisType(CategoricalTag) - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "labels": NeuralType({0: AxisType(CategoricalTag),}), - } - - def __init__( - self, - data_dir, - tokenizer, - max_seq_length, - processor, - evaluate=False, - token_params={}, - num_samples=-1, - shuffle=False, - batch_size=64, - dataset_type=GLUEDataset, - ): - dataset_params = { - 'data_dir': data_dir, - 'output_mode': 'classification', - 'processor': processor, - 'evaluate': evaluate, - 'token_params': token_params, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length, - } - - super().__init__(dataset_type, dataset_params, batch_size) - - -class GlueDataLayerRegression(TextDataLayer): - """ - Creates the data layer to use for the GLUE STS-B regression task, - more details here: https://gluebenchmark.com/tasks - - All the data processing is done in GLUEDataset. - - Args: - dataset_type (GLUEDataset): - the dataset that needs to be converted to DataLayerNM - """ - - @property - def output_ports(self): - """Returns definitions of module output ports. - - input_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_type_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - input_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - labels: - 0: AxisType(RegressionTag) - """ - return { - "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "labels": NeuralType({0: AxisType(RegressionTag),}), - } - - def __init__( - self, - data_dir, - tokenizer, - max_seq_length, - processor, - evaluate=False, - token_params={}, - num_samples=-1, - shuffle=False, - batch_size=64, - dataset_type=GLUEDataset, - ): - dataset_params = { - 'data_dir': data_dir, - 'output_mode': 'regression', - 'processor': processor, - 'evaluate': evaluate, - 'token_params': token_params, - 'tokenizer': tokenizer, - 'max_seq_length': max_seq_length, - } - - super().__init__(dataset_type, dataset_params, batch_size) diff --git a/nemo/collections/nlp/data/datasets/__init__.py b/nemo/collections/nlp/data/datasets/__init__.py index 3244c1266b19..f0eafa0d62f1 100644 --- a/nemo/collections/nlp/data/datasets/__init__.py +++ b/nemo/collections/nlp/data/datasets/__init__.py @@ -1,9 +1,37 @@ -from .bert_pretraining import BertPretrainingDataset, BertPretrainingPreprocessedDataset -from .glue import GLUEDataset -from .joint_intent_slot import BertJointIntentSlotDataset, BertJointIntentSlotInferDataset -from .language_modeling import LanguageModelingDataset -from .punctuation_capitalization import BertPunctuationCapitalizationDataset, BertPunctuationCapitalizationInferDataset -from .sentence_classification import BertSentenceClassificationDataset -from .squad import SquadDataset -from 
.token_classification import BertTokenClassificationDataset, BertTokenClassificationInferDataset -from .translation import TranslationDataset +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import GLUEDataset +from nemo.collections.nlp.data.datasets.joint_intent_slot_dataset import ( + BertJointIntentSlotDataset, + BertJointIntentSlotInferDataset, +) +from nemo.collections.nlp.data.datasets.lm_bert_dataset import ( + BertPretrainingDataset, + BertPretrainingPreprocessedDataset, +) +from nemo.collections.nlp.data.datasets.lm_transformer_dataset import LanguageModelingDataset +from nemo.collections.nlp.data.datasets.machine_translation_dataset import TranslationDataset +from nemo.collections.nlp.data.datasets.punctuation_capitalization_dataset import ( + BertPunctuationCapitalizationDataset, + BertPunctuationCapitalizationInferDataset, +) +from nemo.collections.nlp.data.datasets.qa_squad_dataset import SquadDataset +from nemo.collections.nlp.data.datasets.text_classification_dataset import BertTextClassificationDataset +from nemo.collections.nlp.data.datasets.token_classification_dataset import ( + BertTokenClassificationDataset, + BertTokenClassificationInferDataset, +) diff --git a/nemo/collections/nlp/data/datasets/datasets_utils.py b/nemo/collections/nlp/data/datasets/datasets_utils.py new file mode 100644 index 000000000000..f2851dd0cdce --- /dev/null +++ b/nemo/collections/nlp/data/datasets/datasets_utils.py @@ -0,0 +1,988 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
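Illustrative note (not part of this diff): after the refactor, dataset classes are imported via fully qualified module paths (or re-exported through the package __init__) instead of the old relative imports, for example:

# Old (removed) relative import style:
#   from .token_classification import BertTokenClassificationDataset
# New absolute import style, as added to this __init__.py:
from nemo.collections.nlp.data.datasets.token_classification_dataset import (
    BertTokenClassificationDataset,
    BertTokenClassificationInferDataset,
)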
+# ============================================================================= + +import csv +import glob +import json +import os +import random +import re +import shutil +import string +import subprocess +from collections import Counter + +import numpy as np +from tqdm import tqdm + +from nemo import logging +from nemo.collections.nlp.utils.callback_utils import list2str +from nemo.collections.nlp.utils.common_nlp_utils import ( + get_vocab, + ids2text, + if_exist, + write_vocab, + write_vocab_in_order, +) + +__all__ = [ + 'get_label_stats', + 'process_sst_2', + 'process_imdb', + 'process_thucnews', + 'process_nlu', + 'process_twitter_airline', + 'process_atis', + 'process_jarvis_datasets', + 'process_mturk', + 'process_intent_slot_mturk', + 'get_intents_mturk', + 'get_slot_labels', + 'merge', + 'get_intent_query_files_dialogflow', + 'get_intents_slots_dialogflow', + 'get_slots_dialogflow', + 'partition_data', + 'write_files', + 'process_dialogflow', + 'write_data', + 'create_dataset', + 'read_csv', + 'process_snips', + 'get_dataset', + 'partition', + 'map_entities', + 'get_entities', + 'get_data', + 'reverse_dict', + 'get_intent_labels', + 'download_wkt2', + 'normalize_answer', + 'get_tokens', +] + +DATABASE_EXISTS_TMP = '{} dataset has already been processed and stored at {}' +MODE_EXISTS_TMP = '{} mode of {} dataset has already been processed and stored at {}' + + +def get_label_stats(labels, outfile='stats.tsv'): + labels = Counter(labels) + total = sum(labels.values()) + out = open(outfile, 'w') + i = 0 + label_frequencies = labels.most_common() + for k, v in label_frequencies: + out.write(f'{k}\t{v / total}\n') + if i < 3: + logging.info(f'{i} item: {k}, {v} out of {total}, {v / total}.') + i += 1 + return total, label_frequencies + + +def process_sst_2(data_dir): + if not os.path.exists(data_dir): + link = 'https://gluebenchmark.com/tasks' + raise ValueError(f'Data not found at {data_dir}. ' f'Please download SST-2 from {link}.') + logging.info('Keep in mind that SST-2 is only available in lower case.') + return data_dir + + +def process_imdb(data_dir, uncased, modes=['train', 'test']): + if not os.path.exists(data_dir): + link = 'www.kaggle.com/iarunava/imdb-movie-reviews-dataset' + raise ValueError(f'Data not found at {data_dir}. ' f'Please download IMDB from {link}.') + + outfold = f'{data_dir}/nemo-processed' + + if uncased: + outfold = f'{outfold}_uncased' + + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('IMDB', outfold)) + return outfold + logging.info(f'Processing IMDB dataset and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + + for mode in modes: + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') + outfiles[mode].write('sentence\tlabel\n') + for sent in ['neg', 'pos']: + if sent == 'neg': + label = 0 + else: + label = 1 + files = glob.glob(f'{data_dir}/{mode}/{sent}/*.txt') + for file in files: + with open(file, 'r') as f: + review = f.read().strip() + if uncased: + review = review.lower() + review = review.replace("
", "") + outfiles[mode].write(f'{review}\t{label}\n') + for mode in modes: + outfiles[mode].close() + + return outfold + + +def process_thucnews(data_dir): + modes = ['train', 'test'] + train_size = 0.8 + if not os.path.exists(data_dir): + link = 'thuctc.thunlp.org/' + raise ValueError(f'Data not found at {data_dir}. ' f'Please download THUCNews from {link}.') + + outfold = f'{data_dir}/nemo-processed-thucnews' + + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('THUCNews', outfold)) + return outfold + logging.info(f'Processing THUCNews dataset and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + + for mode in modes: + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'a+', encoding='utf-8') + outfiles[mode].write('sentence\tlabel\n') + categories = ['体育', '娱乐', '家居', '彩票', '房产', '教育', '时尚', '时政', '星座', '游戏', '社会', '科技', '股票', '财经'] + for category in categories: + label = categories.index(category) + category_files = glob.glob(f'{data_dir}/{category}/*.txt') + test_num = int(len(category_files) * (1 - train_size)) + test_files = category_files[:test_num] + train_files = category_files[test_num:] + for mode in modes: + logging.info(f'Processing {mode} data of the category {category}') + if mode == 'test': + files = test_files + else: + files = train_files + for file in tqdm(files): + with open(file, 'r', encoding='utf-8') as f: + news = f.read().strip().replace('\r', '') + news = news.replace('\n', '').replace('\t', ' ') + outfiles[mode].write(f'{news}\t{label}\n') + for mode in modes: + outfiles[mode].close() + + return outfold + + +def process_nlu(filename, uncased, modes=['train', 'test'], dataset_name='nlu-ubuntu'): + """ Dataset has to be of: + - ubuntu + - chat + - web + """ + + if not os.path.exists(filename): + link = 'https://github.com/sebischair/NLU-Evaluation-Corpora' + raise ValueError(f'Data not found at {filename}. 
' f'Please download IMDB from {link}.') + + if dataset_name == 'nlu-ubuntu': + INTENT = {'makeupdate': 1, 'setupprinter': 2, 'shutdowncomputer': 3, 'softwarerecommendation': 4, 'none': 0} + elif dataset_name == 'nlu-chat': + INTENT = {'departuretime': 0, 'findconnection': 1} + elif dataset_name == 'nlu-web': + INTENT = { + 'changepassword': 1, + 'deleteaccount': 2, + 'downloadvideo': 3, + 'exportdata': 4, + 'filterspam': 5, + 'findalternative': 6, + 'syncaccounts': 7, + 'none': 0, + } + else: + raise ValueError(f'{dataset_name}: Invalid dataset name') + + infold = filename[: filename.rfind('/')] + outfold = f'{infold}/{dataset_name}-nemo-processed' + + if uncased: + outfold = f'{outfold}_uncased' + + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format(dataset_name.upper(), outfold)) + return outfold + logging.info(f'Processing data and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + + for mode in modes: + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') + outfiles[mode].write('sentence\tlabel\n') + + with open(filename, 'r') as f: + data = json.load(f) + + for obj in data['sentences']: + sentence = obj['text'].strip() + if uncased: + sentence = sentence.lower() + intent = obj['intent'].lower().replace(' ', '') + label = INTENT[intent] + txt = f'{sentence}\t{label}\n' + if obj['training']: + outfiles['train'].write(txt) + else: + outfiles['test'].write(txt) + for mode in modes: + outfiles[mode].close() + return outfold + + +def process_twitter_airline(filename, uncased, modes=['train', 'test']): + """ Dataset from Kaggle: + https://www.kaggle.com/crowdflower/twitter-airline-sentiment + """ + pass + + +def process_atis(infold, uncased, modes=['train', 'test'], dev_split=0): + """ MSFT's dataset, processed by Kaggle + https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk + """ + outfold = f'{infold}/nemo-processed' + vocab = get_vocab(f'{infold}/atis.dict.vocab.csv') + + if uncased: + outfold = f'{outfold}-uncased' + + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold)) + return outfold + logging.info(f'Processing ATIS dataset and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + + for mode in modes: + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') + outfiles[mode].write('sentence\tlabel\n') + outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w') + + queries = open(f'{infold}/atis.{mode}.query.csv', 'r').readlines() + intents = open(f'{infold}/atis.{mode}.intent.csv', 'r').readlines() + slots = open(f'{infold}/atis.{mode}.slots.csv', 'r').readlines() + + for i, query in enumerate(queries): + sentence = ids2text(query.strip().split()[1:-1], vocab) + outfiles[mode].write(f'{sentence}\t{intents[i].strip()}\n') + slot = ' '.join(slots[i].strip().split()[1:-1]) + outfiles[mode + '_slots'].write(slot + '\n') + + shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv') + shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv') + for mode in modes: + outfiles[mode].close() + + return outfold + + +def process_jarvis_datasets(infold, uncased, dataset_name, modes=['train', 'test', 'eval'], ignore_prev_intent=False): + """ process and convert Jarvis datasets into NeMo's BIO format + """ + outfold = f'{infold}/{dataset_name}-nemo-processed' + infold = f'{infold}/' + + if uncased: + outfold = f'{outfold}-uncased' + + if 
if_exist(outfold, ['dict.intents.csv', 'dict.slots.csv']): + logging.info(DATABASE_EXISTS_TMP.format(dataset_name, outfold)) + return outfold + + logging.info(f'Processing {dataset_name} dataset and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + intents_list = {} + slots_list = {} + slots_list_all = {} + + outfiles['dict_intents'] = open(f'{outfold}/dict.intents.csv', 'w') + outfiles['dict_slots'] = open(f'{outfold}/dict.slots.csv', 'w') + + outfiles['dict_slots'].write('O\n') + slots_list["O"] = 0 + slots_list_all["O"] = 0 + + for mode in modes: + if if_exist(outfold, [f'{mode}.tsv']): + logging.info(MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode)) + continue + + if not if_exist(infold, [f'{mode}.tsv']): + logging.info(f'{mode} mode of {dataset_name}' f' is skipped as it was not found.') + continue + + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') + outfiles[mode].write('sentence\tlabel\n') + outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w') + + queries = open(f'{infold}/{mode}.tsv', 'r').readlines() + + for i, query in enumerate(queries): + line_splits = query.strip().split("\t") + if len(line_splits) == 3: + intent_str, slot_tags_str, sentence = line_splits + else: + intent_str, sentence = line_splits + slot_tags_str = "" + + if intent_str not in intents_list: + intents_list[intent_str] = len(intents_list) + outfiles['dict_intents'].write(f'{intent_str}\n') + + if ignore_prev_intent: + start_token = 2 + else: + start_token = 1 + sentence_cld = " ".join(sentence.strip().split()[start_token:-1]) + outfiles[mode].write(f'{sentence_cld}\t' f'{str(intents_list[intent_str])}\n') + + slot_tags_list = [] + if slot_tags_str.strip(): + slot_tags = slot_tags_str.strip().split(",") + for st in slot_tags: + if not st.strip(): + continue + [start_i, end_i, slot_name] = st.strip().split(":") + slot_tags_list.append([int(start_i), int(end_i), slot_name]) + if slot_name not in slots_list: + slots_list[slot_name] = len(slots_list) + slots_list_all[f'B-{slot_name}'] = len(slots_list_all) + slots_list_all[f'I-{slot_name}'] = len(slots_list_all) + outfiles['dict_slots'].write(f'B-{slot_name}\n') + outfiles['dict_slots'].write(f'I-{slot_name}\n') + + slot_tags_list.sort(key=lambda x: x[0]) + slots = [] + processed_index = 0 + for tag_start, tag_end, tag_str in slot_tags_list: + if tag_start > processed_index: + words_list = sentence[processed_index:tag_start].strip().split() + slots.extend([str(slots_list_all['O'])] * len(words_list)) + words_list = sentence[tag_start:tag_end].strip().split() + slots.append(str(slots_list_all[f'B-{tag_str}'])) + slots.extend([str(slots_list_all[f'I-{tag_str}'])] * (len(words_list) - 1)) + processed_index = tag_end + + if processed_index < len(sentence): + words_list = sentence[processed_index:].strip().split() + slots.extend([str(slots_list_all['O'])] * len(words_list)) + + slots = slots[1:-1] + slot = ' '.join(slots) + outfiles[mode + '_slots'].write(slot + '\n') + + outfiles[mode + '_slots'].close() + outfiles[mode].close() + + outfiles['dict_slots'].close() + outfiles['dict_intents'].close() + + return outfold + + +def process_mturk(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): + if not os.path.exists(data_dir): + link = 'www.mturk.com' + raise ValueError( + f'Data not found at {data_dir}. ' f'Export your mturk data from' f'{link} and unzip at {data_dir}.' 
+ ) + + outfold = f'{data_dir}/nemo-processed' + + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('mturk', outfold)) + return outfold + + logging.info(f'Processing dataset from mturk and storing at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + classification_data_file = f'{data_dir}/classification.csv' + annotation_data_file = f'{data_dir}/annotation.manifest' + + if not os.path.exists(classification_data_file): + raise FileNotFoundError(f'File not found ' f'at {classification_data_file}') + + if not os.path.exists(annotation_data_file): + raise FileNotFoundError(f'File not found at {annotation_data_file}') + + utterances = [] + utterances = read_csv(classification_data_file) + + # This function assumes that the intent classification data has been + # reviewed and cleaned and only one label per utterance is present. + agreed_all, intent_names = get_intents_mturk(utterances, outfold) + + with open(annotation_data_file, 'r') as f: + slot_annotations = f.readlines() + + # This function assumes that the preprocess step would have made + # the task_name of all the annotations generic + task_name = 'retail-combined' + + # It is assumed that every utterances will have corresponding + # slot annotation information + if len(slot_annotations) < len(agreed_all): + raise ValueError(f'Every utterance must have corresponding' f'slot annotation information') + + slot_labels, intent_queries, slot_tags = process_intent_slot_mturk( + slot_annotations, agreed_all, intent_names, task_name + ) + + assert len(slot_tags) == len(intent_queries) + + dev_split = 0.1 + + train_queries, train_slots, test_queries, test_slots = partition_data(intent_queries, slot_tags, split=dev_split) + + write_files(train_queries, f'{outfold}/train.tsv') + write_files(train_slots, f'{outfold}/train_slots.tsv') + + write_files(test_queries, f'{outfold}/test.tsv') + write_files(test_slots, f'{outfold}/test_slots.tsv') + + write_files(slot_labels, f'{outfold}/dict.slots.csv') + write_files(intent_names, f'{outfold}/dict.intents.csv') + + return outfold + + +def process_intent_slot_mturk(slot_annotations, agreed_all, intent_names, task_name): + slot_tags = [] + inorder_utterances = [] + all_labels = get_slot_labels(slot_annotations, task_name) + logging.info(f'agreed_all - {len(agreed_all)}') + logging.info(f'Slot annotations - {len(slot_annotations)}') + + for annotation in slot_annotations[0:]: + an = json.loads(annotation) + utterance = an['source'] + if len(utterance) > 2 and utterance.startswith('"') and utterance.endswith('"'): + utterance = utterance[1:-1] + + if utterance in agreed_all: + entities = {} + annotated_entities = an[task_name]['annotations']['entities'] + for i, each_anno in enumerate(annotated_entities): + entities[int(each_anno['startOffset'])] = i + + lastptr = 0 + slotlist = [] + # sorting annotations by the start offset + for i in sorted(entities.keys()): + annotated_entities = an[task_name]['annotations']['entities'] + tags = annotated_entities[entities.get(i)] + untagged_words = utterance[lastptr : tags['startOffset']] + for _ in untagged_words.split(): + slotlist.append(all_labels.get('O')) + anno_words = utterance[tags['startOffset'] : tags['endOffset']] + # tagging with the IOB format. 
+ for j, _ in enumerate(anno_words.split()): + if j == 0: + b_slot = 'B-' + tags['label'] + slotlist.append(all_labels.get(b_slot)) + else: + i_slot = 'I-' + tags['label'] + slotlist.append(all_labels.get(i_slot)) + lastptr = tags['endOffset'] + + untagged_words = utterance[lastptr : len(utterance)] + for _ in untagged_words.split(): + slotlist.append(all_labels.get('O')) + + slotstr = ' '.join(slotlist) + slotstr = f'{slotstr.strip()}\n' + + slot_tags.append(slotstr) + intent_num = intent_names.get(agreed_all.get(utterance)) + query_text = f'{utterance.strip()}\t{intent_num}\n' + inorder_utterances.append(query_text) + # else: + # logging.warning(utterance) + + logging.info(f'inorder utterances - {len(inorder_utterances)}') + + return all_labels, inorder_utterances, slot_tags + + +def get_intents_mturk(utterances, outfold): + intent_names = {} + intent_count = 0 + + agreed_all = {} + + logging.info('Printing all intent_labels') + intent_dict = f'{outfold}/dict.intents.csv' + if os.path.exists(intent_dict): + with open(intent_dict, 'r') as f: + for intent_name in f.readlines(): + intent_names[intent_name.strip()] = intent_count + intent_count += 1 + logging.info(intent_names) + + for i, utterance in enumerate(utterances[1:]): + + if utterance[1] not in agreed_all: + agreed_all[utterance[0]] = utterance[1] + + if utterance[1] not in intent_names: + intent_names[utterance[1]] = intent_count + intent_count += 1 + + logging.info(f'Total number of utterance samples: {len(agreed_all)}') + + return agreed_all, intent_names + + +def get_slot_labels(slot_annotations, task_name): + slot_labels = json.loads(slot_annotations[0]) + + all_labels = {} + count = 0 + # Generating labels with the IOB format. + for label in slot_labels[task_name]['annotations']['labels']: + b_slot = 'B-' + label['label'] + i_slot = 'I-' + label['label'] + all_labels[b_slot] = str(count) + count += 1 + all_labels[i_slot] = str(count) + count += 1 + all_labels['O'] = str(count) + + return all_labels + + +def merge(data_dir, subdirs, dataset_name, modes=['train', 'test']): + outfold = f'{data_dir}/{dataset_name}' + if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold)) + slots = get_vocab(f'{outfold}/dict.slots.csv') + none_slot = 0 + for key in slots: + if slots[key] == 'O': + none_slot = key + break + return outfold, int(none_slot) + + os.makedirs(outfold, exist_ok=True) + + data_files, slot_files = {}, {} + for mode in modes: + data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w') + data_files[mode].write('sentence\tlabel\n') + slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w') + + intents, slots = {}, {} + intent_shift, slot_shift = 0, 0 + none_intent, none_slot = -1, -1 + + for subdir in subdirs: + curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv') + curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv') + + for key in curr_intents: + if intent_shift > 0 and curr_intents[key] == 'O': + continue + if curr_intents[key] == 'O' and intent_shift == 0: + none_intent = int(key) + intents[int(key) + intent_shift] = curr_intents[key] + + for key in curr_slots: + if slot_shift > 0 and curr_slots[key] == 'O': + continue + if slot_shift == 0 and curr_slots[key] == 'O': + none_slot = int(key) + slots[int(key) + slot_shift] = curr_slots[key] + + for mode in modes: + with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f: + for line in f.readlines()[1:]: + text, label = line.strip().split('\t') + label = int(label) + if curr_intents[label] 
== 'O': + label = none_intent + else: + label = label + intent_shift + data_files[mode].write(f'{text}\t{label}\n') + + with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f: + for line in f.readlines(): + labels = [int(label) for label in line.strip().split()] + shifted_labels = [] + for label in labels: + if curr_slots[label] == 'O': + shifted_labels.append(none_slot) + else: + shifted_labels.append(label + slot_shift) + slot_files[mode].write(list2str(shifted_labels) + '\n') + + intent_shift += len(curr_intents) + slot_shift += len(curr_slots) + + write_vocab_in_order(intents, f'{outfold}/dict.intents.csv') + write_vocab_in_order(slots, f'{outfold}/dict.slots.csv') + return outfold, none_slot + + +def get_intent_query_files_dialogflow(path): + fileslist = [] + for root, _, files in os.walk(path): + for file in files: + if '_usersays_en.json' in file: + fileslist.append(os.path.join(root, file)) + return fileslist + + +def get_intents_slots_dialogflow(files, slot_labels): + intent_names = [] + intent_queries = [] + slot_tags = [] + + for index, file in enumerate(files): + intent_names.append(os.path.basename(file).split('_usersays')[0]) + + with open(file) as json_file: + intent_data = json.load(json_file) + for query in intent_data: + query_text = "" + slots = "" + for segment in query['data']: + query_text = ''.join([query_text, segment['text']]) + if 'alias' in segment: + for _ in segment['text'].split(): + slots = ' '.join([slots, slot_labels.get(segment['alias'])]) + else: + for _ in segment['text'].split(): + slots = ' '.join([slots, slot_labels.get('O')]) + query_text = f'{query_text.strip()}\t{index}\n' + intent_queries.append(query_text) + slots = f'{slots.strip()}\n' + slot_tags.append(slots) + return intent_queries, intent_names, slot_tags + + +def get_slots_dialogflow(files): + slot_labels = {} + count = 0 + for file in files: + intent_head_file = ''.join([file.split('_usersays')[0], '.json']) + with open(intent_head_file) as json_file: + intent_meta_data = json.load(json_file) + for params in intent_meta_data['responses'][0]['parameters']: + if params['name'] not in slot_labels: + slot_labels[params['name']] = str(count) + count += 1 + slot_labels['O'] = str(count) + return slot_labels + + +def partition_data(intent_queries, slot_tags, split=0.1): + n = len(intent_queries) + n_dev = int(n * split) + dev_idx = set(random.sample(range(n), n_dev)) + dev_intents, dev_slots, train_intents, train_slots = [], [], [], [] + + dev_intents.append('sentence\tlabel\n') + train_intents.append('sentence\tlabel\n') + + for i, item in enumerate(intent_queries): + if i in dev_idx: + dev_intents.append(item) + dev_slots.append(slot_tags[i]) + else: + train_intents.append(item) + train_slots.append(slot_tags[i]) + return train_intents, train_slots, dev_intents, dev_slots + + +def write_files(data, outfile): + with open(outfile, 'w') as f: + for item in data: + item = f'{item.strip()}\n' + f.write(item) + + +def process_dialogflow(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): + if not os.path.exists(data_dir): + link = 'www.dialogflow.com' + raise ValueError( + f'Data not found at {data_dir}. ' f'Export your dialogflow data from' f'{link} and unzip at {data_dir}.' + ) + + outfold = f'{data_dir}/dialogflow/nemo-processed' + + '''TO DO - check for nemo-processed directory + already exists. If exists, skip the entire creation steps below. 
''' + + os.makedirs(outfold, exist_ok=True) + + files = get_intent_query_files_dialogflow(data_dir) + + slot_labels = get_slots_dialogflow(files) + + intent_queries, intent_names, slot_tags = get_intents_slots_dialogflow(files, slot_labels) + + train_queries, train_slots, test_queries, test_slots = partition_data(intent_queries, slot_tags, split=dev_split) + + write_files(train_queries, f'{outfold}/train.tsv') + write_files(train_slots, f'{outfold}/train_slots.tsv') + + write_files(test_queries, f'{outfold}/test.tsv') + write_files(test_slots, f'{outfold}/test_slots.tsv') + + write_files(slot_labels, f'{outfold}/dict.slots.csv') + write_files(intent_names, f'{outfold}/dict.intents.csv') + + return outfold + + +def write_data(data, slot_dict, intent_dict, outfold, mode, uncased): + intent_file = open(f'{outfold}/{mode}.tsv', 'w') + intent_file.write('sentence\tlabel\n') + slot_file = open(f'{outfold}/{mode}_slots.tsv', 'w') + for tokens, slots, intent in data: + text = ' '.join(tokens) + if uncased: + text = text.lower() + intent_file.write(f'{text}\t{intent_dict[intent]}\n') + slots = [str(slot_dict[slot]) for slot in slots] + slot_file.write(' '.join(slots) + '\n') + intent_file.close() + slot_file.close() + + +def create_dataset(train, dev, slots, intents, uncased, outfold): + os.makedirs(outfold, exist_ok=True) + if 'O' in slots: + slots.remove('O') + slots = sorted(list(slots)) + ['O'] + intents = sorted(list(intents)) + slots = write_vocab(slots, f'{outfold}/dict.slots.csv') + intents = write_vocab(intents, f'{outfold}/dict.intents.csv') + write_data(train, slots, intents, outfold, 'train', uncased) + write_data(dev, slots, intents, outfold, 'test', uncased) + + +def read_csv(file_path): + rows = [] + with open(file_path, 'r') as csvfile: + read_csv = csv.reader(csvfile, delimiter=',') + for row in read_csv: + rows.append(row) + return rows + + +def process_snips(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): + if not os.path.exists(data_dir): + link = 'www.github.com/snipsco/spoken-language' + '-understanding-research-datasets' + raise ValueError(f'Data not found at {data_dir}. 
' f'Resquest to download the SNIPS dataset from {link}.') + + outfold = f'{data_dir}/nemo-processed' + + if uncased: + outfold = f'{outfold}-uncased' + + exist = True + for dataset in ['light', 'speak', 'all']: + if if_exist(f'{outfold}/{dataset}', [f'{mode}.tsv' for mode in modes]): + logging.info(DATABASE_EXISTS_TMP.format('SNIPS-' + dataset.upper(), outfold)) + else: + exist = False + if exist: + return outfold + + logging.info(f'Processing SNIPS dataset and store at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + speak_dir = 'smart-speaker-en-close-field' + light_dir = 'smart-lights-en-close-field' + + light_files = [f'{data_dir}/{light_dir}/dataset.json'] + speak_files = [f'{data_dir}/{speak_dir}/training_dataset.json'] + speak_files.append(f'{data_dir}/{speak_dir}/test_dataset.json') + + light_train, light_dev, light_slots, light_intents = get_dataset(light_files, dev_split) + speak_train, speak_dev, speak_slots, speak_intents = get_dataset(speak_files) + + create_dataset(light_train, light_dev, light_slots, light_intents, uncased, f'{outfold}/light') + create_dataset(speak_train, speak_dev, speak_slots, speak_intents, uncased, f'{outfold}/speak') + create_dataset( + light_train + speak_train, + light_dev + speak_dev, + light_slots | speak_slots, + light_intents | speak_intents, + uncased, + f'{outfold}/all', + ) + + return outfold + + +def get_dataset(files, dev_split=0.1): + entity2value, value2entity = get_entities(files) + data, slots, intents = get_data(files, entity2value, value2entity) + if len(data) == 1: + train, dev = partition(data[0], split=dev_split) + else: + train, dev = data[0], data[1] + return train, dev, slots, intents + + +def partition(data, split=0.1): + n = len(data) + n_dev = int(n * split) + dev_idx = set(random.sample(range(n), n_dev)) + dev, train = [], [] + + for i, item in enumerate(data): + if i in dev_idx: + dev.append(item) + else: + train.append(item) + return train, dev + + +def map_entities(entity2value, entities): + for key in entities: + if 'data' in entities[key]: + if key not in entity2value: + entity2value[key] = set([]) + + values = [] + for value in entities[key]['data']: + values.append(value['value']) + values.extend(value['synonyms']) + entity2value[key] = entity2value[key] | set(values) + + return entity2value + + +def get_entities(files): + entity2value = {} + for file in files: + with open(file, 'r') as json_file: + data = json.load(json_file) + entity2value = map_entities(entity2value, data['entities']) + + value2entity = reverse_dict(entity2value) + return entity2value, value2entity + + +def get_data(files, entity2value, value2entity): + all_data, all_slots, all_intents = [], set(['O']), set() + for file in files: + file_data = [] + with open(file, 'r') as json_file: + data = json.load(json_file) + for intent in data['intents']: + all_intents.add(intent) + utterances = data['intents'][intent]['utterances'] + for utterance in utterances: + tokens, slots = [], [] + for frag in utterance['data']: + frag_tokens = frag['text'].strip().split() + tokens.extend(frag_tokens) + if 'slot_name' not in frag: + slot = 'O' + else: + slot = frag['slot_name'] + all_slots.add(slot) + slots.extend([slot] * len(frag_tokens)) + file_data.append((tokens, slots, intent)) + all_data.append(file_data) + return all_data, all_slots, all_intents + + +def reverse_dict(entity2value): + value2entity = {} + for entity in entity2value: + for value in entity2value[entity]: + value2entity[value] = entity + return value2entity + + +def 
get_intent_labels(intent_file): + labels = {} + label = 0 + with open(intent_file, 'r') as f: + for line in f: + intent = line.strip() + labels[intent] = label + label += 1 + return labels + + +def download_wkt2(data_dir): + os.makedirs('data/lm', exist_ok=True) + logging.warning(f'Data not found at {data_dir}. ' f'Downloading wikitext-2 to data/lm') + data_dir = 'data/lm/wikitext-2' + subprocess.call('scripts/get_wkt2.sh') + return data_dir + + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + + def remove_articles(text): + return re.sub(r'\b(a|an|the)\b', ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def get_tokens(s): + if not s: + return [] + return normalize_answer(s).split() + + +def get_stats(lengths): + lengths = np.asarray(lengths) + logging.info( + f'Min: {np.min(lengths)} | \ + Max: {np.max(lengths)} | \ + Mean: {np.mean(lengths)} | \ + Median: {np.median(lengths)}' + ) + logging.info(f'75 percentile: {np.percentile(lengths, 75)}') + logging.info(f'99 percentile: {np.percentile(lengths, 99)}') diff --git a/nemo/collections/nlp/data/datasets/glue.py b/nemo/collections/nlp/data/datasets/glue.py deleted file mode 100644 index 8893c5747c45..000000000000 --- a/nemo/collections/nlp/data/datasets/glue.py +++ /dev/null @@ -1,229 +0,0 @@ -""" -Copyright 2018 The Google AI Language Team Authors and -The HuggingFace Inc. team. -Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
- -Utility functions for GLUE tasks -Some transformer of this code were adapted from the HuggingFace library at -https://github.com/huggingface/transformers -""" - -import numpy as np -from torch.utils.data import Dataset - -import nemo - - -class GLUEDataset(Dataset): - def __init__( - self, data_dir, tokenizer, max_seq_length, processor, output_mode, evaluate, token_params, - ): - self.tokenizer = tokenizer - self.label_list = processor.get_labels() - self.examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) - self.features = convert_examples_to_features( - self.examples, self.label_list, max_seq_length, tokenizer, output_mode, **token_params - ) - - def __len__(self): - return len(self.features) - - def __getitem__(self, idx): - feature = self.features[idx] - return ( - np.array(feature.input_ids), - np.array(feature.segment_ids), - np.array(feature.input_mask, dtype=np.long), - np.array(feature.label_id), - ) - - -def convert_examples_to_features( - examples, - label_list, - max_seq_length, - tokenizer, - output_mode, - bos_token=None, - eos_token='[SEP]', - pad_token='[PAD]', - cls_token='[CLS]', - sep_token_extra=None, - cls_token_at_end=False, - cls_token_segment_id=0, - pad_token_segment_id=0, - pad_on_left=False, - mask_padding_with_zero=True, - sequence_a_segment_id=0, - sequence_b_segment_id=1, -): - """ Loads a data file into a list of `InputBatch`s - `cls_token_at_end` define the location of the CLS token: - - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] - `cls_token_segment_id` define the segment id associated to the CLS - token (0 for BERT, 2 for XLNet) - The convention in BERT is: - (a) For sequence pairs: - tokens: [CLS] is this jack ##ville ? [SEP] no it is not . [SEP] - type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 - (b) For single sequences: - tokens: [CLS] the dog is hairy . [SEP] - type_ids: 0 0 0 0 0 0 0 - Where "type_ids" are used to indicate whether this is the first - sequence or the second sequence. The embedding vectors for `type=0` - and `type=1` were learned during pre-training and are added to the - wordpiece embedding vector (and position vector). This is - not *strictly* necessarysince the [SEP] token unambiguously separates - the sequences, but it makes it easier for the model to learn - the concept of sequences. - For classification tasks, the first vector (corresponding to [CLS]) - is used as as the "sentence vector". Note that this only makes sense - because the entire model is fine-tuned. - For NMT: - (a) For sequence pairs: - tokens: is this jack ##ville ? no it is not . - type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 - (b) For single sequences: - tokens: the dog is hairy . 
- type_ids: 0 0 0 0 0 0 0 - """ - label_map = {label: i for i, label in enumerate(label_list)} - - features = [] - for ex_index, example in enumerate(examples): - if ex_index % 10000 == 0: - nemo.logging.info("Writing example %d of %d" % (ex_index, len(examples))) - - tokens_a = tokenizer.text_to_tokens(example.text_a) - - tokens_b = None - if example.text_b: - tokens_b = tokenizer.text_to_tokens(example.text_b) - - special_tokens_count = 2 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 2 if bos_token else 0 - special_tokens_count += 1 if cls_token else 0 - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) - else: - special_tokens_count = 1 if eos_token else 0 - special_tokens_count += 1 if sep_token_extra else 0 - special_tokens_count += 1 if bos_token else 0 - if len(tokens_a) > max_seq_length - special_tokens_count: - tokens_a = tokens_a[: max_seq_length - special_tokens_count] - # Add special tokens to sequence_a - tokens = tokens_a - if bos_token: - tokens = [bos_token] + tokens - if eos_token: - tokens += [eos_token] - segment_ids = [sequence_a_segment_id] * len(tokens) - - # Add sequence separator between sequences - if tokens_b and sep_token_extra: - tokens += [sep_token_extra] - segment_ids += [sequence_a_segment_id] - - # Add special tokens to sequence_b - if tokens_b: - if bos_token: - tokens += [bos_token] - segment_ids += [sequence_b_segment_id] - tokens += tokens_b - segment_ids += [sequence_b_segment_id] * (len(tokens_b)) - if eos_token: - tokens += [eos_token] - segment_ids += [sequence_b_segment_id] - - # Add classification token - for BERT models - if cls_token: - if cls_token_at_end: - tokens += [cls_token] - segment_ids += [cls_token_segment_id] - else: - tokens = [cls_token] + tokens - segment_ids = [cls_token_segment_id] + segment_ids - input_ids = tokenizer.tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. 
- padding_length = max_seq_length - len(input_ids) - pad_token_id = tokenizer.tokens_to_ids([pad_token])[0] - if pad_on_left: - input_ids = ([pad_token_id] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask - segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids - else: - input_ids = input_ids + ([pad_token_id] * padding_length) - input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) - if len(input_ids) != max_seq_length: - raise ValueError("input_ids must be of length max_seq_length") - if len(input_mask) != max_seq_length: - raise ValueError("input_mask must be of length max_seq_length") - if len(segment_ids) != max_seq_length: - raise ValueError("segment_ids must be of length max_seq_length") - if output_mode == "classification": - label_id = label_map[example.label] - elif output_mode == "regression": - label_id = np.float32(example.label) - else: - raise KeyError(output_mode) - - if ex_index < 5: - nemo.logging.info("*** Example ***") - nemo.logging.info("guid: %s" % (example.guid)) - nemo.logging.info("tokens: %s" % " ".join(list(map(str, tokens)))) - nemo.logging.info("input_ids: %s" % " ".join(list(map(str, input_ids)))) - nemo.logging.info("input_mask: %s" % " ".join(list(map(str, input_mask)))) - nemo.logging.info("segment_ids: %s" % " ".join(list(map(str, segment_ids)))) - nemo.logging.info("label: %s (id = %d)" % (example.label, label_id)) - - features.append( - InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id,) - ) - return features - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length. - - This will always truncate the longer sequence one token at a time. - This makes more sense than truncating an equal percent - of tokens from each, since if one sequence is very short then each token - that's truncated likely contains more information than a longer sequence. - """ - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, input_ids, input_mask, segment_ids, label_id): - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id diff --git a/nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py new file mode 100644 index 000000000000..26423c3aa549 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/glue_benchmark_dataset.py @@ -0,0 +1,593 @@ +""" +Copyright 2018 The Google AI Language Team Authors and +The HuggingFace Inc. team. +Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +Utility functions for GLUE tasks +Some transformer of this code were adapted from the HuggingFace library at +https://github.com/huggingface/transformers +""" +import csv +import os + +import numpy as np +from torch.utils.data import Dataset + +from nemo import logging + +__all__ = ['GLUEDataset'] + + +class GLUEDataset(Dataset): + def __init__(self, data_dir, tokenizer, max_seq_length, processor, output_mode, evaluate, token_params): + self.tokenizer = tokenizer + self.label_list = processor.get_labels() + self.examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir) + self.features = convert_examples_to_features( + self.examples, self.label_list, max_seq_length, tokenizer, output_mode, **token_params + ) + + def __len__(self): + return len(self.features) + + def __getitem__(self, idx): + feature = self.features[idx] + return ( + np.array(feature.input_ids), + np.array(feature.segment_ids), + np.array(feature.input_mask, dtype=np.long), + np.array(feature.label_id), + ) + + +def convert_examples_to_features( + examples, + label_list, + max_seq_length, + tokenizer, + output_mode, + bos_token=None, + eos_token='[SEP]', + pad_token='[PAD]', + cls_token='[CLS]', + sep_token_extra=None, + cls_token_at_end=False, + cls_token_segment_id=0, + pad_token_segment_id=0, + pad_on_left=False, + mask_padding_with_zero=True, + sequence_a_segment_id=0, + sequence_b_segment_id=1, +): + """ Loads a data file into a list of `InputBatch`s + `cls_token_at_end` define the location of the CLS token: + - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] + - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] + `cls_token_segment_id` define the segment id associated to the CLS + token (0 for BERT, 2 for XLNet) + The convention in BERT is: + (a) For sequence pairs: + tokens: [CLS] is this jack ##ville ? [SEP] no it is not . [SEP] + type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 + (b) For single sequences: + tokens: [CLS] the dog is hairy . [SEP] + type_ids: 0 0 0 0 0 0 0 + Where "type_ids" are used to indicate whether this is the first + sequence or the second sequence. The embedding vectors for `type=0` + and `type=1` were learned during pre-training and are added to the + wordpiece embedding vector (and position vector). This is + not *strictly* necessarysince the [SEP] token unambiguously separates + the sequences, but it makes it easier for the model to learn + the concept of sequences. + For classification tasks, the first vector (corresponding to [CLS]) + is used as as the "sentence vector". Note that this only makes sense + because the entire model is fine-tuned. + For NMT: + (a) For sequence pairs: + tokens: is this jack ##ville ? no it is not . + type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 + (b) For single sequences: + tokens: the dog is hairy . 
+ type_ids: 0 0 0 0 0 0 0 + """ + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for ex_index, example in enumerate(examples): + if ex_index % 10000 == 0: + logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + tokens_a = tokenizer.text_to_tokens(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.text_to_tokens(example.text_b) + + special_tokens_count = 2 if eos_token else 0 + special_tokens_count += 1 if sep_token_extra else 0 + special_tokens_count += 2 if bos_token else 0 + special_tokens_count += 1 if cls_token else 0 + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) + else: + special_tokens_count = 1 if eos_token else 0 + special_tokens_count += 1 if sep_token_extra else 0 + special_tokens_count += 1 if bos_token else 0 + if len(tokens_a) > max_seq_length - special_tokens_count: + tokens_a = tokens_a[: max_seq_length - special_tokens_count] + # Add special tokens to sequence_a + tokens = tokens_a + if bos_token: + tokens = [bos_token] + tokens + if eos_token: + tokens += [eos_token] + segment_ids = [sequence_a_segment_id] * len(tokens) + + # Add sequence separator between sequences + if tokens_b and sep_token_extra: + tokens += [sep_token_extra] + segment_ids += [sequence_a_segment_id] + + # Add special tokens to sequence_b + if tokens_b: + if bos_token: + tokens += [bos_token] + segment_ids += [sequence_b_segment_id] + tokens += tokens_b + segment_ids += [sequence_b_segment_id] * (len(tokens_b)) + if eos_token: + tokens += [eos_token] + segment_ids += [sequence_b_segment_id] + + # Add classification token - for BERT models + if cls_token: + if cls_token_at_end: + tokens += [cls_token] + segment_ids += [cls_token_segment_id] + else: + tokens = [cls_token] + tokens + segment_ids = [cls_token_segment_id] + segment_ids + input_ids = tokenizer.tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. 
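+        # Illustrative sketch of this padding step (values below are made up,
+        # not from the original patch): with max_seq_length=8 and the 7-token
+        # single sequence [CLS] the dog is hairy . [SEP], one pad id is appended,
+        # input_mask ends up [1, 1, 1, 1, 1, 1, 1, 0] and segment_ids stays all zeros.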
+ padding_length = max_seq_length - len(input_ids) + pad_token_id = tokenizer.tokens_to_ids([pad_token])[0] + if pad_on_left: + input_ids = ([pad_token_id] * padding_length) + input_ids + input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask + segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids + else: + input_ids = input_ids + ([pad_token_id] * padding_length) + input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) + if len(input_ids) != max_seq_length: + raise ValueError("input_ids must be of length max_seq_length") + if len(input_mask) != max_seq_length: + raise ValueError("input_mask must be of length max_seq_length") + if len(segment_ids) != max_seq_length: + raise ValueError("segment_ids must be of length max_seq_length") + if output_mode == "classification": + label_id = label_map[example.label] + elif output_mode == "regression": + label_id = np.float32(example.label) + else: + raise KeyError(output_mode) + + if ex_index < 5: + logging.info("*** Example ***") + logging.info("guid: %s" % (example.guid)) + logging.info("tokens: %s" % " ".join(list(map(str, tokens)))) + logging.info("input_ids: %s" % " ".join(list(map(str, input_ids)))) + logging.info("input_mask: %s" % " ".join(list(map(str, input_mask)))) + logging.info("segment_ids: %s" % " ".join(list(map(str, segment_ids)))) + logging.info("label: %s (id = %d)" % (example.label, label_id)) + + features.append( + InputFeatures(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id) + ) + return features + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length. + + This will always truncate the longer sequence one token at a time. + This makes more sense than truncating an equal percent + of tokens from each, since if one sequence is very short then each token + that's truncated likely contains more information than a longer sequence. + """ + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +""" +Utility functions for GLUE tasks +This code was adapted from the HuggingFace library at +https://github.com/huggingface/transformers +""" + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_id): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. + For single sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second + sequence. Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
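+
+        Example (hypothetical MRPC-style pair; the values are made up for
+        illustration only):
+            InputExample(guid='train-1',
+                         text_a='The cat sat on the mat.',
+                         text_b='A cat was sitting on a mat.',
+                         label='1')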
+ """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with open(input_file, "r", encoding="utf-8-sig") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + # if sys.version_info[0] == 2: + # line = list(unicode(cell, 'utf-8') for cell in line) + lines.append(line) + return lines + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + logging.info(f'LOOKING AT {os.path.join(data_dir, "train.tsv")}') + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[3] + text_b = line[4] + label = line[0] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[8] + text_b = line[9] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MnliMismatchedProcessor(MnliProcessor): + """Processor for the MultiNLI Mismatched data set (GLUE version).""" + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched") + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + 
"""See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = line[3] + label = line[1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class Sst2Processor(DataProcessor): + """Processor for the SST-2 data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[0] + label = line[1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class StsbProcessor(DataProcessor): + """Processor for the STS-B data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return [None] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[7] + text_b = line[8] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QqpProcessor(DataProcessor): + """Processor for the QQP data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + try: + text_a = line[3] + text_b = line[4] + label = line[5] + except IndexError: + continue + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QnliProcessor(DataProcessor): + """Processor for the QNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates 
examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class RteProcessor(DataProcessor): + """Processor for the RTE data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class WnliProcessor(DataProcessor): + """Processor for the WNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +processors = { + "cola": ColaProcessor, + "mnli": MnliProcessor, + "mnli-mm": MnliMismatchedProcessor, + "mrpc": MrpcProcessor, + "sst-2": Sst2Processor, + "sts-b": StsbProcessor, + "qqp": QqpProcessor, + "qnli": QnliProcessor, + "rte": RteProcessor, + "wnli": WnliProcessor, +} +output_modes = { + "cola": "classification", + "mnli": "classification", + "mnli-mm": "classification", + "mrpc": "classification", + "sst-2": "classification", + "sts-b": "regression", + "qqp": "classification", + "qnli": "classification", + "rte": "classification", + "wnli": "classification", +} +GLUE_TASKS_NUM_LABELS = { + "cola": 2, + "mnli": 3, + "mrpc": 2, + "sst-2": 2, + "sts-b": 1, + "qqp": 2, + "qnli": 2, + "rte": 2, + "wnli": 2, +} diff --git a/nemo/collections/nlp/data/datasets/joint_intent_slot.py b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset.py similarity index 54% rename from nemo/collections/nlp/data/datasets/joint_intent_slot.py rename to nemo/collections/nlp/data/datasets/joint_intent_slot_dataset.py index 5eae9c95a766..4abc70923226 100644 --- a/nemo/collections/nlp/data/datasets/joint_intent_slot.py +++ b/nemo/collections/nlp/data/datasets/joint_intent_slot_dataset.py @@ -18,14 +18,26 @@ Some parts of this code were adapted from the HuggingFace library at https://github.com/huggingface/pytorch-pretrained-BERT """ - import itertools import random import numpy as np from torch.utils.data import Dataset -from . 
import utils +from nemo import logging +from nemo.collections.nlp.data.datasets.datasets_utils import ( + get_label_stats, + get_stats, + merge, + process_atis, + process_dialogflow, + process_jarvis_datasets, + process_mturk, + process_snips, +) +from nemo.collections.nlp.utils.common_nlp_utils import calc_class_weights, get_vocab, if_exist, label2idx + +__all__ = ['BertJointIntentSlotDataset', 'BertJointIntentSlotInferDataset', 'JointIntentSlotDataDesc'] def get_features( @@ -84,8 +96,8 @@ def get_features( all_slots.append(slots) max_seq_length = min(max_seq_length, max(sent_lengths)) - nemo.logging.info(f'Max length: {max_seq_length}') - utils.get_stats(sent_lengths) + logging.info(f'Max length: {max_seq_length}') + get_stats(sent_lengths) too_long_count = 0 for i, subtokens in enumerate(all_subtokens): @@ -113,16 +125,9 @@ def get_features( all_segment_ids.append([0] * max_seq_length) - nemo.logging.info(f'{too_long_count} are longer than {max_seq_length}') + logging.info(f'{too_long_count} are longer than {max_seq_length}') - return ( - all_input_ids, - all_segment_ids, - all_input_mask, - all_loss_mask, - all_subtokens_mask, - all_slots, - ) + return (all_input_ids, all_segment_ids, all_input_mask, all_loss_mask, all_subtokens_mask, all_slots) class BertJointIntentSlotDataset(Dataset): @@ -262,3 +267,139 @@ def __getitem__(self, idx): np.array(self.all_loss_mask[idx]), np.array(self.all_subtokens_mask[idx]), ) + + +class JointIntentSlotDataDesc: + """ Convert the raw data to the standard format supported by + JointIntentSlotDataset. + + By default, the None label for slots is 'O'. + + JointIntentSlotDataset requires two files: + + input_file: file with sequences and their labels. + the first line is header (sentence [tab] label) + each line should be [sentence][tab][label] + + slot_file: file with slot labels, each line corresponding to + slot labels for a sentence in input_file. No header. + + To keep the mapping from label index to label consistent during + training and inference, we require the following files: + dict.intents.csv: each line is an intent. The first line + corresponds to the 0 intent label, the second line + corresponds to the 1 intent label, and so on. + + dict.slots.csv: each line is a slot. The first line + corresponds to the 0 slot label, the second line + corresponds to the 1 slot label, and so on. + + Args: + data_dir (str): the directory of the dataset + do_lower_case (bool): whether to lowercase the dataset + dataset_name (str): the name of the dataset. If it's a dataset + that follows the standard JointIntentSlotDataset format, + you can set the name as 'default'. + none_slot_label (str): the label for slots that aren't identified, + defaults to 'O' + pad_label (int): the int used for padding. If set to -1, + it'll be set to whatever the None label is. 
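+
+        Example (made-up rows) of the expected files, assuming the query
+        'book a flight to denver' with intent index 2 and slot indices 54/12:
+            train.tsv:        book a flight to denver<TAB>2
+            train_slots.tsv:  54 54 54 54 12
+        where 2 is the line index of the intent in dict.intents.csv and
+        54/12 are line indices of slots in dict.slots.csv.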
+ + """ + + def __init__(self, data_dir, do_lower_case=False, dataset_name='default', none_slot_label='O', pad_label=-1): + if dataset_name == 'atis': + self.data_dir = process_atis(data_dir, do_lower_case) + elif dataset_name == 'snips-atis': + self.data_dir, self.pad_label = merge( + data_dir, ['ATIS/nemo-processed-uncased', 'snips/nemo-processed-uncased/all'], dataset_name + ) + elif dataset_name == 'dialogflow': + self.data_dir = process_dialogflow(data_dir, do_lower_case) + elif dataset_name == 'mturk-processed': + self.data_dir = process_mturk(data_dir, do_lower_case) + elif dataset_name in set(['snips-light', 'snips-speak', 'snips-all']): + self.data_dir = process_snips(data_dir, do_lower_case) + if dataset_name.endswith('light'): + self.data_dir = f'{self.data_dir}/light' + elif dataset_name.endswith('speak'): + self.data_dir = f'{self.data_dir}/speak' + elif dataset_name.endswith('all'): + self.data_dir = f'{self.data_dir}/all' + elif dataset_name.startswith('jarvis'): + self.data_dir = process_jarvis_datasets( + data_dir, do_lower_case, dataset_name, modes=["train", "test", "eval"], ignore_prev_intent=False + ) + else: + if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']): + raise FileNotFoundError( + "Make sure that your data follows the standard format " + "supported by JointIntentSlotDataset. Your data must " + "contain dict.intents.csv and dict.slots.csv." + ) + self.data_dir = data_dir + + self.intent_dict_file = self.data_dir + '/dict.intents.csv' + self.slot_dict_file = self.data_dir + '/dict.slots.csv' + self.num_intents = len(get_vocab(self.intent_dict_file)) + slots = label2idx(self.slot_dict_file) + self.num_slots = len(slots) + + for mode in ['train', 'test', 'eval']: + + if not if_exist(self.data_dir, [f'{mode}.tsv']): + logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.') + continue + + slot_file = f'{self.data_dir}/{mode}_slots.tsv' + with open(slot_file, 'r') as f: + slot_lines = f.readlines() + + input_file = f'{self.data_dir}/{mode}.tsv' + with open(input_file, 'r') as f: + input_lines = f.readlines()[1:] # Skipping headers at index 0 + + if len(slot_lines) != len(input_lines): + raise ValueError( + "Make sure that the number of slot lines match the " + "number of intent lines. There should be a 1-1 " + "correspondence between every slot and intent lines." 
+ ) + + dataset = list(zip(slot_lines, input_lines)) + + raw_slots, queries, raw_intents = [], [], [] + for slot_line, input_line in dataset: + slot_list = [int(slot) for slot in slot_line.strip().split()] + raw_slots.append(slot_list) + parts = input_line.strip().split() + raw_intents.append(int(parts[-1])) + queries.append(' '.join(parts[:-1])) + + infold = input_file[: input_file.rfind('/')] + + logging.info(f'Three most popular intents during {mode}ing') + total_intents, intent_label_freq = get_label_stats(raw_intents, infold + f'/{mode}_intent_stats.tsv') + merged_slots = itertools.chain.from_iterable(raw_slots) + + logging.info(f'Three most popular slots during {mode}ing') + slots_total, slots_label_freq = get_label_stats(merged_slots, infold + f'/{mode}_slot_stats.tsv') + + if mode == 'train': + self.slot_weights = calc_class_weights(slots_label_freq) + logging.info(f'Slot weights are - {self.slot_weights}') + + self.intent_weights = calc_class_weights(intent_label_freq) + logging.info(f'Intent weights are - {self.intent_weights}') + + logging.info(f'Total intents - {total_intents}') + logging.info(f'Intent label frequency - {intent_label_freq}') + logging.info(f'Total Slots - {slots_total}') + logging.info(f'Slots label frequency - {slots_label_freq}') + + if pad_label != -1: + self.pad_label = pad_label + else: + if none_slot_label not in slots: + raise ValueError(f'none_slot_label {none_slot_label} not ' f'found in {self.slot_dict_file}.') + self.pad_label = slots[none_slot_label] diff --git a/nemo/collections/nlp/data/datasets/language_modeling.py b/nemo/collections/nlp/data/datasets/language_modeling.py deleted file mode 100644 index d8912da7f891..000000000000 --- a/nemo/collections/nlp/data/datasets/language_modeling.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Pytorch Dataset for training Neural Machine Translation.""" - -import numpy as np -from torch.utils.data import Dataset - -from .. 
import utils - - -class LanguageModelingDataset(Dataset): - def __init__(self, tokenizer, dataset, max_seq_length=512, batch_step=None): - self.tokenizer = tokenizer - self.max_seq_length = max_seq_length - self.batch_step = batch_step or self.max_seq_length - ids = utils.dataset_to_ids(dataset, tokenizer, add_bos_eos=False) - self.ids = np.array([j for i in ids for j in i]) - - def __len__(self): - return (len(self.ids) - self.max_seq_length) // self.batch_step - - def __getitem__(self, idx): - left = idx * self.batch_step - right = left + self.max_seq_length - src_ids = self.ids[left:right] - labels = self.ids[left + 1 : right + 1] - src_mask = (src_ids != self.tokenizer.pad_id()).astype(np.float32) - return src_ids, src_mask, labels diff --git a/nemo/collections/nlp/data/datasets/bert_pretraining.py b/nemo/collections/nlp/data/datasets/lm_bert_dataset.py similarity index 91% rename from nemo/collections/nlp/data/datasets/bert_pretraining.py rename to nemo/collections/nlp/data/datasets/lm_bert_dataset.py index 25ded90cd6ea..1ff975d25025 100644 --- a/nemo/collections/nlp/data/datasets/bert_pretraining.py +++ b/nemo/collections/nlp/data/datasets/lm_bert_dataset.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================= + """Pytorch Dataset for training BERT.""" import array @@ -26,6 +27,12 @@ from torch.utils.data import Dataset from tqdm import tqdm +from nemo import logging +from nemo.collections.nlp.data.datasets.datasets_utils import download_wkt2 +from nemo.collections.nlp.data.datasets.lm_transformer_dataset import create_vocab_mlm + +__all__ = ['BertPretrainingDataset', 'BertPretrainingPreprocessedDataset'] + class BertPretrainingDataset(Dataset): def __init__( @@ -187,7 +194,7 @@ def match_target_seq_length(document, target_seq_length, filename, line_idx, sen a_line_offset = self.sentence_indices[a_filename][a_line_idx] a_document = get_document(a_filename, a_line_offset) a_document, a_line_idx = match_target_seq_length( - a_document, target_seq_length_a, a_filename, a_line_idx, self.sentence_indices, + a_document, target_seq_length_a, a_filename, a_line_idx, self.sentence_indices ) is_last_line = a_line_idx >= (len(self.sentence_indices[a_filename]) - 1) @@ -221,7 +228,7 @@ def match_target_seq_length(document, target_seq_length, filename, line_idx, sen b_line_pos = self.sentence_indices[b_filename][b_line_idx] b_document = get_document(b_filename, b_line_pos) b_document, b_line_idx = match_target_seq_length( - b_document, target_seq_length_b, b_filename, b_line_idx, self.sentence_indices, + b_document, target_seq_length_b, b_filename, b_line_idx, self.sentence_indices ) def truncate_seq_pair(a, b, max_num_tokens): @@ -350,7 +357,7 @@ def __len__(self): return len(self.inputs[0]) def __getitem__(self, index): - [input_ids, input_mask, segment_ids, masked_lm_positions, masked_lm_ids, next_sentence_labels,] = [ + [input_ids, input_mask, segment_ids, masked_lm_positions, masked_lm_ids, next_sentence_labels] = [ input[index].astype(np.int64) for input in self.inputs ] @@ -367,11 +374,24 @@ def __getitem__(self, index): input_mask = np.asarray(input_mask, dtype=np.float32) output_mask = np.asarray(output_mask, dtype=np.float32) - return ( - input_ids, - segment_ids, - input_mask, - output_ids, - output_mask, - next_sentence_labels, - ) + return (input_ids, segment_ids, input_mask, output_ids, output_mask, next_sentence_labels) + + +class 
BERTPretrainingDataDesc: + def __init__(self, dataset_name, data_dir, vocab_size, sample_size, special_tokens, train_file=''): + if dataset_name == 'wikitext-2': + if not os.path.exists(data_dir): + data_dir = download_wkt2(data_dir) + self.data_dir, self.tokenizer_model = create_vocab_mlm( + data_dir, vocab_size, sample_size, special_tokens, train_file + ) + else: + logging.warning( + "Looks like you passed a dataset name that isn't " + "already supported by NeMo. Please make sure that " + "you build the preprocessing method for it." + ) + + self.train_file = f'{data_dir}/train.txt' + self.eval_file = f'{data_dir}/valid.txt' + self.test_file = f'{data_dir}/test.txt' diff --git a/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py b/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py new file mode 100644 index 000000000000..e2a9717abf11 --- /dev/null +++ b/nemo/collections/nlp/data/datasets/lm_transformer_dataset.py @@ -0,0 +1,187 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +"""Pytorch Dataset for training Neural Machine Translation.""" +import glob +import os +import pickle +import re + +import numpy as np +from sentencepiece import SentencePieceTrainer as SPT +from torch.utils.data import Dataset +from tqdm import tqdm + +from nemo import logging +from nemo.collections.nlp.data.datasets.datasets_utils import DATABASE_EXISTS_TMP, download_wkt2 +from nemo.collections.nlp.utils.common_nlp_utils import if_exist + +__all__ = ['LanguageModelingDataset'] + + +class LanguageModelingDataset(Dataset): + def __init__(self, tokenizer, dataset, max_seq_length=512, batch_step=None): + self.tokenizer = tokenizer + self.max_seq_length = max_seq_length + self.batch_step = batch_step or self.max_seq_length + ids = dataset_to_ids(dataset, tokenizer, add_bos_eos=False) + self.ids = np.array([j for i in ids for j in i]) + + def __len__(self): + return (len(self.ids) - self.max_seq_length) // self.batch_step + + def __getitem__(self, idx): + left = idx * self.batch_step + right = left + self.max_seq_length + src_ids = self.ids[left:right] + labels = self.ids[left + 1 : right + 1] + src_mask = (src_ids != self.tokenizer.pad_id()).astype(np.float32) + return src_ids, src_mask, labels + + +class LanguageModelDataDesc: + def __init__(self, dataset_name, data_dir, do_lower_case): + if dataset_name == 'wikitext-2': + if not os.path.exists(data_dir): + data_dir = download_wkt2(data_dir) + self.vocab_size = create_vocab_lm(data_dir, do_lower_case) + self.data_dir = data_dir + else: + logging.warning( + "Looks like you passed a dataset name that isn't " + "already supported by NeMo. Please make sure that " + "you build the preprocessing method for it." 
+ ) + + +def create_vocab_mlm( + data_dir, vocab_size, sample_size, special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'], train_file='' +): + vocab = special_tokens[:] + bert_dir = f'{data_dir}/bert' + if if_exist(bert_dir, ['tokenizer.model']): + logging.info(DATABASE_EXISTS_TMP.format('WikiText_BERT', bert_dir)) + return data_dir, f'{bert_dir}/tokenizer.model' + logging.info(f'Processing WikiText dataset and store at {bert_dir}') + os.makedirs(bert_dir, exist_ok=True) + + if not train_file: + files = glob.glob(f'{data_dir}/*.txt') + train_file = f'{bert_dir}/merged.txt' + logging.info(f"Merging {len(files)} txt files into {train_file}") + + with open(train_file, "w") as merged: + for file in tqdm(files): + with open(file, 'r') as inf: + content = inf.read().strip() + merged.write(content + '\n\n\n') + else: + train_file = f'{data_dir}/{train_file}' + + cmd = ( + f"--input={train_file} --model_prefix={bert_dir}/tokenizer " + f"--vocab_size={vocab_size - len(vocab)} " + f"--input_sentence_size={sample_size} " + f"--shuffle_input_sentence=true --hard_vocab_limit=false " + f"--bos_id=-1 --eos_id=-1" + ) + SPT.Train(cmd) + + # Add BERT control symbols + tokens = [] + + with open(f"{bert_dir}/tokenizer.vocab", "r") as f: + f.readline() # skip first token + + # Read tokens from each line and parse for vocab + for line in f: + piece = line.split("\t")[0] + token = piece[1:] if piece.startswith("▁") else f"##{piece}" + tokens.append(token) + + vocab.extend(tokens) + + # Save vocabulary to output file + with open(f'{bert_dir}/vocab.txt', "w") as f: + for token in vocab: + f.write(f"{token}\n".format()) + return data_dir, f'{bert_dir}/tokenizer.model' + + +def dataset_to_ids(dataset, tokenizer, cache_ids=False, add_bos_eos=True): + """ + Reads dataset from file line by line, tokenizes each line with tokenizer, + and returns list of lists which corresponds to ids of tokenized strings. 
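+    For example (hypothetical ids), a two-line file containing 'hello world' and
+    'bye' could come back as [[12, 47], [8]]; with add_bos_eos=True each inner
+    list additionally gets the tokenizer's BOS and EOS ids.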
+ + Args: + dataset: path to dataset + tokenizer: tokenizer to convert text into ids + cache_ids: if True, ids are saved to disk as pickle file + with similar name (e.g., data.txt --> data.txt.pkl) + add_bos_eos: bool, whether to add and symbols (e.g., for NMT) + Returns: + ids: list of ids which correspond to tokenized strings of the dataset + """ + + cached_ids_dataset = dataset + str(".pkl") + if os.path.isfile(cached_ids_dataset): + logging.info("Loading cached tokenized dataset ...") + ids = pickle.load(open(cached_ids_dataset, "rb")) + else: + logging.info("Tokenizing dataset ...") + data = open(dataset, "rb").readlines() + ids = [] + for sentence in data: + sent_ids = tokenizer.text_to_ids(sentence.decode("utf-8")) + if add_bos_eos: + sent_ids = [tokenizer.bos_id()] + sent_ids + [tokenizer.eos_id()] + ids.append(sent_ids) + if cache_ids: + logging.info("Caching tokenized dataset ...") + pickle.dump(ids, open(cached_ids_dataset, "wb")) + return ids + + +def create_vocab_lm(data_dir, do_lower_case): + if if_exist(data_dir, ['train.txt', 'vocab.txt']): + logging.info("Vocabulary has been created.") + with open(os.path.join(data_dir, 'vocab.txt'), 'r') as f: + vocab_size = len(f.readlines()) + return vocab_size + + logging.info(f'Creating vocabulary from training data at {data_dir}') + + with open(f'{data_dir}/train.txt', 'r') as f: + txt = f.read() + if do_lower_case: + txt = txt.lower() + lines = re.split(r'[\n]', txt) + sentences = [line.strip().split() for line in lines if line.strip()] + + vocab = {"[PAD]": 0, "[SEP]": 1, "[CLS]": 2, "[MASK]": 3} + idx = 4 + for sentence in sentences: + for word in sentence: + if word not in vocab: + vocab[word] = idx + idx += 1 + + with open(f'{data_dir}/vocab.txt', 'w') as f: + for word in sorted(vocab.keys()): + f.write(word + '\n') + logging.info(f"Created vocabulary of size {len(vocab)}") + + return len(vocab) diff --git a/nemo/collections/nlp/data/datasets/translation.py b/nemo/collections/nlp/data/datasets/machine_translation_dataset.py similarity index 77% rename from nemo/collections/nlp/data/datasets/translation.py rename to nemo/collections/nlp/data/datasets/machine_translation_dataset.py index e9c1134e70e0..0b8b049840ca 100644 --- a/nemo/collections/nlp/data/datasets/translation.py +++ b/nemo/collections/nlp/data/datasets/machine_translation_dataset.py @@ -1,3 +1,4 @@ +# ============================================================================= # Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -11,20 +12,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-# ============================================================================== +# ============================================================================= + """Pytorch Dataset for training Neural Machine Translation.""" + from collections import OrderedDict import numpy as np from torch.utils.data import Dataset -from ..utils import clean_src_and_target, dataset_to_ids +from nemo.collections.nlp.data.datasets.lm_transformer_dataset import dataset_to_ids + +__all__ = ['TranslationDataset'] class TranslationDataset(Dataset): - def __init__( - self, tokenizer_src, tokenizer_tgt, dataset_src, dataset_tgt, tokens_in_batch=1024, clean=False, - ): + def __init__(self, tokenizer_src, tokenizer_tgt, dataset_src, dataset_tgt, tokens_in_batch=1024, clean=False): self.src_tokenizer = tokenizer_src self.tgt_tokenizer = tokenizer_tgt @@ -152,3 +155,36 @@ def pack_data_into_batches(self, src_ids, tgt_ids): batches.pop(-1) return batches + + +def clean_src_and_target(src_ids, tgt_ids, max_tokens=128, min_tokens=3, max_tokens_diff=25, max_tokens_ratio=2.5): + """ + Cleans source and target sentences to get rid of noisy data. + Specifically, a pair of sentences is removed if + -- either source or target is longer than *max_tokens* + -- either source or target is shorter than *min_tokens* + -- absolute difference between source and target is larger than + *max_tokens_diff* + -- one sentence is *max_tokens_ratio* times longer than the other + """ + + if len(src_ids) != len(tgt_ids): + raise ValueError("Source and target corpora have different lengths!") + src_ids_, tgt_ids_ = [], [] + for i in range(len(src_ids)): + src_len, tgt_len = len(src_ids[i]), len(tgt_ids[i]) + if ( + src_len > max_tokens + or tgt_len > max_tokens + or src_len < min_tokens + or tgt_len < min_tokens + or (src_ids[i] == tgt_ids[i]) + or np.abs(src_len - tgt_len) > max_tokens_diff + ): + continue + ratio = max(src_len - 2, 1) / max(tgt_len - 2, 1) + if ratio > max_tokens_ratio or ratio < (1 / max_tokens_ratio): + continue + src_ids_.append(src_ids[i]) + tgt_ids_.append(tgt_ids[i]) + return src_ids_, tgt_ids_ diff --git a/nemo/collections/nlp/data/datasets/punctuation_capitalization.py b/nemo/collections/nlp/data/datasets/punctuation_capitalization_dataset.py similarity index 91% rename from nemo/collections/nlp/data/datasets/punctuation_capitalization.py rename to nemo/collections/nlp/data/datasets/punctuation_capitalization_dataset.py index ecabfc64032f..b8d8bfcd728b 100644 --- a/nemo/collections/nlp/data/datasets/punctuation_capitalization.py +++ b/nemo/collections/nlp/data/datasets/punctuation_capitalization_dataset.py @@ -19,6 +19,8 @@ https://github.com/huggingface/pytorch-pretrained-BERT """ +__all__ = ['BertPunctuationCapitalizationDataset', 'BertPunctuationCapitalizationInferDataset'] + import itertools import os import pickle @@ -27,8 +29,8 @@ import numpy as np from torch.utils.data import Dataset -import nemo -from . 
import utils +import nemo.collections.nlp.data.datasets.datasets_utils as utils +from nemo import logging def get_features( @@ -123,7 +125,7 @@ def get_features( capit_all_labels.append(capit_labels) max_seq_length = min(max_seq_length, max(sent_lengths)) - nemo.logging.info(f'Max length: {max_seq_length}') + logging.info(f'Max length: {max_seq_length}') utils.get_stats(sent_lengths) too_long_count = 0 @@ -154,18 +156,18 @@ def get_features( all_segment_ids.append([0] * max_seq_length) - nemo.logging.info(f'{too_long_count} are longer than {max_seq_length}') + logging.info(f'{too_long_count} are longer than {max_seq_length}') for i in range(min(len(all_input_ids), 5)): - nemo.logging.info("*** Example ***") - nemo.logging.info("i: %s" % (i)) - nemo.logging.info("subtokens: %s" % " ".join(list(map(str, all_subtokens[i])))) - nemo.logging.info("loss_mask: %s" % " ".join(list(map(str, all_loss_mask[i])))) - nemo.logging.info("input_mask: %s" % " ".join(list(map(str, all_input_mask[i])))) - nemo.logging.info("subtokens_mask: %s" % " ".join(list(map(str, all_subtokens_mask[i])))) + logging.info("*** Example ***") + logging.info("i: %s" % (i)) + logging.info("subtokens: %s" % " ".join(list(map(str, all_subtokens[i])))) + logging.info("loss_mask: %s" % " ".join(list(map(str, all_loss_mask[i])))) + logging.info("input_mask: %s" % " ".join(list(map(str, all_input_mask[i])))) + logging.info("subtokens_mask: %s" % " ".join(list(map(str, all_subtokens_mask[i])))) if with_label: - nemo.logging.info("punct_labels: %s" % " ".join(list(map(str, punct_all_labels[i])))) - nemo.logging.info("capit_labels: %s" % " ".join(list(map(str, capit_all_labels[i])))) + logging.info("punct_labels: %s" % " ".join(list(map(str, punct_all_labels[i])))) + logging.info("capit_labels: %s" % " ".join(list(map(str, capit_all_labels[i])))) return ( all_input_ids, @@ -245,7 +247,7 @@ def __init__( if use_cache and os.path.exists(features_pkl): # If text_file was already processed, load from pickle features = pickle.load(open(features_pkl, 'rb')) - nemo.logging.info(f'features restored from {features_pkl}') + logging.info(f'features restored from {features_pkl}') else: if num_samples == 0: raise ValueError("num_samples has to be positive", num_samples) @@ -288,16 +290,16 @@ def __init__( # for dev/test sets use label mapping from training set if punct_label_ids: if len(punct_label_ids) != len(punct_unique_labels): - nemo.logging.info( + logging.info( 'Not all labels from the specified' + 'label_ids dictionary are present in the' + 'current dataset. Using the provided' + 'label_ids dictionary.' ) else: - nemo.logging.info('Using the provided label_ids dictionary.') + logging.info('Using the provided label_ids dictionary.') else: - nemo.logging.info( + logging.info( 'Creating a new label to label_id dictionary.' 
+ ' It\'s recommended to use label_ids generated' + ' during training for dev/test sets to avoid' @@ -332,7 +334,7 @@ def create_label_ids(unique_labels, pad_label=pad_label): if use_cache: pickle.dump(features, open(features_pkl, "wb")) - nemo.logging.info(f'features saved to {features_pkl}') + logging.info(f'features saved to {features_pkl}') self.all_input_ids = features[0] self.all_segment_ids = features[1] @@ -348,14 +350,14 @@ def create_label_ids(unique_labels, pad_label=pad_label): def get_stats_and_save(all_labels, label_ids, name): infold = text_file[: text_file.rfind('/')] merged_labels = itertools.chain.from_iterable(all_labels) - nemo.logging.info('Three most popular labels') + logging.info('Three most popular labels') _, label_frequencies = utils.get_label_stats(merged_labels, infold + '/label_count_' + name + '.tsv') out = open(os.path.join(infold, name + '_label_ids.csv'), 'w') labels, _ = zip(*sorted(label_ids.items(), key=lambda x: x[1])) out.write('\n'.join(labels)) - nemo.logging.info(f'Labels: {label_ids}') - nemo.logging.info(f'Labels mapping saved to : {out.name}') + logging.info(f'Labels: {label_ids}') + logging.info(f'Labels mapping saved to : {out.name}') return label_frequencies diff --git a/nemo/collections/nlp/data/datasets/squad.py b/nemo/collections/nlp/data/datasets/qa_squad_dataset.py similarity index 88% rename from nemo/collections/nlp/data/datasets/squad.py rename to nemo/collections/nlp/data/datasets/qa_squad_dataset.py index 01f99d3c5d89..bf68e7bb7a55 100644 --- a/nemo/collections/nlp/data/datasets/squad.py +++ b/nemo/collections/nlp/data/datasets/qa_squad_dataset.py @@ -26,9 +26,9 @@ from torch.utils.data import Dataset from tqdm import tqdm -import nemo -from ...utils.metrics.squad_metrics import ( - _compute_softmax, +from nemo import logging +from nemo.collections.nlp.data.datasets.glue_benchmark_dataset import DataProcessor +from nemo.collections.nlp.metrics.squad_metrics import ( _get_best_indexes, apply_no_ans_threshold, exact_match_score, @@ -39,9 +39,10 @@ merge_eval, normalize_answer, ) -from .utils import DataProcessor -from nemo.collections.nlp.utils.nlp_utils import _is_whitespace +from nemo.collections.nlp.utils.common_nlp_utils import _is_whitespace +from nemo.collections.nlp.utils.loss_utils import _compute_softmax +__all__ = ['SquadDataset'] """ Utility functions for Question Answering NLP tasks @@ -72,7 +73,7 @@ class SquadDataset(Dataset): """ def __init__( - self, data_dir, tokenizer, doc_stride, max_query_length, max_seq_length, version_2_with_negative, mode, + self, data_dir, tokenizer, doc_stride, max_query_length, max_seq_length, version_2_with_negative, mode ): self.tokenizer = tokenizer if not version_2_with_negative: @@ -90,7 +91,7 @@ def __init__( cached_train_features_file = ( data_dir + '/cache' - + '_{0}_{1}_{2}_{3}'.format(mode, str(max_seq_length), str(doc_stride), str(max_query_length),) + + '_{0}_{1}_{2}_{3}'.format(mode, str(max_seq_length), str(doc_stride), str(max_query_length)) ) if os.path.exists(cached_train_features_file): @@ -107,9 +108,7 @@ def __init__( ) master_device = not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0 if master_device: - nemo.logging.info( - " Saving train features into cached file %s", cached_train_features_file, - ) + logging.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(self.features, writer) elif mode == "dev": @@ -159,7 +158,7 @@ def get_predictions( 
example_index_to_features[feature.example_index].append(feature) _PrelimPrediction = collections.namedtuple( - "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit",], + "PrelimPrediction", ["feature_index", "start_index", "end_index", "start_logit", "end_logit"] ) all_predictions = collections.OrderedDict() @@ -233,7 +232,7 @@ def get_predictions( end_logit=null_end_logit, ) ) - prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True,) + prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) _NbestPrediction = collections.namedtuple("NbestPrediction", ["text", "start_logit", "end_logit"]) @@ -268,21 +267,17 @@ def get_predictions( final_text = "" seen_predictions[final_text] = True - nbest.append( - _NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit,) - ) + nbest.append(_NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit)) # if we didn't include the empty option in the n-best, include it if version_2_with_negative: if "" not in seen_predictions: - nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit,)) + nbest.append(_NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit)) # In very rare edge cases we could only # have single null pred. We just create a nonce prediction # in this case to avoid failure. if len(nbest) == 1: - nbest.insert( - 0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0), - ) + nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. 
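
(A minimal sketch, not part of this patch: how the n-best assembly above orders candidate answer spans. The names `Candidate` and `rank_candidates` are hypothetical; `SquadDataset.get_predictions` uses its own "PrelimPrediction"/"NbestPrediction" namedtuples, but the ranking rule, the sum of start and end logits, is the same.)

import collections

# Hypothetical stand-in for the namedtuples used in get_predictions above.
Candidate = collections.namedtuple("Candidate", ["text", "start_logit", "end_logit"])

def rank_candidates(candidates, n_best_size=20):
    # Sort candidate spans by the sum of their start and end logits, highest
    # first, and keep only the top n_best_size entries, mirroring how
    # prelim_predictions is sorted before nbest is built.
    ranked = sorted(candidates, key=lambda c: c.start_logit + c.end_logit, reverse=True)
    return ranked[:n_best_size]

# Example: the higher-scoring span outranks the empty (null) prediction.
spans = [Candidate("a gallon of milk", 2.1, 1.7), Candidate("", 0.3, 0.2)]
best = rank_candidates(spans)[0].text  # "a gallon of milk"
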
@@ -327,9 +322,7 @@ def get_predictions( return all_predictions, all_nbest_json, scores_diff_json - def evaluate_predictions( - self, all_predictions, no_answer_probs=None, no_answer_probability_threshold=1.0, - ): + def evaluate_predictions(self, all_predictions, no_answer_probs=None, no_answer_probability_threshold=1.0): qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in self.examples} has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer] no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer] @@ -339,10 +332,10 @@ def evaluate_predictions( exact, f1 = self.get_raw_scores(all_predictions) exact_threshold = apply_no_ans_threshold( - exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold, + exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold ) f1_threshold = apply_no_ans_threshold( - f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold, + f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold ) evaluation = make_eval_dict(exact_threshold, f1_threshold) @@ -356,9 +349,7 @@ def evaluate_predictions( merge_eval(evaluation, no_ans_eval, "NoAns") if no_answer_probs: - find_all_best_thresh( - evaluation, all_predictions, exact, f1, no_answer_probs, qas_id_to_has_answer, - ) + find_all_best_thresh(evaluation, all_predictions, exact, f1, no_answer_probs, qas_id_to_has_answer) return evaluation["best_exact"], evaluation["best_f1"] @@ -380,7 +371,7 @@ def get_raw_scores(self, preds): gold_answers = [""] if qas_id not in preds: - print("Missing prediction for %s" % qas_id) + logging.warning("Missing prediction for %s" % qas_id) continue prediction = preds[qas_id] @@ -401,7 +392,7 @@ def evaluate( null_score_diff_threshold, ): - (all_predictions, all_nbest_json, scores_diff_json,) = self.get_predictions( + (all_predictions, all_nbest_json, scores_diff_json) = self.get_predictions( unique_ids, start_logits, end_logits, @@ -417,9 +408,7 @@ def evaluate( return exact_match, f1, all_predictions -def convert_examples_to_features( - examples, tokenizer, max_seq_length, doc_stride, max_query_length, has_groundtruth, -): +def convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length, has_groundtruth): """Loads a data file into a list of `InputBatch`s.""" unique_id = 1000000000 @@ -459,7 +448,7 @@ def convert_examples_to_features( tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text, + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text ) # The -3 accounts for [CLS], [SEP] and [SEP] @@ -544,28 +533,28 @@ def convert_examples_to_features( end_position = 0 if example_index < 1: - nemo.logging.info("*** Example ***") - nemo.logging.info("unique_id: %s" % (unique_id)) - nemo.logging.info("example_index: %s" % (example_index)) - nemo.logging.info("doc_span_index: %s" % (doc_span_index)) - nemo.logging.info("tokens: %s" % " ".join(tokens)) - nemo.logging.info( + logging.info("*** Example ***") + logging.info("unique_id: %s" % (unique_id)) + logging.info("example_index: %s" % (example_index)) + logging.info("doc_span_index: %s" % (doc_span_index)) + logging.info("tokens: %s" % " ".join(tokens)) + logging.info( "token_to_orig_map: %s" % " ".join(["%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]) ) - 
nemo.logging.info( + logging.info( "token_is_max_context: %s" % " ".join(["%d:%s" % (x, y) for (x, y) in token_is_max_context.items()]) ) - nemo.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - nemo.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) - nemo.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) if has_groundtruth and example.is_impossible: - nemo.logging.info("impossible example") + logging.info("impossible example") if has_groundtruth and not example.is_impossible: answer_text = " ".join(tokens[start_position : (end_position + 1)]) - nemo.logging.info("start_position: %d" % (start_position)) - nemo.logging.info("end_position: %d" % (end_position)) - nemo.logging.info("answer: %s" % (answer_text)) + logging.info("start_position: %d" % (start_position)) + logging.info("end_position: %d" % (end_position)) + logging.info("answer: %s" % (answer_text)) features.append( InputFeatures( @@ -651,7 +640,7 @@ def get_train_examples(self, data_dir, filename=None): ) with open( - os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8", + os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8" ) as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "train") @@ -676,7 +665,7 @@ def get_dev_examples(self, data_dir, filename=None): SquadV1Processor or SquadV2Processor" ) with open( - os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8", + os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8" ) as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "dev") @@ -797,7 +786,7 @@ def __init__( # start_position is index of word, end_position inclusive self.start_position = char_to_word_offset[start_position_character] self.end_position = char_to_word_offset[ - min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1,) + min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1) ] @@ -833,3 +822,45 @@ def _check_is_max_context(doc_spans, cur_span_index, position): best_span_index = span_index return cur_span_index == best_span_index + + +def check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token. + + Because of the sliding window approach taken to scoring documents, + a single token can appear in multiple documents. + + Example: + Doc: the man went to the store and bought a gallon of milk + Span A: the man went to the + Span B: to the store and bought + Span C: and bought a gallon of + ... + + Now the word 'bought' will have two scores from spans B and C. We only + want to consider the score with "maximum context", which we define as + the *minimum* of its left and right context (the *sum* of left and + right context will always be the same, of course). + + In the example the maximum context for 'bought' would be span C since + it has 1 left context and 3 right context, while span B has 4 left context + and 0 right context. + + Code adapted from the code by the Google AI and HuggingFace. 
+ """ + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index diff --git a/nemo/collections/nlp/data/datasets/sentence_classification.py b/nemo/collections/nlp/data/datasets/text_classification_dataset.py similarity index 54% rename from nemo/collections/nlp/data/datasets/sentence_classification.py rename to nemo/collections/nlp/data/datasets/text_classification_dataset.py index 1847eaf7b205..11340ffa4da5 100644 --- a/nemo/collections/nlp/data/datasets/sentence_classification.py +++ b/nemo/collections/nlp/data/datasets/text_classification_dataset.py @@ -25,11 +25,24 @@ import numpy as np from torch.utils.data import Dataset -import nemo -from . import utils - - -class BertSentenceClassificationDataset(Dataset): +from nemo import logging +from nemo.collections.nlp.data.datasets.datasets_utils import ( + get_intent_labels, + get_label_stats, + get_stats, + process_imdb, + process_jarvis_datasets, + process_nlu, + process_sst_2, + process_thucnews, +) +from nemo.collections.nlp.utils.callback_utils import list2str +from nemo.collections.nlp.utils.common_nlp_utils import calc_class_weights, if_exist + +__all__ = ['BertTextClassificationDataset'] + + +class BertTextClassificationDataset(Dataset): """A dataset class that converts from raw data to a dataset that can be used by DataLayerNM. @@ -44,16 +57,14 @@ class BertSentenceClassificationDataset(Dataset): shuffle (bool): whether to shuffle your data. """ - def __init__( - self, input_file, max_seq_length, tokenizer, num_samples=-1, shuffle=True, - ): + def __init__(self, input_file, max_seq_length, tokenizer, num_samples=-1, shuffle=True): with open(input_file, "r") as f: sent_labels, all_sent_subtokens = [], [] sent_lengths = [] too_long_count = 0 lines = f.readlines()[1:] - nemo.logging.info(f'{input_file}: {len(lines)}') + logging.info(f'{input_file}: {len(lines)}') if shuffle or num_samples > -1: random.seed(0) @@ -63,7 +74,7 @@ def __init__( for index, line in enumerate(lines): if index % 20000 == 0: - nemo.logging.debug(f"Processing line {index}/{len(lines)}") + logging.debug(f"Processing line {index}/{len(lines)}") sent_label = int(line.split()[-1]) sent_labels.append(sent_label) @@ -79,7 +90,7 @@ def __init__( all_sent_subtokens.append(sent_subtokens) sent_lengths.append(len(sent_subtokens)) - utils.get_stats(sent_lengths) + get_stats(sent_lengths) self.max_seq_length = min(max_seq_length, max(sent_lengths)) for i in range(len(all_sent_subtokens)): @@ -88,7 +99,7 @@ def __init__( all_sent_subtokens[i] = ['[CLS]'] + shorten_sent too_long_count += 1 - nemo.logging.info( + logging.info( f'{too_long_count} out of {len(sent_lengths)} \ sentencess with more than {max_seq_length} subtokens.' 
) @@ -120,8 +131,7 @@ def convert_sequences_to_features(self, all_sent_subtokens, sent_labels, tokeniz for sent_id in range(len(all_sent_subtokens)): sent_subtokens = all_sent_subtokens[sent_id] sent_label = sent_labels[sent_id] - word_count = 0 - # input_ids = tokenizer.tokens_to_ids(sent_subtokens) + input_ids = [tokenizer._convert_token_to_id(t) for t in sent_subtokens] # The mask has 1 for real tokens and 0 for padding tokens. @@ -138,12 +148,12 @@ def convert_sequences_to_features(self, all_sent_subtokens, sent_labels, tokeniz assert len(input_mask) == max_seq_length if sent_id == 0: - nemo.logging.info("*** Example ***") - nemo.logging.info("example_index: %s" % sent_id) - nemo.logging.info("subtokens: %s" % " ".join(sent_subtokens)) - nemo.logging.info("sent_label: %s" % sent_label) - nemo.logging.info("input_ids: %s" % utils.list2str(input_ids)) - nemo.logging.info("input_mask: %s" % utils.list2str(input_mask)) + logging.info("*** Example ***") + logging.info("example_index: %s" % sent_id) + logging.info("subtokens: %s" % " ".join(sent_subtokens)) + logging.info("sent_label: %s" % sent_label) + logging.info("input_ids: %s" % list2str(input_ids)) + logging.info("input_mask: %s" % list2str(input_mask)) self.features.append( InputFeatures( @@ -165,3 +175,74 @@ def __init__(self, sent_id, sent_label, input_ids, input_mask, segment_ids): self.input_ids = input_ids self.input_mask = input_mask self.segment_ids = segment_ids + + +class SentenceClassificationDataDesc: + def __init__(self, dataset_name, data_dir, do_lower_case): + if dataset_name == 'sst-2': + self.data_dir = process_sst_2(data_dir) + self.num_labels = 2 + self.eval_file = self.data_dir + '/dev.tsv' + elif dataset_name == 'imdb': + self.num_labels = 2 + self.data_dir = process_imdb(data_dir, do_lower_case) + self.eval_file = self.data_dir + '/test.tsv' + elif dataset_name == 'thucnews': + self.num_labels = 14 + self.data_dir = process_thucnews(data_dir) + self.eval_file = self.data_dir + '/test.tsv' + elif dataset_name.startswith('nlu-'): + if dataset_name.endswith('chat'): + self.data_dir = f'{data_dir}/ChatbotCorpus.json' + self.num_labels = 2 + elif dataset_name.endswith('ubuntu'): + self.data_dir = f'{data_dir}/AskUbuntuCorpus.json' + self.num_labels = 5 + elif dataset_name.endswith('web'): + data_dir = f'{data_dir}/WebApplicationsCorpus.json' + self.num_labels = 8 + self.data_dir = process_nlu(data_dir, do_lower_case, dataset_name=dataset_name) + self.eval_file = self.data_dir + '/test.tsv' + elif dataset_name.startswith('jarvis'): + self.data_dir = process_jarvis_datasets( + data_dir, do_lower_case, dataset_name, modes=['train', 'test', 'eval'], ignore_prev_intent=False + ) + + intents = get_intent_labels(f'{self.data_dir}/dict.intents.csv') + self.num_labels = len(intents) + else: + raise ValueError( + "Looks like you passed a dataset name that isn't " + "already supported by NeMo. Please make sure " + "that you build the preprocessing method for it." 
+ ) + + self.train_file = self.data_dir + '/train.tsv' + + for mode in ['train', 'test', 'eval']: + + if not if_exist(self.data_dir, [f'{mode}.tsv']): + logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.') + continue + + input_file = f'{self.data_dir}/{mode}.tsv' + with open(input_file, 'r') as f: + input_lines = f.readlines()[1:] # Skipping headers at index 0 + + queries, raw_sentences = [], [] + for input_line in input_lines: + parts = input_line.strip().split() + raw_sentences.append(int(parts[-1])) + queries.append(' '.join(parts[:-1])) + + infold = input_file[: input_file.rfind('/')] + + logging.info(f'Three most popular classes during {mode}ing') + total_sents, sent_label_freq = get_label_stats(raw_sentences, infold + f'/{mode}_sentence_stats.tsv') + + if mode == 'train': + self.class_weights = calc_class_weights(sent_label_freq) + logging.info(f'Class weights are - {self.class_weights}') + + logging.info(f'Total Sentences - {total_sents}') + logging.info(f'Sentence class frequencies - {sent_label_freq}') diff --git a/nemo/collections/nlp/data/datasets/token_classification.py b/nemo/collections/nlp/data/datasets/token_classification_dataset.py similarity index 88% rename from nemo/collections/nlp/data/datasets/token_classification.py rename to nemo/collections/nlp/data/datasets/token_classification_dataset.py index 857153ca9b3a..966fefd42498 100644 --- a/nemo/collections/nlp/data/datasets/token_classification.py +++ b/nemo/collections/nlp/data/datasets/token_classification_dataset.py @@ -13,6 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + """ Utility functions for Token Classification NLP tasks Some parts of this code were adapted from the HuggingFace library at @@ -27,8 +28,10 @@ import numpy as np from torch.utils.data import Dataset -import nemo -from . 
import utils +import nemo.collections.nlp.data.datasets.datasets_utils as datasets_utils +from nemo import logging + +__all__ = ['BertTokenClassificationDataset', 'BertTokenClassificationInferDataset'] def get_features( @@ -109,8 +112,8 @@ def get_features( all_labels.append(labels) max_seq_length = min(max_seq_length, max(sent_lengths)) - nemo.logging.info(f'Max length: {max_seq_length}') - utils.get_stats(sent_lengths) + logging.info(f'Max length: {max_seq_length}') + datasets_utils.get_stats(sent_lengths) too_long_count = 0 for i, subtokens in enumerate(all_subtokens): @@ -138,27 +141,18 @@ def get_features( all_segment_ids.append([0] * max_seq_length) - nemo.logging.warning(f'{too_long_count} are longer than {max_seq_length}') + logging.warning(f'{too_long_count} are longer than {max_seq_length}') for i in range(min(len(all_input_ids), 5)): - nemo.logging.debug("*** Example ***") - nemo.logging.debug("i: %s", i) - nemo.logging.debug("subtokens: %s", " ".join(list(map(str, all_subtokens[i])))) - nemo.logging.debug("loss_mask: %s", " ".join(list(map(str, all_loss_mask[i])))) - nemo.logging.debug("input_mask: %s", " ".join(list(map(str, all_input_mask[i])))) - nemo.logging.debug( - "subtokens_mask: %s", " ".join(list(map(str, all_subtokens_mask[i]))), - ) + logging.debug("*** Example ***") + logging.debug("i: %s", i) + logging.debug("subtokens: %s", " ".join(list(map(str, all_subtokens[i])))) + logging.debug("loss_mask: %s", " ".join(list(map(str, all_loss_mask[i])))) + logging.debug("input_mask: %s", " ".join(list(map(str, all_input_mask[i])))) + logging.debug("subtokens_mask: %s", " ".join(list(map(str, all_subtokens_mask[i])))) if with_label: - nemo.logging.debug("labels: %s", " ".join(list(map(str, all_labels[i])))) - return ( - all_input_ids, - all_segment_ids, - all_input_mask, - all_loss_mask, - all_subtokens_mask, - all_labels, - ) + logging.debug("labels: %s", " ".join(list(map(str, all_labels[i])))) + return (all_input_ids, all_segment_ids, all_input_mask, all_loss_mask, all_subtokens_mask, all_labels) class BertTokenClassificationDataset(Dataset): @@ -224,10 +218,10 @@ def __init__( if use_cache and os.path.exists(features_pkl) and os.path.exists(label_ids_pkl): # If text_file was already processed, load from pickle features = pickle.load(open(features_pkl, 'rb')) - nemo.logging.info(f'features restored from {features_pkl}') + logging.info(f'features restored from {features_pkl}') label_ids = pickle.load(open(label_ids_pkl, 'rb')) - nemo.logging.info(f'Labels to ids dict restored from {label_ids_pkl}') + logging.info(f'Labels to ids dict restored from {label_ids_pkl}') else: if num_samples == 0: raise ValueError("num_samples has to be positive", num_samples) @@ -261,16 +255,16 @@ def __init__( # for dev/test sets use label mapping from training set if label_ids: if len(label_ids) != len(unique_labels): - nemo.logging.warning( + logging.warning( f'Not all labels from the specified' + ' label_ids dictionary are present in the' + ' current dataset. Using the provided' + ' label_ids dictionary.' ) else: - nemo.logging.info(f'Using the provided label_ids dictionary.') + logging.info(f'Using the provided label_ids dictionary.') else: - nemo.logging.info( + logging.info( f'Creating a new label to label_id dictionary.' 
+ ' It\'s recommended to use label_ids generated' + ' during training for dev/test sets to avoid' @@ -298,10 +292,10 @@ def __init__( if use_cache: pickle.dump(features, open(features_pkl, "wb")) - nemo.logging.info(f'features saved to {features_pkl}') + logging.info(f'features saved to {features_pkl}') pickle.dump(label_ids, open(label_ids_pkl, "wb")) - nemo.logging.info(f'labels to ids dict saved to {label_ids_pkl}') + logging.info(f'labels to ids dict saved to {label_ids_pkl}') self.all_input_ids = features[0] self.all_segment_ids = features[1] @@ -313,15 +307,15 @@ def __init__( infold = text_file[: text_file.rfind('/')] merged_labels = itertools.chain.from_iterable(self.all_labels) - nemo.logging.info('Three most popular labels') - _, self.label_frequencies = utils.get_label_stats(merged_labels, infold + '/label_stats.tsv') + logging.info('Three most popular labels') + _, self.label_frequencies = datasets_utils.get_label_stats(merged_labels, infold + '/label_stats.tsv') # save label_ids out = open(infold + '/label_ids.csv', 'w') labels, _ = zip(*sorted(self.label_ids.items(), key=lambda x: x[1])) out.write('\n'.join(labels)) - nemo.logging.info(f'Labels: {self.label_ids}') - nemo.logging.info(f'Labels mapping saved to : {out.name}') + logging.info(f'Labels: {self.label_ids}') + logging.info(f'Labels mapping saved to : {out.name}') def __len__(self): return len(self.all_input_ids) diff --git a/nemo/collections/nlp/data/datasets/utils.py b/nemo/collections/nlp/data/datasets/utils.py deleted file mode 100644 index 4ec542e50ddf..000000000000 --- a/nemo/collections/nlp/data/datasets/utils.py +++ /dev/null @@ -1,1681 +0,0 @@ -import csv -import glob -import itertools -import json -import os -import random -import re -import shutil -import subprocess -from collections import Counter - -import numpy as np -from sentencepiece import SentencePieceTrainer as SPT -from tqdm import tqdm - -import nemo -from ...utils.nlp_utils import get_vocab, label2idx, write_vocab, write_vocab_in_order - -DATABASE_EXISTS_TMP = '{} dataset has already been processed and stored at {}' -MODE_EXISTS_TMP = '{} mode of {} dataset has already been processed and stored at {}' - - -def get_stats(lengths): - lengths = np.asarray(lengths) - nemo.logging.info( - f'Min: {np.min(lengths)} | \ - Max: {np.max(lengths)} | \ - Mean: {np.mean(lengths)} | \ - Median: {np.median(lengths)}' - ) - nemo.logging.info(f'75 percentile: {np.percentile(lengths, 75)}') - nemo.logging.info(f'99 percentile: {np.percentile(lengths, 99)}') - - -def get_label_stats(labels, outfile='stats.tsv'): - labels = Counter(labels) - total = sum(labels.values()) - out = open(outfile, 'w') - i = 0 - label_frequencies = labels.most_common() - for k, v in label_frequencies: - out.write(f'{k}\t{v / total}\n') - if i < 3: - nemo.logging.info(f'{i} item: {k}, {v} out of {total}, {v / total}.') - i += 1 - return total, label_frequencies - - -def list2str(l): - return ' '.join([str(x) for x in l]) - - -def tensor2list(tensor): - return tensor.detach().cpu().tolist() - - -def if_exist(outfold, files): - if not os.path.exists(outfold): - return False - for file in files: - if not os.path.exists(f'{outfold}/{file}'): - return False - return True - - -def process_sst_2(data_dir): - if not os.path.exists(data_dir): - link = 'https://gluebenchmark.com/tasks' - raise ValueError(f'Data not found at {data_dir}. 
' f'Please download SST-2 from {link}.') - nemo.logging.info('Keep in mind that SST-2 is only available in lower case.') - return data_dir - - -def process_imdb(data_dir, uncased, modes=['train', 'test']): - if not os.path.exists(data_dir): - link = 'www.kaggle.com/iarunava/imdb-movie-reviews-dataset' - raise ValueError(f'Data not found at {data_dir}. ' f'Please download IMDB from {link}.') - - outfold = f'{data_dir}/nemo-processed' - - if uncased: - outfold = f'{outfold}_uncased' - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - nemo.logging.info(DATABASE_EXISTS_TMP.format('IMDB', outfold)) - return outfold - nemo.logging.info(f'Processing IMDB dataset and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - - for mode in modes: - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') - outfiles[mode].write('sentence\tlabel\n') - for sent in ['neg', 'pos']: - if sent == 'neg': - label = 0 - else: - label = 1 - files = glob.glob(f'{data_dir}/{mode}/{sent}/*.txt') - for file in files: - with open(file, 'r') as f: - review = f.read().strip() - if uncased: - review = review.lower() - review = review.replace("
", "") - outfiles[mode].write(f'{review}\t{label}\n') - for mode in modes: - outfiles[mode].close() - - return outfold - - -def process_thucnews(data_dir): - modes = ['train', 'test'] - train_size = 0.8 - if not os.path.exists(data_dir): - link = 'thuctc.thunlp.org/' - raise ValueError(f'Data not found at {data_dir}. ' f'Please download THUCNews from {link}.') - - outfold = f'{data_dir}/nemo-processed-thucnews' - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - nemo.logging.info(DATABASE_EXISTS_TMP.format('THUCNews', outfold)) - return outfold - nemo.logging.info(f'Processing THUCNews dataset and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - - for mode in modes: - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'a+', encoding='utf-8') - outfiles[mode].write('sentence\tlabel\n') - categories = [ - '体育', - '娱乐', - '家居', - '彩票', - '房产', - '教育', - '时尚', - '时政', - '星座', - '游戏', - '社会', - '科技', - '股票', - '财经', - ] - for category in categories: - label = categories.index(category) - category_files = glob.glob(f'{data_dir}/{category}/*.txt') - test_num = int(len(category_files) * (1 - train_size)) - test_files = category_files[:test_num] - train_files = category_files[test_num:] - for mode in modes: - nemo.logging.info(f'Processing {mode} data of the category {category}') - if mode == 'test': - files = test_files - else: - files = train_files - for file in tqdm(files): - with open(file, 'r', encoding='utf-8') as f: - news = f.read().strip().replace('\r', '') - news = news.replace('\n', '').replace('\t', ' ') - outfiles[mode].write(f'{news}\t{label}\n') - for mode in modes: - outfiles[mode].close() - - return outfold - - -def process_nlu(filename, uncased, modes=['train', 'test'], dataset_name='nlu-ubuntu'): - """ Dataset has to be of: - - ubuntu - - chat - - web - """ - - if not os.path.exists(filename): - link = 'https://github.com/sebischair/NLU-Evaluation-Corpora' - raise ValueError(f'Data not found at {filename}. 
' 'Please download IMDB from {link}.') - - if dataset_name == 'nlu-ubuntu': - INTENT = { - 'makeupdate': 1, - 'setupprinter': 2, - 'shutdowncomputer': 3, - 'softwarerecommendation': 4, - 'none': 0, - } - elif dataset_name == 'nlu-chat': - INTENT = {'departuretime': 0, 'findconnection': 1} - elif dataset_name == 'nlu-web': - INTENT = { - 'changepassword': 1, - 'deleteaccount': 2, - 'downloadvideo': 3, - 'exportdata': 4, - 'filterspam': 5, - 'findalternative': 6, - 'syncaccounts': 7, - 'none': 0, - } - else: - raise ValueError(f'{dataset_name}: Invalid dataset name') - - infold = filename[: filename.rfind('/')] - outfold = f'{infold}/{dataset_name}-nemo-processed' - - if uncased: - outfold = f'{outfold}_uncased' - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - nemo.logging.info(DATABASE_EXISTS_TMP.format(dataset_name.upper(), outfold)) - return outfold - nemo.logging.info(f'Processing data and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - - for mode in modes: - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') - outfiles[mode].write('sentence\tlabel\n') - - with open(filename, 'r') as f: - data = json.load(f) - - for obj in data['sentences']: - sentence = obj['text'].strip() - if uncased: - sentence = sentence.lower() - intent = obj['intent'].lower().replace(' ', '') - label = INTENT[intent] - txt = f'{sentence}\t{label}\n' - if obj['training']: - outfiles['train'].write(txt) - else: - outfiles['test'].write(txt) - for mode in modes: - outfiles[mode].close() - return outfold - - -def get_intent_labels(intent_file): - labels = {} - label = 0 - with open(intent_file, 'r') as f: - for line in f: - intent = line.strip() - labels[intent] = label - label += 1 - return labels - - -def process_twitter_airline(filename, uncased, modes=['train', 'test']): - """ Dataset from Kaggle: - https://www.kaggle.com/crowdflower/twitter-airline-sentiment - """ - pass - - -def ids2text(ids, vocab): - return ' '.join([vocab[int(id_)] for id_ in ids]) - - -def process_atis(infold, uncased, modes=['train', 'test'], dev_split=0): - """ MSFT's dataset, processed by Kaggle - https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk - """ - outfold = f'{infold}/nemo-processed' - vocab = get_vocab(f'{infold}/atis.dict.vocab.csv') - - if uncased: - outfold = f'{outfold}-uncased' - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - nemo.logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold)) - return outfold - nemo.logging.info(f'Processing ATIS dataset and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - - for mode in modes: - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') - outfiles[mode].write('sentence\tlabel\n') - outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w') - - queries = open(f'{infold}/atis.{mode}.query.csv', 'r').readlines() - intents = open(f'{infold}/atis.{mode}.intent.csv', 'r').readlines() - slots = open(f'{infold}/atis.{mode}.slots.csv', 'r').readlines() - - for i, query in enumerate(queries): - sentence = ids2text(query.strip().split()[1:-1], vocab) - outfiles[mode].write(f'{sentence}\t{intents[i].strip()}\n') - slot = ' '.join(slots[i].strip().split()[1:-1]) - outfiles[mode + '_slots'].write(slot + '\n') - - shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv') - shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv') - for mode in modes: - outfiles[mode].close() - - return outfold - - -def 
process_jarvis_datasets( - infold, uncased, dataset_name, modes=['train', 'test', 'eval'], ignore_prev_intent=False, -): - """ process and convert Jarvis datasets into NeMo's BIO format - """ - outfold = f'{infold}/{dataset_name}-nemo-processed' - infold = f'{infold}/' - - if uncased: - outfold = f'{outfold}-uncased' - - if if_exist(outfold, ['dict.intents.csv', 'dict.slots.csv']): - nemo.logging.info(DATABASE_EXISTS_TMP.format(dataset_name, outfold)) - return outfold - - nemo.logging.info(f'Processing {dataset_name} dataset and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - outfiles = {} - intents_list = {} - slots_list = {} - slots_list_all = {} - - outfiles['dict_intents'] = open(f'{outfold}/dict.intents.csv', 'w') - outfiles['dict_slots'] = open(f'{outfold}/dict.slots.csv', 'w') - - outfiles['dict_slots'].write('O\n') - slots_list["O"] = 0 - slots_list_all["O"] = 0 - - for mode in modes: - if if_exist(outfold, [f'{mode}.tsv']): - nemo.logging.info(MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode)) - continue - - if not if_exist(infold, [f'{mode}.tsv']): - nemo.logging.info(f'{mode} mode of {dataset_name}' f' is skipped as it was not found.') - continue - - outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w') - outfiles[mode].write('sentence\tlabel\n') - outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w') - - queries = open(f'{infold}/{mode}.tsv', 'r').readlines() - - for i, query in enumerate(queries): - line_splits = query.strip().split("\t") - if len(line_splits) == 3: - intent_str, slot_tags_str, sentence = line_splits - else: - intent_str, sentence = line_splits - slot_tags_str = "" - - if intent_str not in intents_list: - intents_list[intent_str] = len(intents_list) - outfiles['dict_intents'].write(f'{intent_str}\n') - - if ignore_prev_intent: - start_token = 2 - else: - start_token = 1 - sentence_cld = " ".join(sentence.strip().split()[start_token:-1]) - outfiles[mode].write(f'{sentence_cld}\t' f'{str(intents_list[intent_str])}\n') - - slot_tags_list = [] - if slot_tags_str.strip(): - slot_tags = slot_tags_str.strip().split(",") - for st in slot_tags: - if not st.strip(): - continue - [start_i, end_i, slot_name] = st.strip().split(":") - slot_tags_list.append([int(start_i), int(end_i), slot_name]) - if slot_name not in slots_list: - slots_list[slot_name] = len(slots_list) - slots_list_all[f'B-{slot_name}'] = len(slots_list_all) - slots_list_all[f'I-{slot_name}'] = len(slots_list_all) - outfiles['dict_slots'].write(f'B-{slot_name}\n') - outfiles['dict_slots'].write(f'I-{slot_name}\n') - - slot_tags_list.sort(key=lambda x: x[0]) - slots = [] - processed_index = 0 - for tag_start, tag_end, tag_str in slot_tags_list: - if tag_start > processed_index: - words_list = sentence[processed_index:tag_start].strip().split() - slots.extend([str(slots_list_all['O'])] * len(words_list)) - words_list = sentence[tag_start:tag_end].strip().split() - slots.append(str(slots_list_all[f'B-{tag_str}'])) - slots.extend([str(slots_list_all[f'I-{tag_str}'])] * (len(words_list) - 1)) - processed_index = tag_end - - if processed_index < len(sentence): - words_list = sentence[processed_index:].strip().split() - slots.extend([str(slots_list_all['O'])] * len(words_list)) - - slots = slots[1:-1] - slot = ' '.join(slots) - outfiles[mode + '_slots'].write(slot + '\n') - - outfiles[mode + '_slots'].close() - outfiles[mode].close() - - outfiles['dict_slots'].close() - outfiles['dict_intents'].close() - - return outfold - - -def reverse_dict(entity2value): - 
value2entity = {} - for entity in entity2value: - for value in entity2value[entity]: - value2entity[value] = entity - return value2entity - - -def map_entities(entity2value, entities): - for key in entities: - if 'data' in entities[key]: - if key not in entity2value: - entity2value[key] = set([]) - - values = [] - for value in entities[key]['data']: - values.append(value['value']) - values.extend(value['synonyms']) - entity2value[key] = entity2value[key] | set(values) - - return entity2value - - -def get_entities(files): - entity2value = {} - for file in files: - with open(file, 'r') as json_file: - data = json.load(json_file) - entity2value = map_entities(entity2value, data['entities']) - - value2entity = reverse_dict(entity2value) - return entity2value, value2entity - - -def get_data(files, entity2value, value2entity): - all_data, all_slots, all_intents = [], set(['O']), set() - for file in files: - file_data = [] - with open(file, 'r') as json_file: - data = json.load(json_file) - for intent in data['intents']: - all_intents.add(intent) - utterances = data['intents'][intent]['utterances'] - for utterance in utterances: - tokens, slots = [], [] - for frag in utterance['data']: - frag_tokens = frag['text'].strip().split() - tokens.extend(frag_tokens) - if 'slot_name' not in frag: - slot = 'O' - else: - slot = frag['slot_name'] - all_slots.add(slot) - slots.extend([slot] * len(frag_tokens)) - file_data.append((tokens, slots, intent)) - all_data.append(file_data) - return all_data, all_slots, all_intents - - -def get_dataset(files, dev_split=0.1): - entity2value, value2entity = get_entities(files) - data, slots, intents = get_data(files, entity2value, value2entity) - if len(data) == 1: - train, dev = partition(data[0], split=dev_split) - else: - train, dev = data[0], data[1] - return train, dev, slots, intents - - -def partition(data, split=0.1): - n = len(data) - n_dev = int(n * split) - dev_idx = set(random.sample(range(n), n_dev)) - dev, train = [], [] - - for i, item in enumerate(data): - if i in dev_idx: - dev.append(item) - else: - train.append(item) - return train, dev - - -def write_data(data, slot_dict, intent_dict, outfold, mode, uncased): - intent_file = open(f'{outfold}/{mode}.tsv', 'w') - intent_file.write('sentence\tlabel\n') - slot_file = open(f'{outfold}/{mode}_slots.tsv', 'w') - for tokens, slots, intent in data: - text = ' '.join(tokens) - if uncased: - text = text.lower() - intent_file.write(f'{text}\t{intent_dict[intent]}\n') - slots = [str(slot_dict[slot]) for slot in slots] - slot_file.write(' '.join(slots) + '\n') - intent_file.close() - slot_file.close() - - -def create_dataset(train, dev, slots, intents, uncased, outfold): - os.makedirs(outfold, exist_ok=True) - if 'O' in slots: - slots.remove('O') - slots = sorted(list(slots)) + ['O'] - intents = sorted(list(intents)) - slots = write_vocab(slots, f'{outfold}/dict.slots.csv') - intents = write_vocab(intents, f'{outfold}/dict.intents.csv') - write_data(train, slots, intents, outfold, 'train', uncased) - write_data(dev, slots, intents, outfold, 'test', uncased) - - -def process_snips(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): - if not os.path.exists(data_dir): - link = 'www.github.com/snipsco/spoken-language' - '-understanding-research-datasets' - raise ValueError(f'Data not found at {data_dir}. 
' 'Resquest to download the SNIPS dataset from {link}.') - - outfold = f'{data_dir}/nemo-processed' - - if uncased: - outfold = f'{outfold}-uncased' - - exist = True - for dataset in ['light', 'speak', 'all']: - if if_exist(f'{outfold}/{dataset}', [f'{mode}.tsv' for mode in modes]): - nemo.logging.info(DATABASE_EXISTS_TMP.format('SNIPS-' + dataset.upper(), outfold)) - else: - exist = False - if exist: - return outfold - - nemo.logging.info(f'Processing SNIPS dataset and store at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - speak_dir = 'smart-speaker-en-close-field' - light_dir = 'smart-lights-en-close-field' - - light_files = [f'{data_dir}/{light_dir}/dataset.json'] - speak_files = [f'{data_dir}/{speak_dir}/training_dataset.json'] - speak_files.append(f'{data_dir}/{speak_dir}/test_dataset.json') - - light_train, light_dev, light_slots, light_intents = get_dataset(light_files, dev_split) - speak_train, speak_dev, speak_slots, speak_intents = get_dataset(speak_files) - - create_dataset( - light_train, light_dev, light_slots, light_intents, uncased, f'{outfold}/light', - ) - create_dataset( - speak_train, speak_dev, speak_slots, speak_intents, uncased, f'{outfold}/speak', - ) - create_dataset( - light_train + speak_train, - light_dev + speak_dev, - light_slots | speak_slots, - light_intents | speak_intents, - uncased, - f'{outfold}/all', - ) - - return outfold - - -# def list2str(nums): -# return ' '.join([str(num) for num in nums]) - - -def merge(data_dir, subdirs, dataset_name, modes=['train', 'test']): - outfold = f'{data_dir}/{dataset_name}' - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - nemo.logging.info(DATABASE_EXISTS_TMP.format('SNIPS-ATIS', outfold)) - slots = get_vocab(f'{outfold}/dict.slots.csv') - none_slot = 0 - for key in slots: - if slots[key] == 'O': - none_slot = key - break - return outfold, int(none_slot) - - os.makedirs(outfold, exist_ok=True) - - data_files, slot_files = {}, {} - for mode in modes: - data_files[mode] = open(f'{outfold}/{mode}.tsv', 'w') - data_files[mode].write('sentence\tlabel\n') - slot_files[mode] = open(f'{outfold}/{mode}_slots.tsv', 'w') - - intents, slots = {}, {} - intent_shift, slot_shift = 0, 0 - none_intent, none_slot = -1, -1 - - for subdir in subdirs: - curr_intents = get_vocab(f'{data_dir}/{subdir}/dict.intents.csv') - curr_slots = get_vocab(f'{data_dir}/{subdir}/dict.slots.csv') - - for key in curr_intents: - if intent_shift > 0 and curr_intents[key] == 'O': - continue - if curr_intents[key] == 'O' and intent_shift == 0: - none_intent = int(key) - intents[int(key) + intent_shift] = curr_intents[key] - - for key in curr_slots: - if slot_shift > 0 and curr_slots[key] == 'O': - continue - if slot_shift == 0 and curr_slots[key] == 'O': - none_slot = int(key) - slots[int(key) + slot_shift] = curr_slots[key] - - for mode in modes: - with open(f'{data_dir}/{subdir}/{mode}.tsv', 'r') as f: - for line in f.readlines()[1:]: - text, label = line.strip().split('\t') - label = int(label) - if curr_intents[label] == 'O': - label = none_intent - else: - label = label + intent_shift - data_files[mode].write(f'{text}\t{label}\n') - - with open(f'{data_dir}/{subdir}/{mode}_slots.tsv', 'r') as f: - for line in f.readlines(): - labels = [int(label) for label in line.strip().split()] - shifted_labels = [] - for label in labels: - if curr_slots[label] == 'O': - shifted_labels.append(none_slot) - else: - shifted_labels.append(label + slot_shift) - slot_files[mode].write(list2str(shifted_labels) + '\n') - - intent_shift += len(curr_intents) 
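# (Illustrative note, not a line of the deleted file.) The shift logic in merge()
# can be read as: when a second label vocabulary is appended, its indices are
# offset by the size of the first vocabulary, and every 'O' label collapses onto
# the single none_intent / none_slot index taken from the first vocabulary.
# Hypothetical example:
#   first corpus intents  {0: 'flight', 1: 'O'}   -> intent_shift becomes 2
#   second corpus intents {0: 'weather', 1: 'O'}  -> 'weather' is remapped to
#   index 0 + 2 = 2, while its 'O' keeps the first corpus's index 1.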
- slot_shift += len(curr_slots) - - write_vocab_in_order(intents, f'{outfold}/dict.intents.csv') - write_vocab_in_order(slots, f'{outfold}/dict.slots.csv') - return outfold, none_slot - - -def get_intent_query_files_dialogflow(path): - fileslist = [] - for root, _, files in os.walk(path): - for file in files: - if '_usersays_en.json' in file: - fileslist.append(os.path.join(root, file)) - return fileslist - - -def get_intents_slots_dialogflow(files, slot_labels): - intent_names = [] - intent_queries = [] - slot_tags = [] - - for index, file in enumerate(files): - intent_names.append(os.path.basename(file).split('_usersays')[0]) - - with open(file) as json_file: - intent_data = json.load(json_file) - for query in intent_data: - query_text = "" - slots = "" - for segment in query['data']: - query_text = ''.join([query_text, segment['text']]) - if 'alias' in segment: - for _ in segment['text'].split(): - slots = ' '.join([slots, slot_labels.get(segment['alias'])]) - else: - for _ in segment['text'].split(): - slots = ' '.join([slots, slot_labels.get('O')]) - query_text = f'{query_text.strip()}\t{index}\n' - intent_queries.append(query_text) - slots = f'{slots.strip()}\n' - slot_tags.append(slots) - return intent_queries, intent_names, slot_tags - - -def get_slots_dialogflow(files): - slot_labels = {} - count = 0 - for file in files: - intent_head_file = ''.join([file.split('_usersays')[0], '.json']) - with open(intent_head_file) as json_file: - intent_meta_data = json.load(json_file) - for params in intent_meta_data['responses'][0]['parameters']: - if params['name'] not in slot_labels: - slot_labels[params['name']] = str(count) - count += 1 - slot_labels['O'] = str(count) - return slot_labels - - -# The following works for the specified DialogFlow and Mturk output format -def partition_data(intent_queries, slot_tags, split=0.1): - n = len(intent_queries) - n_dev = int(n * split) - dev_idx = set(random.sample(range(n), n_dev)) - dev_intents, dev_slots, train_intents, train_slots = [], [], [], [] - - dev_intents.append('sentence\tlabel\n') - train_intents.append('sentence\tlabel\n') - - for i, item in enumerate(intent_queries): - if i in dev_idx: - dev_intents.append(item) - dev_slots.append(slot_tags[i]) - else: - train_intents.append(item) - train_slots.append(slot_tags[i]) - return train_intents, train_slots, dev_intents, dev_slots - - -# The following works for the specified DialogFlow and Mturk output format -def write_files(data, outfile): - with open(outfile, 'w') as f: - for item in data: - item = f'{item.strip()}\n' - f.write(item) - - -def process_dialogflow(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): - if not os.path.exists(data_dir): - link = 'www.dialogflow.com' - raise ValueError( - f'Data not found at {data_dir}. ' 'Export your dialogflow data from' '{link} and unzip at {data_dir}.' - ) - - outfold = f'{data_dir}/dialogflow/nemo-processed' - - '''TO DO - check for nemo-processed directory - already exists. If exists, skip the entire creation steps below. 
''' - - os.makedirs(outfold, exist_ok=True) - - files = get_intent_query_files_dialogflow(data_dir) - - slot_labels = get_slots_dialogflow(files) - - intent_queries, intent_names, slot_tags = get_intents_slots_dialogflow(files, slot_labels) - - train_queries, train_slots, test_queries, test_slots = partition_data(intent_queries, slot_tags, split=dev_split) - - write_files(train_queries, f'{outfold}/train.tsv') - write_files(train_slots, f'{outfold}/train_slots.tsv') - - write_files(test_queries, f'{outfold}/test.tsv') - write_files(test_slots, f'{outfold}/test_slots.tsv') - - write_files(slot_labels, f'{outfold}/dict.slots.csv') - write_files(intent_names, f'{outfold}/dict.intents.csv') - - return outfold - - -def read_csv(file_path): - rows = [] - with open(file_path, 'r') as csvfile: - read_csv = csv.reader(csvfile, delimiter=',') - for row in read_csv: - rows.append(row) - return rows - - -def get_intents_mturk(utterances, outfold): - intent_names = {} - intent_count = 0 - - agreed_all = {} - - print('Printing all intent_labels') - intent_dict = f'{outfold}/dict.intents.csv' - if os.path.exists(intent_dict): - with open(intent_dict, 'r') as f: - for intent_name in f.readlines(): - intent_names[intent_name.strip()] = intent_count - intent_count += 1 - print(intent_names) - - for i, utterance in enumerate(utterances[1:]): - - if utterance[1] not in agreed_all: - agreed_all[utterance[0]] = utterance[1] - - if utterance[1] not in intent_names: - intent_names[utterance[1]] = intent_count - intent_count += 1 - - print(f'Total number of utterance samples: {len(agreed_all)}') - - return agreed_all, intent_names - - -def get_slot_labels(slot_annotations, task_name): - slot_labels = json.loads(slot_annotations[0]) - - all_labels = {} - count = 0 - # Generating labels with the IOB format. - for label in slot_labels[task_name]['annotations']['labels']: - b_slot = 'B-' + label['label'] - i_slot = 'I-' + label['label'] - all_labels[b_slot] = str(count) - count += 1 - all_labels[i_slot] = str(count) - count += 1 - all_labels['O'] = str(count) - - return all_labels - - -def process_intent_slot_mturk(slot_annotations, agreed_all, intent_names, task_name): - slot_tags = [] - inorder_utterances = [] - all_labels = get_slot_labels(slot_annotations, task_name) - print(f'agreed_all - {len(agreed_all)}') - print(f'Slot annotations - {len(slot_annotations)}') - - for annotation in slot_annotations[0:]: - an = json.loads(annotation) - utterance = an['source'] - if len(utterance) > 2 and utterance.startswith('"') and utterance.endswith('"'): - utterance = utterance[1:-1] - - if utterance in agreed_all: - entities = {} - annotated_entities = an[task_name]['annotations']['entities'] - for i, each_anno in enumerate(annotated_entities): - entities[int(each_anno['startOffset'])] = i - - lastptr = 0 - slotlist = [] - # sorting annotations by the start offset - for i in sorted(entities.keys()): - annotated_entities = an[task_name]['annotations']['entities'] - tags = annotated_entities[entities.get(i)] - untagged_words = utterance[lastptr : tags['startOffset']] - for _ in untagged_words.split(): - slotlist.append(all_labels.get('O')) - anno_words = utterance[tags['startOffset'] : tags['endOffset']] - # tagging with the IOB format. 
- for j, _ in enumerate(anno_words.split()): - if j == 0: - b_slot = 'B-' + tags['label'] - slotlist.append(all_labels.get(b_slot)) - else: - i_slot = 'I-' + tags['label'] - slotlist.append(all_labels.get(i_slot)) - lastptr = tags['endOffset'] - - untagged_words = utterance[lastptr : len(utterance)] - for _ in untagged_words.split(): - slotlist.append(all_labels.get('O')) - - slotstr = ' '.join(slotlist) - slotstr = f'{slotstr.strip()}\n' - - slot_tags.append(slotstr) - intent_num = intent_names.get(agreed_all.get(utterance)) - query_text = f'{utterance.strip()}\t{intent_num}\n' - inorder_utterances.append(query_text) - # else: - # print(utterance) - - print(f'inorder utterances - {len(inorder_utterances)}') - - return all_labels, inorder_utterances, slot_tags - - -def process_mturk(data_dir, uncased, modes=['train', 'test'], dev_split=0.1): - if not os.path.exists(data_dir): - link = 'www.mturk.com' - raise ValueError( - f'Data not found at {data_dir}. ' 'Export your mturk data from' '{link} and unzip at {data_dir}.' - ) - - outfold = f'{data_dir}/nemo-processed' - - if if_exist(outfold, [f'{mode}.tsv' for mode in modes]): - nemo.logging.info(DATABASE_EXISTS_TMP.format('mturk', outfold)) - return outfold - - nemo.logging.info(f'Processing dataset from mturk and storing at {outfold}') - - os.makedirs(outfold, exist_ok=True) - - classification_data_file = f'{data_dir}/classification.csv' - annotation_data_file = f'{data_dir}/annotation.manifest' - - if not os.path.exists(classification_data_file): - raise FileNotFoundError(f'File not found ' f'at {classification_data_file}') - - if not os.path.exists(annotation_data_file): - raise FileNotFoundError(f'File not found at {annotation_data_file}') - - utterances = [] - utterances = read_csv(classification_data_file) - - # This function assumes that the intent classification data has been - # reviewed and cleaned and only one label per utterance is present. - agreed_all, intent_names = get_intents_mturk(utterances, outfold) - - with open(annotation_data_file, 'r') as f: - slot_annotations = f.readlines() - - # This function assumes that the preprocess step would have made - # the task_name of all the annotations generic - task_name = 'retail-combined' - - # It is assumed that every utterances will have corresponding - # slot annotation information - if len(slot_annotations) < len(agreed_all): - raise ValueError(f'Every utterance must have corresponding' f'slot annotation information') - - slot_labels, intent_queries, slot_tags = process_intent_slot_mturk( - slot_annotations, agreed_all, intent_names, task_name - ) - - assert len(slot_tags) == len(intent_queries) - - dev_split = 0.1 - - train_queries, train_slots, test_queries, test_slots = partition_data(intent_queries, slot_tags, split=dev_split) - - write_files(train_queries, f'{outfold}/train.tsv') - write_files(train_slots, f'{outfold}/train_slots.tsv') - - write_files(test_queries, f'{outfold}/test.tsv') - write_files(test_slots, f'{outfold}/test_slots.tsv') - - write_files(slot_labels, f'{outfold}/dict.slots.csv') - write_files(intent_names, f'{outfold}/dict.intents.csv') - - return outfold - - -# The following works for the DialogFlow and Mturk output format -# def write_files(data, outfile): -# with open(f'{outfile}', 'w') as f: -# for item in data: -# item = f'{item.strip()}\n' -# f.write(item) - - -def calc_class_weights(label_freq): - """ - Goal is to give more weight to the classes with less samples - so as to match the one with the higest frequency. 
We achieve this by - dividing the highest frequency by the freq of each label. - Example - - [12, 5, 3] -> [12/12, 12/5, 12/3] -> [1, 2.4, 4] - - Here label_freq is assumed to be sorted by the frequency. I.e. - label_freq[0] is the most frequent element. - - """ - - most_common_label_freq = label_freq[0] - weighted_slots = sorted([(index, most_common_label_freq[1] / freq) for (index, freq) in label_freq]) - return [weight for (_, weight) in weighted_slots] - - -class JointIntentSlotDataDesc: - """ Convert the raw data to the standard format supported by - JointIntentSlotDataset. - - By default, the None label for slots is 'O'. - - JointIntentSlotDataset requires two files: - - input_file: file to sequence + label. - the first line is header (sentence [tab] label) - each line should be [sentence][tab][label] - - slot_file: file to slot labels, each line corresponding to - slot labels for a sentence in input_file. No header. - - To keep the mapping from label index to label consistent during - training and inferencing, we require the following files: - dicts.intents.csv: each line is an intent. The first line - corresponding to the 0 intent label, the second line - corresponding to the 1 intent label, and so on. - - dicts.slots.csv: each line is a slot. The first line - corresponding to the 0 slot label, the second line - corresponding to the 1 slot label, and so on. - - Args: - data_dir (str): the directory of the dataset - do_lower_case (bool): whether to set your dataset to lowercase - dataset_name (str): the name of the dataset. If it's a dataset - that follows the standard JointIntentSlotDataset format, - you can set the name as 'default'. - none_slot_label (str): the label for slots that aren't indentified - defaulted to 'O' - pad_label (int): the int used for padding. If set to -1, - it'll be set to the whatever the None label is. - - """ - - def __init__( - self, data_dir, do_lower_case=False, dataset_name='default', none_slot_label='O', pad_label=-1, - ): - if dataset_name == 'atis': - self.data_dir = process_atis(data_dir, do_lower_case) - elif dataset_name == 'snips-atis': - self.data_dir, self.pad_label = merge( - data_dir, ['ATIS/nemo-processed-uncased', 'snips/nemo-processed-uncased/all',], dataset_name, - ) - elif dataset_name == 'dialogflow': - self.data_dir = process_dialogflow(data_dir, do_lower_case) - elif dataset_name == 'mturk-processed': - self.data_dir = process_mturk(data_dir, do_lower_case) - elif dataset_name in set(['snips-light', 'snips-speak', 'snips-all']): - self.data_dir = process_snips(data_dir, do_lower_case) - if dataset_name.endswith('light'): - self.data_dir = f'{self.data_dir}/light' - elif dataset_name.endswith('speak'): - self.data_dir = f'{self.data_dir}/speak' - elif dataset_name.endswith('all'): - self.data_dir = f'{self.data_dir}/all' - elif dataset_name.startswith('jarvis'): - self.data_dir = process_jarvis_datasets( - data_dir, do_lower_case, dataset_name, modes=["train", "test", "eval"], ignore_prev_intent=False, - ) - else: - if not if_exist(data_dir, ['dict.intents.csv', 'dict.slots.csv']): - raise FileNotFoundError( - "Make sure that your data follows the standard format " - "supported by JointIntentSlotDataset. Your data must " - "contain dict.intents.csv and dict.slots.csv." 
- ) - self.data_dir = data_dir - - self.intent_dict_file = self.data_dir + '/dict.intents.csv' - self.slot_dict_file = self.data_dir + '/dict.slots.csv' - self.num_intents = len(get_vocab(self.intent_dict_file)) - slots = label2idx(self.slot_dict_file) - self.num_slots = len(slots) - - for mode in ['train', 'test', 'eval']: - - if not if_exist(self.data_dir, [f'{mode}.tsv']): - nemo.logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.') - continue - - slot_file = f'{self.data_dir}/{mode}_slots.tsv' - with open(slot_file, 'r') as f: - slot_lines = f.readlines() - - input_file = f'{self.data_dir}/{mode}.tsv' - with open(input_file, 'r') as f: - input_lines = f.readlines()[1:] # Skipping headers at index 0 - - if len(slot_lines) != len(input_lines): - raise ValueError( - "Make sure that the number of slot lines match the " - "number of intent lines. There should be a 1-1 " - "correspondence between every slot and intent lines." - ) - - dataset = list(zip(slot_lines, input_lines)) - - raw_slots, queries, raw_intents = [], [], [] - for slot_line, input_line in dataset: - slot_list = [int(slot) for slot in slot_line.strip().split()] - raw_slots.append(slot_list) - parts = input_line.strip().split() - raw_intents.append(int(parts[-1])) - queries.append(' '.join(parts[:-1])) - - infold = input_file[: input_file.rfind('/')] - - nemo.logging.info(f'Three most popular intents during {mode}ing') - total_intents, intent_label_freq = get_label_stats(raw_intents, infold + f'/{mode}_intent_stats.tsv') - merged_slots = itertools.chain.from_iterable(raw_slots) - - nemo.logging.info(f'Three most popular slots during {mode}ing') - slots_total, slots_label_freq = get_label_stats(merged_slots, infold + f'/{mode}_slot_stats.tsv') - - if mode == 'train': - self.slot_weights = calc_class_weights(slots_label_freq) - nemo.logging.info(f'Slot weights are - {self.slot_weights}') - - self.intent_weights = calc_class_weights(intent_label_freq) - nemo.logging.info(f'Intent weights are - {self.intent_weights}') - - nemo.logging.info(f'Total intents - {total_intents}') - nemo.logging.info(f'Intent label frequency - {intent_label_freq}') - nemo.logging.info(f'Total Slots - {slots_total}') - nemo.logging.info(f'Slots label frequency - {slots_label_freq}') - - if pad_label != -1: - self.pad_label = pad_label - else: - if none_slot_label not in slots: - raise ValueError(f'none_slot_label {none_slot_label} not ' f'found in {self.slot_dict_file}.') - self.pad_label = slots[none_slot_label] - - -class SentenceClassificationDataDesc: - def __init__(self, dataset_name, data_dir, do_lower_case): - if dataset_name == 'sst-2': - self.data_dir = process_sst_2(data_dir) - self.num_labels = 2 - self.eval_file = self.data_dir + '/dev.tsv' - elif dataset_name == 'imdb': - self.num_labels = 2 - self.data_dir = process_imdb(data_dir, do_lower_case) - self.eval_file = self.data_dir + '/test.tsv' - elif dataset_name == 'thucnews': - self.num_labels = 14 - self.data_dir = process_thucnews(data_dir) - self.eval_file = self.data_dir + '/test.tsv' - elif dataset_name.startswith('nlu-'): - if dataset_name.endswith('chat'): - self.data_dir = f'{data_dir}/ChatbotCorpus.json' - self.num_labels = 2 - elif dataset_name.endswith('ubuntu'): - self.data_dir = f'{data_dir}/AskUbuntuCorpus.json' - self.num_labels = 5 - elif dataset_name.endswith('web'): - data_dir = f'{data_dir}/WebApplicationsCorpus.json' - self.num_labels = 8 - self.data_dir = process_nlu(data_dir, do_lower_case, dataset_name=dataset_name) - 
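# An illustrative toy sample (assumed data, not from the patch) of the on-disk format
# JointIntentSlotDataDesc consumes: {mode}.tsv holds "<query>\t<intent_id>" rows under a
# header, and {mode}_slots.tsv holds one space-separated slot-id sequence per query, in
# strict one-to-one correspondence.
input_lines = ["sentence\tlabel\n", "book a flight to boston\t0\n"]
slot_lines = ["1 1 1 1 2\n"]
parts = input_lines[1].strip().split()
intent_id, query = int(parts[-1]), ' '.join(parts[:-1])
slot_ids = [int(s) for s in slot_lines[0].strip().split()]
assert len(slot_ids) == len(query.split())                 # one slot id per query token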
self.eval_file = self.data_dir + '/test.tsv' - elif dataset_name.startswith('jarvis'): - self.data_dir = process_jarvis_datasets( - data_dir, do_lower_case, dataset_name, modes=['train', 'test', 'eval'], ignore_prev_intent=False, - ) - - intents = get_intent_labels(f'{self.data_dir}/dict.intents.csv') - self.num_labels = len(intents) - else: - raise ValueError( - "Looks like you passed a dataset name that isn't " - "already supported by NeMo. Please make sure " - "that you build the preprocessing method for it." - ) - - self.train_file = self.data_dir + '/train.tsv' - - for mode in ['train', 'test', 'eval']: - - if not if_exist(self.data_dir, [f'{mode}.tsv']): - nemo.logging.info(f' Stats calculation for {mode} mode' f' is skipped as {mode}.tsv was not found.') - continue - - input_file = f'{self.data_dir}/{mode}.tsv' - with open(input_file, 'r') as f: - input_lines = f.readlines()[1:] # Skipping headers at index 0 - - queries, raw_sentences = [], [] - for input_line in input_lines: - parts = input_line.strip().split() - raw_sentences.append(int(parts[-1])) - queries.append(' '.join(parts[:-1])) - - infold = input_file[: input_file.rfind('/')] - - nemo.logging.info(f'Three most popular classes during {mode}ing') - total_sents, sent_label_freq = get_label_stats(raw_sentences, infold + f'/{mode}_sentence_stats.tsv') - - if mode == 'train': - self.class_weights = calc_class_weights(sent_label_freq) - nemo.logging.info(f'Class weights are - {self.class_weights}') - - nemo.logging.info(f'Total Sentences - {total_sents}') - nemo.logging.info(f'Sentence class frequencies - {sent_label_freq}') - - -def create_vocab_lm(data_dir, do_lower_case): - if if_exist(data_dir, ['train.txt', 'vocab.txt']): - nemo.logging.info("Vocabulary has been created.") - with open(os.path.join(data_dir, 'vocab.txt'), 'r') as f: - vocab_size = len(f.readlines()) - return vocab_size - - nemo.logging.info(f'Creating vocabulary from training data at {data_dir}') - - with open(f'{data_dir}/train.txt', 'r') as f: - txt = f.read() - if do_lower_case: - txt = txt.lower() - lines = re.split(r'[\n]', txt) - sentences = [line.strip().split() for line in lines if line.strip()] - - vocab = {"[PAD]": 0, "[SEP]": 1, "[CLS]": 2, "[MASK]": 3} - idx = 4 - for sentence in sentences: - for word in sentence: - if word not in vocab: - vocab[word] = idx - idx += 1 - - with open(f'{data_dir}/vocab.txt', 'w') as f: - for word in sorted(vocab.keys()): - f.write(word + '\n') - nemo.logging.info(f"Created vocabulary of size {len(vocab)}") - - return len(vocab) - - -def download_wkt2(data_dir): - os.makedirs('data/lm', exist_ok=True) - nemo.logging.warning(f'Data not found at {data_dir}. ' f'Downloading wikitext-2 to data/lm') - data_dir = 'data/lm/wikitext-2' - subprocess.call('scripts/get_wkt2.sh') - return data_dir - - -class LanguageModelDataDesc: - def __init__(self, dataset_name, data_dir, do_lower_case): - if dataset_name == 'wikitext-2': - if not os.path.exists(data_dir): - data_dir = download_wkt2(data_dir) - self.vocab_size = create_vocab_lm(data_dir, do_lower_case) - self.data_dir = data_dir - else: - nemo.logging.warning( - "Looks like you passed a dataset name that isn't " - "already supported by NeMo. Please make sure that " - "you build the preprocessing method for it." 
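# A short illustrative sketch of the vocabulary construction create_vocab_lm() performs
# above: the BERT control symbols take the first ids, then each previously unseen
# whitespace token from the training text is appended in order of appearance.
sentences = ["hello world", "hello nemo"]
vocab = {"[PAD]": 0, "[SEP]": 1, "[CLS]": 2, "[MASK]": 3}
for sentence in sentences:
    for word in sentence.split():
        vocab.setdefault(word, len(vocab))
print(vocab)   # {'[PAD]': 0, '[SEP]': 1, '[CLS]': 2, '[MASK]': 3, 'hello': 4, 'world': 5, 'nemo': 6}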
- ) - - -def create_vocab_mlm( - data_dir, vocab_size, sample_size, special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'], train_file='', -): - vocab = special_tokens[:] - bert_dir = f'{data_dir}/bert' - if if_exist(bert_dir, ['tokenizer.model']): - nemo.logging.info(DATABASE_EXISTS_TMP.format('WikiText_BERT', bert_dir)) - return data_dir, f'{bert_dir}/tokenizer.model' - nemo.logging.info(f'Processing WikiText dataset and store at {bert_dir}') - os.makedirs(bert_dir, exist_ok=True) - - if not train_file: - files = glob.glob(f'{data_dir}/*.txt') - train_file = f'{bert_dir}/merged.txt' - nemo.logging.info(f"Merging {len(files)} txt files into {train_file}") - - with open(train_file, "w") as merged: - for file in tqdm(files): - with open(file, 'r') as inf: - content = inf.read().strip() - merged.write(content + '\n\n\n') - else: - train_file = f'{data_dir}/{train_file}' - - cmd = ( - f"--input={train_file} --model_prefix={bert_dir}/tokenizer " - f"--vocab_size={vocab_size - len(vocab)} " - f"--input_sentence_size={sample_size} " - f"--shuffle_input_sentence=true --hard_vocab_limit=false " - f"--bos_id=-1 --eos_id=-1" - ) - SPT.Train(cmd) - - # Add BERT control symbols - tokens = [] - - with open(f"{bert_dir}/tokenizer.vocab", "r") as f: - f.readline() # skip first token - - # Read tokens from each line and parse for vocab - for line in f: - piece = line.split("\t")[0] - token = piece[1:] if piece.startswith("▁") else f"##{piece}" - tokens.append(token) - - vocab.extend(tokens) - - # Save vocabulary to output file - with open(f'{bert_dir}/vocab.txt', "w") as f: - for token in vocab: - f.write(f"{token}\n".format()) - return data_dir, f'{bert_dir}/tokenizer.model' - - -class BERTPretrainingDataDesc: - def __init__( - self, dataset_name, data_dir, vocab_size, sample_size, special_tokens, train_file='', - ): - if dataset_name == 'wikitext-2': - if not os.path.exists(data_dir): - data_dir = download_wkt2(data_dir) - self.data_dir, self.tokenizer_model = create_vocab_mlm( - data_dir, vocab_size, sample_size, special_tokens, train_file - ) - else: - nemo.logging.warning( - "Looks like you passed a dataset name that isn't " - "already supported by NeMo. Please make sure that " - "you build the preprocessing method for it." - ) - - self.train_file = f'{data_dir}/train.txt' - self.eval_file = f'{data_dir}/valid.txt' - self.test_file = f'{data_dir}/test.txt' - - -""" -Utility functions for GLUE tasks -This code was adapted from the HuggingFace library at -https://github.com/huggingface/transformers -""" - - -class InputExample(object): - """A single training/test example for simple sequence classification.""" - - def __init__(self, guid, text_a, text_b=None, label=None): - """Constructs a InputExample. - - Args: - guid: Unique id for the example. - text_a: string. The untokenized text of the first sequence. - For single sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second - sequence. Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. 
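# An illustrative sketch of the piece-to-token conversion create_vocab_mlm() applies above
# when turning a trained SentencePiece vocab into a BERT-style vocab.txt: a leading "▁"
# marks a word start and is stripped, any other piece becomes a "##" continuation token.
pieces = ["▁hello", "▁wor", "ld"]
tokens = [piece[1:] if piece.startswith("▁") else f"##{piece}" for piece in pieces]
print(tokens)   # ['hello', 'wor', '##ld']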
- """ - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - - -class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" - - def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with open(input_file, "r", encoding="utf-8-sig") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - # if sys.version_info[0] == 2: - # line = list(unicode(cell, 'utf-8') for cell in line) - lines.append(line) - return lines - - -class MrpcProcessor(DataProcessor): - """Processor for the MRPC data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - nemo.logging.info(f'LOOKING AT {os.path.join(data_dir, "train.tsv")}') - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[3] - text_b = line[4] - label = line[0] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MnliProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), "dev_matched",) - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[8] - text_b = line[9] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MnliMismatchedProcessor(MnliProcessor): - """Processor for the MultiNLI Mismatched data set (GLUE version).""" - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")), "dev_matched",) - - -class ColaProcessor(DataProcessor): - """Processor for the CoLA data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def 
get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - guid = "%s-%s" % (set_type, i) - text_a = line[3] - label = line[1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class Sst2Processor(DataProcessor): - """Processor for the SST-2 data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = line[0] - label = line[1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -class StsbProcessor(DataProcessor): - """Processor for the STS-B data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return [None] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[7] - text_b = line[8] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class QqpProcessor(DataProcessor): - """Processor for the QQP data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - try: - text_a = line[3] - text_b = line[4] - label = line[5] - except IndexError: - continue - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class QnliProcessor(DataProcessor): - """Processor for the QNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev_matched") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): 
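# A hypothetical usage sketch of the GLUE processors above (the path is a placeholder):
# each processor turns raw TSV rows into InputExample objects carrying guid, text_a,
# text_b and label.
processor = Sst2Processor()
dev_examples = processor.get_dev_examples("/path/to/SST-2")   # expects a dev.tsv inside
first = dev_examples[0]
print(first.guid, first.label, first.text_a)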
- """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class RteProcessor(DataProcessor): - """Processor for the RTE data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["entailment", "not_entailment"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class WnliProcessor(DataProcessor): - """Processor for the WNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples(self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, line[0]) - text_a = line[1] - text_b = line[2] - label = line[-1] - examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -processors = { - "cola": ColaProcessor, - "mnli": MnliProcessor, - "mnli-mm": MnliMismatchedProcessor, - "mrpc": MrpcProcessor, - "sst-2": Sst2Processor, - "sts-b": StsbProcessor, - "qqp": QqpProcessor, - "qnli": QnliProcessor, - "rte": RteProcessor, - "wnli": WnliProcessor, -} - -output_modes = { - "cola": "classification", - "mnli": "classification", - "mnli-mm": "classification", - "mrpc": "classification", - "sst-2": "classification", - "sts-b": "regression", - "qqp": "classification", - "qnli": "classification", - "rte": "classification", - "wnli": "classification", -} - -GLUE_TASKS_NUM_LABELS = { - "cola": 2, - "mnli": 3, - "mrpc": 2, - "sst-2": 2, - "sts-b": 1, - "qqp": 2, - "qnli": 2, - "rte": 2, - "wnli": 2, -} diff --git a/nemo/collections/nlp/data/tokenizers/__init__.py b/nemo/collections/nlp/data/tokenizers/__init__.py index ba9baba6c89c..4affa23c5655 100644 --- a/nemo/collections/nlp/data/tokenizers/__init__.py +++ b/nemo/collections/nlp/data/tokenizers/__init__.py @@ -1,6 +1,22 @@ -from .bert_tokenizer import NemoBertTokenizer -from .char_tokenizer import CharTokenizer -from .gpt2_tokenizer import NemoGPT2Tokenizer -from .spc_tokenizer import SentencePieceTokenizer -from .word_tokenizer import WordTokenizer -from .yttm_tokenizer import YouTokenToMeTokenizer +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. 
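# An illustrative sketch (task name chosen as an example) of how the three registries above
# fit together: the processor yields examples and labels, output_modes selects classification
# vs. regression, and GLUE_TASKS_NUM_LABELS sizes the output head.
task_name = "sts-b"
processor = processors[task_name]()              # StsbProcessor
output_mode = output_modes[task_name]            # "regression"
num_labels = GLUE_TASKS_NUM_LABELS[task_name]    # 1
labels = processor.get_labels()                  # [None] for the regression task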
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data.tokenizers.bert_tokenizer import NemoBertTokenizer +from nemo.collections.nlp.data.tokenizers.char_tokenizer import CharTokenizer +from nemo.collections.nlp.data.tokenizers.gpt2_tokenizer import NemoGPT2Tokenizer +from nemo.collections.nlp.data.tokenizers.sentencepiece_tokenizer import SentencePieceTokenizer +from nemo.collections.nlp.data.tokenizers.word_tokenizer import WordTokenizer +from nemo.collections.nlp.data.tokenizers.youtokentome_tokenizer import YouTokenToMeTokenizer diff --git a/nemo/collections/nlp/data/tokenizers/bert_tokenizer.py b/nemo/collections/nlp/data/tokenizers/bert_tokenizer.py index cc6b20e875a8..abb6e27dfd06 100644 --- a/nemo/collections/nlp/data/tokenizers/bert_tokenizer.py +++ b/nemo/collections/nlp/data/tokenizers/bert_tokenizer.py @@ -1,8 +1,26 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + import re from transformers import BertTokenizer -from .tokenizer_spec import TokenizerSpec +from nemo.collections.nlp.data.tokenizers.tokenizer_spec import TokenizerSpec + +__all__ = ['NemoBertTokenizer'] def handle_quotes(text): diff --git a/nemo/collections/nlp/data/tokenizers/char_tokenizer.py b/nemo/collections/nlp/data/tokenizers/char_tokenizer.py index d634277bd3d5..f2d525a5d6e5 100644 --- a/nemo/collections/nlp/data/tokenizers/char_tokenizer.py +++ b/nemo/collections/nlp/data/tokenizers/char_tokenizer.py @@ -1,4 +1,22 @@ -from .tokenizer_spec import TokenizerSpec +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +from nemo.collections.nlp.data.tokenizers.tokenizer_spec import TokenizerSpec + +__all__ = ['CharTokenizer'] class CharTokenizer(TokenizerSpec): diff --git a/nemo/collections/nlp/utils/metrics/fairseq_tokenizer.py b/nemo/collections/nlp/data/tokenizers/fairseq_tokenizer.py similarity index 79% rename from nemo/collections/nlp/utils/metrics/fairseq_tokenizer.py rename to nemo/collections/nlp/data/tokenizers/fairseq_tokenizer.py index f6bfdfad9473..be654de9421a 100644 --- a/nemo/collections/nlp/utils/metrics/fairseq_tokenizer.py +++ b/nemo/collections/nlp/data/tokenizers/fairseq_tokenizer.py @@ -1,3 +1,19 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + """ Code from https://github.com/NVIDIA/DeepLearningExamples/blob/ master/PyTorch/Translation/Transformer/fairseq/tokenizer.py @@ -8,6 +24,8 @@ import unicodedata from collections import defaultdict +__all__ = ['get_unicode_categories', 'tokenize_en'] + def get_unicode_categories(): cats = defaultdict(list) diff --git a/nemo/collections/nlp/data/tokenizers/gpt2_tokenizer.py b/nemo/collections/nlp/data/tokenizers/gpt2_tokenizer.py index 60e6c3cf3cd5..fe443d90db35 100644 --- a/nemo/collections/nlp/data/tokenizers/gpt2_tokenizer.py +++ b/nemo/collections/nlp/data/tokenizers/gpt2_tokenizer.py @@ -1,6 +1,24 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + from transformers import GPT2Tokenizer -from .tokenizer_spec import TokenizerSpec +from nemo.collections.nlp.data.tokenizers.tokenizer_spec import TokenizerSpec + +__all__ = ['NemoGPT2Tokenizer'] class NemoGPT2Tokenizer(TokenizerSpec): diff --git a/nemo/collections/nlp/data/tokenizers/spc_tokenizer.py b/nemo/collections/nlp/data/tokenizers/sentencepiece_tokenizer.py similarity index 79% rename from nemo/collections/nlp/data/tokenizers/spc_tokenizer.py rename to nemo/collections/nlp/data/tokenizers/sentencepiece_tokenizer.py index 67a2c00bda3e..0cc7e9b62cf2 100644 --- a/nemo/collections/nlp/data/tokenizers/spc_tokenizer.py +++ b/nemo/collections/nlp/data/tokenizers/sentencepiece_tokenizer.py @@ -1,6 +1,24 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + import sentencepiece as spm -from .tokenizer_spec import TokenizerSpec +from nemo.collections.nlp.data.tokenizers.tokenizer_spec import TokenizerSpec + +__all__ = ['SentencePieceTokenizer'] class SentencePieceTokenizer(TokenizerSpec): diff --git a/nemo/collections/nlp/data/tokenizers/tokenizer_spec.py b/nemo/collections/nlp/data/tokenizers/tokenizer_spec.py index eeadf617c189..c9035933ca6c 100644 --- a/nemo/collections/nlp/data/tokenizers/tokenizer_spec.py +++ b/nemo/collections/nlp/data/tokenizers/tokenizer_spec.py @@ -1,6 +1,24 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + from abc import ABC, abstractmethod from typing import List +__all__ = ['TokenizerSpec'] + class TokenizerSpec(ABC): @abstractmethod diff --git a/nemo/collections/nlp/data/tokenizers/word_tokenizer.py b/nemo/collections/nlp/data/tokenizers/word_tokenizer.py index f45940f03c58..0d037f981dc6 100644 --- a/nemo/collections/nlp/data/tokenizers/word_tokenizer.py +++ b/nemo/collections/nlp/data/tokenizers/word_tokenizer.py @@ -1,4 +1,22 @@ -from .tokenizer_spec import TokenizerSpec +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data.tokenizers.tokenizer_spec import TokenizerSpec + +__all__ = ['WordTokenizer'] class WordTokenizer(TokenizerSpec): diff --git a/nemo/collections/nlp/data/tokenizers/yttm_tokenizer.py b/nemo/collections/nlp/data/tokenizers/youtokentome_tokenizer.py similarity index 58% rename from nemo/collections/nlp/data/tokenizers/yttm_tokenizer.py rename to nemo/collections/nlp/data/tokenizers/youtokentome_tokenizer.py index 94acc3e4b1ae..ffc62be9ff28 100644 --- a/nemo/collections/nlp/data/tokenizers/yttm_tokenizer.py +++ b/nemo/collections/nlp/data/tokenizers/youtokentome_tokenizer.py @@ -1,6 +1,24 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + import youtokentome as yttm -from .tokenizer_spec import TokenizerSpec +from nemo.collections.nlp.data.tokenizers.tokenizer_spec import TokenizerSpec + +__all__ = ['YouTokenToMeTokenizer'] class YouTokenToMeTokenizer(TokenizerSpec): diff --git a/nemo/collections/nlp/data/utils.py b/nemo/collections/nlp/data/utils.py deleted file mode 100644 index 1119f48a91aa..000000000000 --- a/nemo/collections/nlp/data/utils.py +++ /dev/null @@ -1,125 +0,0 @@ -import os -import pickle -import re -import string - -import numpy as np - -import nemo - - -def dataset_to_ids(dataset, tokenizer, cache_ids=False, add_bos_eos=True): - """ - Reads dataset from file line by line, tokenizes each line with tokenizer, - and returns list of lists which corresponds to ids of tokenized strings. 
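# The tokenizer modules are renamed in this patch (spc_tokenizer -> sentencepiece_tokenizer,
# yttm_tokenizer -> youtokentome_tokenizer) and re-exported from the package __init__, so
# downstream code can use either import path (illustrative usage; assumes nemo is installed):
from nemo.collections.nlp.data.tokenizers import SentencePieceTokenizer
from nemo.collections.nlp.data.tokenizers.youtokentome_tokenizer import YouTokenToMeTokenizer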
- - Args: - dataset: path to dataset - tokenizer: tokenizer to convert text into ids - cache_ids: if True, ids are saved to disk as pickle file - with similar name (e.g., data.txt --> data.txt.pkl) - add_bos_eos: bool, whether to add and symbols (e.g., for NMT) - Returns: - ids: list of ids which correspond to tokenized strings of the dataset - """ - - cached_ids_dataset = dataset + str(".pkl") - if os.path.isfile(cached_ids_dataset): - nemo.logging.info("Loading cached tokenized dataset ...") - ids = pickle.load(open(cached_ids_dataset, "rb")) - else: - nemo.logging.info("Tokenizing dataset ...") - data = open(dataset, "rb").readlines() - ids = [] - for sentence in data: - sent_ids = tokenizer.text_to_ids(sentence.decode("utf-8")) - if add_bos_eos: - sent_ids = [tokenizer.bos_id()] + sent_ids + [tokenizer.eos_id()] - ids.append(sent_ids) - if cache_ids: - nemo.logging.info("Caching tokenized dataset ...") - pickle.dump(ids, open(cached_ids_dataset, "wb")) - return ids - - -def clean_src_and_target( - src_ids, tgt_ids, max_tokens=128, min_tokens=3, max_tokens_diff=25, max_tokens_ratio=2.5, -): - """ - Cleans source and target sentences to get rid of noisy data. - Specifically, a pair of sentences is removed if - -- either source or target is longer than *max_tokens* - -- either source or target is shorter than *min_tokens* - -- absolute difference between source and target is larger than - *max_tokens_diff* - -- one sentence is *max_tokens_ratio* times longer than the other - """ - - if len(src_ids) != len(tgt_ids): - raise ValueError("Source and target corpora have different lengths!") - src_ids_, tgt_ids_ = [], [] - for i in range(len(src_ids)): - src_len, tgt_len = len(src_ids[i]), len(tgt_ids[i]) - if ( - src_len > max_tokens - or tgt_len > max_tokens - or src_len < min_tokens - or tgt_len < min_tokens - or (src_ids[i] == tgt_ids[i]) - or np.abs(src_len - tgt_len) > max_tokens_diff - ): - continue - ratio = max(src_len - 2, 1) / max(tgt_len - 2, 1) - if ratio > max_tokens_ratio or ratio < (1 / max_tokens_ratio): - continue - src_ids_.append(src_ids[i]) - tgt_ids_.append(tgt_ids[i]) - return src_ids_, tgt_ids_ - - -def remove_punctuation_from_sentence(sentence): - sentence = re.sub('[' + string.punctuation + ']', '', sentence) - sentence = sentence.lower() - return sentence - - -def check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token. - - Because of the sliding window approach taken to scoring documents, - a single token can appear in multiple documents. - - Example: - Doc: the man went to the store and bought a gallon of milk - Span A: the man went to the - Span B: to the store and bought - Span C: and bought a gallon of - ... - - Now the word 'bought' will have two scores from spans B and C. We only - want to consider the score with "maximum context", which we define as - the *minimum* of its left and right context (the *sum* of left and - right context will always be the same, of course). - - In the example the maximum context for 'bought' would be span C since - it has 1 left context and 3 right context, while span B has 4 left context - and 0 right context. - - Code adapted from the code by the Google AI and HuggingFace. 
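# A toy illustration (dummy token ids) of the clean_src_and_target() filters above: the
# second pair is dropped because the source is shorter than min_tokens, the third because
# the source/target length difference exceeds max_tokens_diff.
src = [[1, 2, 3, 4, 5], [1, 2], [1] * 40]
tgt = [[1, 2, 3, 4, 6], [1, 2, 3, 4], [1] * 8]
src_clean, tgt_clean = clean_src_and_target(src, tgt, max_tokens=128, min_tokens=3,
                                            max_tokens_diff=25, max_tokens_ratio=2.5)
print(len(src_clean), len(tgt_clean))   # 1 1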
- """ - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index diff --git a/nemo/collections/nlp/huggingface/__init__.py b/nemo/collections/nlp/huggingface/__init__.py deleted file mode 100644 index 5074307bd60a..000000000000 --- a/nemo/collections/nlp/huggingface/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .bert import BERT diff --git a/nemo/collections/nlp/metrics/__init__.py b/nemo/collections/nlp/metrics/__init__.py new file mode 100644 index 000000000000..4b9cfe094485 --- /dev/null +++ b/nemo/collections/nlp/metrics/__init__.py @@ -0,0 +1,17 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.metrics.bleu import * diff --git a/nemo/collections/nlp/utils/metrics/bleu.py b/nemo/collections/nlp/metrics/bleu.py similarity index 88% rename from nemo/collections/nlp/utils/metrics/bleu.py rename to nemo/collections/nlp/metrics/bleu.py index 04e67d1788d6..bab9c5f4c0f6 100644 --- a/nemo/collections/nlp/utils/metrics/bleu.py +++ b/nemo/collections/nlp/metrics/bleu.py @@ -22,17 +22,7 @@ import collections import math - -def compound_split(segment): - segment = segment.replace(".", " . ") - segment = segment.replace(",", " , ") - segment = segment.replace(":", " : ") - segment = segment.replace("!", " ! ") - segment = segment.replace("?", " ? 
") - segment = segment.replace("-", " ##AT##-##AT## ") - segment = segment.replace("\"", " "e ") - segment = segment.replace("%", " % ") - return segment.split() +__all__ = ['compute_bleu'] def _get_ngrams(segment, max_order): @@ -117,11 +107,4 @@ def compute_bleu(reference_corpus, translation_corpus, max_order=4, smooth=False precisions = [p * 100 for p in precisions] - return ( - bleu * 100, - precisions, - bp, - ratio, - translation_length, - reference_length, - ) + return (bleu * 100, precisions, bp, ratio, translation_length, reference_length) diff --git a/nemo/collections/nlp/utils/metrics/sacrebleu.py b/nemo/collections/nlp/metrics/sacrebleu.py similarity index 90% rename from nemo/collections/nlp/utils/metrics/sacrebleu.py rename to nemo/collections/nlp/metrics/sacrebleu.py index 411743a91d34..586b19bf2d30 100755 --- a/nemo/collections/nlp/utils/metrics/sacrebleu.py +++ b/nemo/collections/nlp/metrics/sacrebleu.py @@ -36,8 +36,8 @@ from itertools import zip_longest from typing import Iterable, List, Tuple, Union -from .fairseq_tokenizer import tokenize_en from nemo import logging +from nemo.collections.nlp.data.tokenizers.fairseq_tokenizer import tokenize_en VERSION = '1.3.5' @@ -117,10 +117,10 @@ '\n publisher = "Association for Computational Linguistics",\n pages = "543--553",' '\n location = "Brussels, Belgium",\n url = "http://aclweb.org/anthology/D18-1050"\n}', 'md5': ['8ce1831ac584979ba8cdcd9d4be43e1d'], - 'en-fr': ['1:MTNT/valid/valid.en-fr.tsv', '2:MTNT/valid/valid.en-fr.tsv',], - 'fr-en': ['1:MTNT/valid/valid.fr-en.tsv', '2:MTNT/valid/valid.fr-en.tsv',], - 'en-ja': ['1:MTNT/valid/valid.en-ja.tsv', '2:MTNT/valid/valid.en-ja.tsv',], - 'ja-en': ['1:MTNT/valid/valid.ja-en.tsv', '2:MTNT/valid/valid.ja-en.tsv',], + 'en-fr': ['1:MTNT/valid/valid.en-fr.tsv', '2:MTNT/valid/valid.en-fr.tsv'], + 'fr-en': ['1:MTNT/valid/valid.fr-en.tsv', '2:MTNT/valid/valid.fr-en.tsv'], + 'en-ja': ['1:MTNT/valid/valid.en-ja.tsv', '2:MTNT/valid/valid.en-ja.tsv'], + 'ja-en': ['1:MTNT/valid/valid.ja-en.tsv', '2:MTNT/valid/valid.ja-en.tsv'], }, 'mtnt1.1/train': { 'data': ['https://github.com/pmichel31415/mtnt/releases/download/v1.1/MTNT.1.1.tar.gz'], @@ -132,44 +132,44 @@ '\n publisher = "Association for Computational Linguistics",\n pages = "543--553",' '\n location = "Brussels, Belgium",\n url = "http://aclweb.org/anthology/D18-1050"\n}', 'md5': ['8ce1831ac584979ba8cdcd9d4be43e1d'], - 'en-fr': ['1:MTNT/train/train.en-fr.tsv', '2:MTNT/train/train.en-fr.tsv',], - 'fr-en': ['1:MTNT/train/train.fr-en.tsv', '2:MTNT/train/train.fr-en.tsv',], - 'en-ja': ['1:MTNT/train/train.en-ja.tsv', '2:MTNT/train/train.en-ja.tsv',], - 'ja-en': ['1:MTNT/train/train.ja-en.tsv', '2:MTNT/train/train.ja-en.tsv',], + 'en-fr': ['1:MTNT/train/train.en-fr.tsv', '2:MTNT/train/train.en-fr.tsv'], + 'fr-en': ['1:MTNT/train/train.fr-en.tsv', '2:MTNT/train/train.fr-en.tsv'], + 'en-ja': ['1:MTNT/train/train.en-ja.tsv', '2:MTNT/train/train.en-ja.tsv'], + 'ja-en': ['1:MTNT/train/train.ja-en.tsv', '2:MTNT/train/train.ja-en.tsv'], }, 'wmt19': { 'data': ['http://data.statmt.org/wmt19/translation-task/test.tgz'], 'md5': ['84de7162d158e28403103b01aeefc39a'], - 'cs-de': ['sgm/newstest2019-csde-src.cs.sgm', 'sgm/newstest2019-csde-ref.de.sgm',], - 'de-cs': ['sgm/newstest2019-decs-src.de.sgm', 'sgm/newstest2019-decs-ref.cs.sgm',], - 'de-en': ['sgm/newstest2019-deen-src.de.sgm', 'sgm/newstest2019-deen-ref.en.sgm',], - 'de-fr': ['sgm/newstest2019-defr-src.de.sgm', 'sgm/newstest2019-defr-ref.fr.sgm',], - 'en-cs': 
['sgm/newstest2019-encs-src.en.sgm', 'sgm/newstest2019-encs-ref.cs.sgm',], - 'en-de': ['sgm/newstest2019-ende-src.en.sgm', 'sgm/newstest2019-ende-ref.de.sgm',], - 'en-fi': ['sgm/newstest2019-enfi-src.en.sgm', 'sgm/newstest2019-enfi-ref.fi.sgm',], - 'en-gu': ['sgm/newstest2019-engu-src.en.sgm', 'sgm/newstest2019-engu-ref.gu.sgm',], - 'en-kk': ['sgm/newstest2019-enkk-src.en.sgm', 'sgm/newstest2019-enkk-ref.kk.sgm',], - 'en-lt': ['sgm/newstest2019-enlt-src.en.sgm', 'sgm/newstest2019-enlt-ref.lt.sgm',], - 'en-ru': ['sgm/newstest2019-enru-src.en.sgm', 'sgm/newstest2019-enru-ref.ru.sgm',], - 'en-zh': ['sgm/newstest2019-enzh-src.en.sgm', 'sgm/newstest2019-enzh-ref.zh.sgm',], - 'fi-en': ['sgm/newstest2019-fien-src.fi.sgm', 'sgm/newstest2019-fien-ref.en.sgm',], - 'fr-de': ['sgm/newstest2019-frde-src.fr.sgm', 'sgm/newstest2019-frde-ref.de.sgm',], - 'gu-en': ['sgm/newstest2019-guen-src.gu.sgm', 'sgm/newstest2019-guen-ref.en.sgm',], - 'kk-en': ['sgm/newstest2019-kken-src.kk.sgm', 'sgm/newstest2019-kken-ref.en.sgm',], - 'lt-en': ['sgm/newstest2019-lten-src.lt.sgm', 'sgm/newstest2019-lten-ref.en.sgm',], - 'ru-en': ['sgm/newstest2019-ruen-src.ru.sgm', 'sgm/newstest2019-ruen-ref.en.sgm',], - 'zh-en': ['sgm/newstest2019-zhen-src.zh.sgm', 'sgm/newstest2019-zhen-ref.en.sgm',], + 'cs-de': ['sgm/newstest2019-csde-src.cs.sgm', 'sgm/newstest2019-csde-ref.de.sgm'], + 'de-cs': ['sgm/newstest2019-decs-src.de.sgm', 'sgm/newstest2019-decs-ref.cs.sgm'], + 'de-en': ['sgm/newstest2019-deen-src.de.sgm', 'sgm/newstest2019-deen-ref.en.sgm'], + 'de-fr': ['sgm/newstest2019-defr-src.de.sgm', 'sgm/newstest2019-defr-ref.fr.sgm'], + 'en-cs': ['sgm/newstest2019-encs-src.en.sgm', 'sgm/newstest2019-encs-ref.cs.sgm'], + 'en-de': ['sgm/newstest2019-ende-src.en.sgm', 'sgm/newstest2019-ende-ref.de.sgm'], + 'en-fi': ['sgm/newstest2019-enfi-src.en.sgm', 'sgm/newstest2019-enfi-ref.fi.sgm'], + 'en-gu': ['sgm/newstest2019-engu-src.en.sgm', 'sgm/newstest2019-engu-ref.gu.sgm'], + 'en-kk': ['sgm/newstest2019-enkk-src.en.sgm', 'sgm/newstest2019-enkk-ref.kk.sgm'], + 'en-lt': ['sgm/newstest2019-enlt-src.en.sgm', 'sgm/newstest2019-enlt-ref.lt.sgm'], + 'en-ru': ['sgm/newstest2019-enru-src.en.sgm', 'sgm/newstest2019-enru-ref.ru.sgm'], + 'en-zh': ['sgm/newstest2019-enzh-src.en.sgm', 'sgm/newstest2019-enzh-ref.zh.sgm'], + 'fi-en': ['sgm/newstest2019-fien-src.fi.sgm', 'sgm/newstest2019-fien-ref.en.sgm'], + 'fr-de': ['sgm/newstest2019-frde-src.fr.sgm', 'sgm/newstest2019-frde-ref.de.sgm'], + 'gu-en': ['sgm/newstest2019-guen-src.gu.sgm', 'sgm/newstest2019-guen-ref.en.sgm'], + 'kk-en': ['sgm/newstest2019-kken-src.kk.sgm', 'sgm/newstest2019-kken-ref.en.sgm'], + 'lt-en': ['sgm/newstest2019-lten-src.lt.sgm', 'sgm/newstest2019-lten-ref.en.sgm'], + 'ru-en': ['sgm/newstest2019-ruen-src.ru.sgm', 'sgm/newstest2019-ruen-ref.en.sgm'], + 'zh-en': ['sgm/newstest2019-zhen-src.zh.sgm', 'sgm/newstest2019-zhen-ref.en.sgm'], }, 'wmt19/dev': { 'data': ['http://data.statmt.org/wmt19/translation-task/dev.tgz'], 'description': 'Development data for tasks new to 2019.', 'md5': ['f2ec7af5947c19e0cacb3882eb208002'], - 'lt-en': ['dev/newsdev2019-lten-src.lt.sgm', 'dev/newsdev2019-lten-ref.en.sgm',], - 'en-lt': ['dev/newsdev2019-enlt-src.en.sgm', 'dev/newsdev2019-enlt-ref.lt.sgm',], - 'gu-en': ['dev/newsdev2019-guen-src.gu.sgm', 'dev/newsdev2019-guen-ref.en.sgm',], - 'en-gu': ['dev/newsdev2019-engu-src.en.sgm', 'dev/newsdev2019-engu-ref.gu.sgm',], - 'kk-en': ['dev/newsdev2019-kken-src.kk.sgm', 'dev/newsdev2019-kken-ref.en.sgm',], - 'en-kk': ['dev/newsdev2019-enkk-src.en.sgm', 
'dev/newsdev2019-enkk-ref.kk.sgm',], + 'lt-en': ['dev/newsdev2019-lten-src.lt.sgm', 'dev/newsdev2019-lten-ref.en.sgm'], + 'en-lt': ['dev/newsdev2019-enlt-src.en.sgm', 'dev/newsdev2019-enlt-ref.lt.sgm'], + 'gu-en': ['dev/newsdev2019-guen-src.gu.sgm', 'dev/newsdev2019-guen-ref.en.sgm'], + 'en-gu': ['dev/newsdev2019-engu-src.en.sgm', 'dev/newsdev2019-engu-ref.gu.sgm'], + 'kk-en': ['dev/newsdev2019-kken-src.kk.sgm', 'dev/newsdev2019-kken-ref.en.sgm'], + 'en-kk': ['dev/newsdev2019-enkk-src.en.sgm', 'dev/newsdev2019-enkk-ref.kk.sgm'], }, 'wmt18': { 'data': ['http://data.statmt.org/wmt18/translation-task/test.tgz'], @@ -183,20 +183,20 @@ '\n address = "Belgium, Brussels",\n publisher = "Association for Computational ' 'Linguistics",\n url = "https://www.aclweb.org/anthology/W18-6401",\n pages = "272--303",' '\n}', - 'cs-en': ['test/newstest2018-csen-src.cs.sgm', 'test/newstest2018-csen-ref.en.sgm',], - 'de-en': ['test/newstest2018-deen-src.de.sgm', 'test/newstest2018-deen-ref.en.sgm',], - 'en-cs': ['test/newstest2018-encs-src.en.sgm', 'test/newstest2018-encs-ref.cs.sgm',], - 'en-de': ['test/newstest2018-ende-src.en.sgm', 'test/newstest2018-ende-ref.de.sgm',], - 'en-et': ['test/newstest2018-enet-src.en.sgm', 'test/newstest2018-enet-ref.et.sgm',], - 'en-fi': ['test/newstest2018-enfi-src.en.sgm', 'test/newstest2018-enfi-ref.fi.sgm',], - 'en-ru': ['test/newstest2018-enru-src.en.sgm', 'test/newstest2018-enru-ref.ru.sgm',], - 'et-en': ['test/newstest2018-eten-src.et.sgm', 'test/newstest2018-eten-ref.en.sgm',], - 'fi-en': ['test/newstest2018-fien-src.fi.sgm', 'test/newstest2018-fien-ref.en.sgm',], - 'ru-en': ['test/newstest2018-ruen-src.ru.sgm', 'test/newstest2018-ruen-ref.en.sgm',], - 'en-tr': ['test/newstest2018-entr-src.en.sgm', 'test/newstest2018-entr-ref.tr.sgm',], - 'tr-en': ['test/newstest2018-tren-src.tr.sgm', 'test/newstest2018-tren-ref.en.sgm',], - 'en-zh': ['test/newstest2018-enzh-src.en.sgm', 'test/newstest2018-enzh-ref.zh.sgm',], - 'zh-en': ['test/newstest2018-zhen-src.zh.sgm', 'test/newstest2018-zhen-ref.en.sgm',], + 'cs-en': ['test/newstest2018-csen-src.cs.sgm', 'test/newstest2018-csen-ref.en.sgm'], + 'de-en': ['test/newstest2018-deen-src.de.sgm', 'test/newstest2018-deen-ref.en.sgm'], + 'en-cs': ['test/newstest2018-encs-src.en.sgm', 'test/newstest2018-encs-ref.cs.sgm'], + 'en-de': ['test/newstest2018-ende-src.en.sgm', 'test/newstest2018-ende-ref.de.sgm'], + 'en-et': ['test/newstest2018-enet-src.en.sgm', 'test/newstest2018-enet-ref.et.sgm'], + 'en-fi': ['test/newstest2018-enfi-src.en.sgm', 'test/newstest2018-enfi-ref.fi.sgm'], + 'en-ru': ['test/newstest2018-enru-src.en.sgm', 'test/newstest2018-enru-ref.ru.sgm'], + 'et-en': ['test/newstest2018-eten-src.et.sgm', 'test/newstest2018-eten-ref.en.sgm'], + 'fi-en': ['test/newstest2018-fien-src.fi.sgm', 'test/newstest2018-fien-ref.en.sgm'], + 'ru-en': ['test/newstest2018-ruen-src.ru.sgm', 'test/newstest2018-ruen-ref.en.sgm'], + 'en-tr': ['test/newstest2018-entr-src.en.sgm', 'test/newstest2018-entr-ref.tr.sgm'], + 'tr-en': ['test/newstest2018-tren-src.tr.sgm', 'test/newstest2018-tren-ref.en.sgm'], + 'en-zh': ['test/newstest2018-enzh-src.en.sgm', 'test/newstest2018-enzh-ref.zh.sgm'], + 'zh-en': ['test/newstest2018-zhen-src.zh.sgm', 'test/newstest2018-zhen-ref.en.sgm'], }, 'wmt18/test-ts': { 'data': ['http://data.statmt.org/wmt18/translation-task/test-ts.tgz'], @@ -221,8 +221,8 @@ 'data': ['http://data.statmt.org/wmt18/translation-task/dev.tgz'], 'md5': ['486f391da54a7a3247f02ebd25996f24'], 'description': 'Development data (Estonian<>English).', - 
'et-en': ['dev/newsdev2018-eten-src.et.sgm', 'dev/newsdev2018-eten-ref.en.sgm',], - 'en-et': ['dev/newsdev2018-enet-src.en.sgm', 'dev/newsdev2018-enet-ref.et.sgm',], + 'et-en': ['dev/newsdev2018-eten-src.et.sgm', 'dev/newsdev2018-eten-ref.en.sgm'], + 'en-et': ['dev/newsdev2018-enet-src.en.sgm', 'dev/newsdev2018-enet-ref.et.sgm'], }, 'wmt17': { 'data': ['http://data.statmt.org/wmt17/translation-task/test.tgz'], @@ -238,26 +238,26 @@ '\n address = {Copenhagen, Denmark},\n publisher = {Association for Computational ' 'Linguistics},\n pages = {169--214},\n url = {' 'http://www.aclweb.org/anthology/W17-4717}\n}', - 'cs-en': ['test/newstest2017-csen-src.cs.sgm', 'test/newstest2017-csen-ref.en.sgm',], - 'de-en': ['test/newstest2017-deen-src.de.sgm', 'test/newstest2017-deen-ref.en.sgm',], - 'en-cs': ['test/newstest2017-encs-src.en.sgm', 'test/newstest2017-encs-ref.cs.sgm',], - 'en-de': ['test/newstest2017-ende-src.en.sgm', 'test/newstest2017-ende-ref.de.sgm',], - 'en-fi': ['test/newstest2017-enfi-src.en.sgm', 'test/newstest2017-enfi-ref.fi.sgm',], - 'en-lv': ['test/newstest2017-enlv-src.en.sgm', 'test/newstest2017-enlv-ref.lv.sgm',], - 'en-ru': ['test/newstest2017-enru-src.en.sgm', 'test/newstest2017-enru-ref.ru.sgm',], - 'en-tr': ['test/newstest2017-entr-src.en.sgm', 'test/newstest2017-entr-ref.tr.sgm',], - 'en-zh': ['test/newstest2017-enzh-src.en.sgm', 'test/newstest2017-enzh-ref.zh.sgm',], - 'fi-en': ['test/newstest2017-fien-src.fi.sgm', 'test/newstest2017-fien-ref.en.sgm',], - 'lv-en': ['test/newstest2017-lven-src.lv.sgm', 'test/newstest2017-lven-ref.en.sgm',], - 'ru-en': ['test/newstest2017-ruen-src.ru.sgm', 'test/newstest2017-ruen-ref.en.sgm',], - 'tr-en': ['test/newstest2017-tren-src.tr.sgm', 'test/newstest2017-tren-ref.en.sgm',], - 'zh-en': ['test/newstest2017-zhen-src.zh.sgm', 'test/newstest2017-zhen-ref.en.sgm',], + 'cs-en': ['test/newstest2017-csen-src.cs.sgm', 'test/newstest2017-csen-ref.en.sgm'], + 'de-en': ['test/newstest2017-deen-src.de.sgm', 'test/newstest2017-deen-ref.en.sgm'], + 'en-cs': ['test/newstest2017-encs-src.en.sgm', 'test/newstest2017-encs-ref.cs.sgm'], + 'en-de': ['test/newstest2017-ende-src.en.sgm', 'test/newstest2017-ende-ref.de.sgm'], + 'en-fi': ['test/newstest2017-enfi-src.en.sgm', 'test/newstest2017-enfi-ref.fi.sgm'], + 'en-lv': ['test/newstest2017-enlv-src.en.sgm', 'test/newstest2017-enlv-ref.lv.sgm'], + 'en-ru': ['test/newstest2017-enru-src.en.sgm', 'test/newstest2017-enru-ref.ru.sgm'], + 'en-tr': ['test/newstest2017-entr-src.en.sgm', 'test/newstest2017-entr-ref.tr.sgm'], + 'en-zh': ['test/newstest2017-enzh-src.en.sgm', 'test/newstest2017-enzh-ref.zh.sgm'], + 'fi-en': ['test/newstest2017-fien-src.fi.sgm', 'test/newstest2017-fien-ref.en.sgm'], + 'lv-en': ['test/newstest2017-lven-src.lv.sgm', 'test/newstest2017-lven-ref.en.sgm'], + 'ru-en': ['test/newstest2017-ruen-src.ru.sgm', 'test/newstest2017-ruen-ref.en.sgm'], + 'tr-en': ['test/newstest2017-tren-src.tr.sgm', 'test/newstest2017-tren-ref.en.sgm'], + 'zh-en': ['test/newstest2017-zhen-src.zh.sgm', 'test/newstest2017-zhen-ref.en.sgm'], }, 'wmt17/B': { 'data': ['http://data.statmt.org/wmt17/translation-task/test.tgz'], 'md5': ['86a1724c276004aa25455ae2a04cef26'], 'description': 'Additional reference for EN-FI and FI-EN.', - 'en-fi': ['test/newstestB2017-enfi-src.en.sgm', 'test/newstestB2017-enfi-ref.fi.sgm',], + 'en-fi': ['test/newstestB2017-enfi-src.en.sgm', 'test/newstestB2017-enfi-ref.fi.sgm'], }, 'wmt17/tworefs': { 'data': ['http://data.statmt.org/wmt17/translation-task/test.tgz'], @@ -273,24 +273,24 @@ 'data': 
['http://data.statmt.org/wmt17/translation-task/test-update-1.tgz'], 'md5': ['91dbfd5af99bc6891a637a68e04dfd41'], 'description': 'Improved zh-en and en-zh translations.', - 'en-zh': ['newstest2017-enzh-src.en.sgm', 'newstest2017-enzh-ref.zh.sgm',], - 'zh-en': ['newstest2017-zhen-src.zh.sgm', 'newstest2017-zhen-ref.en.sgm',], + 'en-zh': ['newstest2017-enzh-src.en.sgm', 'newstest2017-enzh-ref.zh.sgm'], + 'zh-en': ['newstest2017-zhen-src.zh.sgm', 'newstest2017-zhen-ref.en.sgm'], }, 'wmt17/dev': { 'data': ['http://data.statmt.org/wmt17/translation-task/dev.tgz'], 'md5': ['9b1aa63c1cf49dccdd20b962fe313989'], 'description': 'Development sets released for new languages in 2017.', - 'en-lv': ['dev/newsdev2017-enlv-src.en.sgm', 'dev/newsdev2017-enlv-ref.lv.sgm',], - 'en-zh': ['dev/newsdev2017-enzh-src.en.sgm', 'dev/newsdev2017-enzh-ref.zh.sgm',], - 'lv-en': ['dev/newsdev2017-lven-src.lv.sgm', 'dev/newsdev2017-lven-ref.en.sgm',], - 'zh-en': ['dev/newsdev2017-zhen-src.zh.sgm', 'dev/newsdev2017-zhen-ref.en.sgm',], + 'en-lv': ['dev/newsdev2017-enlv-src.en.sgm', 'dev/newsdev2017-enlv-ref.lv.sgm'], + 'en-zh': ['dev/newsdev2017-enzh-src.en.sgm', 'dev/newsdev2017-enzh-ref.zh.sgm'], + 'lv-en': ['dev/newsdev2017-lven-src.lv.sgm', 'dev/newsdev2017-lven-ref.en.sgm'], + 'zh-en': ['dev/newsdev2017-zhen-src.zh.sgm', 'dev/newsdev2017-zhen-ref.en.sgm'], }, 'wmt17/ms': { 'data': [ 'https://github.com/MicrosoftTranslator/Translator-HumanParityData/archive/master.zip', 'http://data.statmt.org/wmt17/translation-task/test-update-1.tgz', ], - 'md5': ['18fdaa7a3c84cf6ef688da1f6a5fa96f', '91dbfd5af99bc6891a637a68e04dfd41',], + 'md5': ['18fdaa7a3c84cf6ef688da1f6a5fa96f', '91dbfd5af99bc6891a637a68e04dfd41'], 'description': 'Additional Chinese-English references from Microsoft Research.', 'citation': '@inproceedings{achieving-human-parity-on-automatic-chinese-to-english-news-translation,' '\n author = {Hassan Awadalla, Hany and Aue, Anthony and Chen, Chang and Chowdhary, Vishal and ' @@ -317,9 +317,9 @@ 'newstest2017-zhen-src.zh.sgm', 'newstest2017-zhen-ref.en.sgm', 'Translator-HumanParityData-master/Translator-HumanParityData/References/Translator-HumanParityData' - '-Reference-HT.txt', + + '-Reference-HT.txt', 'Translator-HumanParityData-master/Translator-HumanParityData/References/Translator-HumanParityData' - '-Reference-PE.txt', + + '-Reference-PE.txt', ], }, 'wmt16': { @@ -336,24 +336,24 @@ 'Machine Translation},\n month = {August},\n year = {2016},\n address = {Berlin, ' 'Germany},\n publisher = {Association for Computational Linguistics},\n pages = {' '131--198},\n url = {http://www.aclweb.org/anthology/W/W16/W16-2301}\n}', - 'cs-en': ['test/newstest2016-csen-src.cs.sgm', 'test/newstest2016-csen-ref.en.sgm',], - 'de-en': ['test/newstest2016-deen-src.de.sgm', 'test/newstest2016-deen-ref.en.sgm',], - 'en-cs': ['test/newstest2016-encs-src.en.sgm', 'test/newstest2016-encs-ref.cs.sgm',], - 'en-de': ['test/newstest2016-ende-src.en.sgm', 'test/newstest2016-ende-ref.de.sgm',], - 'en-fi': ['test/newstest2016-enfi-src.en.sgm', 'test/newstest2016-enfi-ref.fi.sgm',], - 'en-ro': ['test/newstest2016-enro-src.en.sgm', 'test/newstest2016-enro-ref.ro.sgm',], - 'en-ru': ['test/newstest2016-enru-src.en.sgm', 'test/newstest2016-enru-ref.ru.sgm',], - 'en-tr': ['test/newstest2016-entr-src.en.sgm', 'test/newstest2016-entr-ref.tr.sgm',], - 'fi-en': ['test/newstest2016-fien-src.fi.sgm', 'test/newstest2016-fien-ref.en.sgm',], - 'ro-en': ['test/newstest2016-roen-src.ro.sgm', 'test/newstest2016-roen-ref.en.sgm',], - 'ru-en': 
['test/newstest2016-ruen-src.ru.sgm', 'test/newstest2016-ruen-ref.en.sgm',], - 'tr-en': ['test/newstest2016-tren-src.tr.sgm', 'test/newstest2016-tren-ref.en.sgm',], + 'cs-en': ['test/newstest2016-csen-src.cs.sgm', 'test/newstest2016-csen-ref.en.sgm'], + 'de-en': ['test/newstest2016-deen-src.de.sgm', 'test/newstest2016-deen-ref.en.sgm'], + 'en-cs': ['test/newstest2016-encs-src.en.sgm', 'test/newstest2016-encs-ref.cs.sgm'], + 'en-de': ['test/newstest2016-ende-src.en.sgm', 'test/newstest2016-ende-ref.de.sgm'], + 'en-fi': ['test/newstest2016-enfi-src.en.sgm', 'test/newstest2016-enfi-ref.fi.sgm'], + 'en-ro': ['test/newstest2016-enro-src.en.sgm', 'test/newstest2016-enro-ref.ro.sgm'], + 'en-ru': ['test/newstest2016-enru-src.en.sgm', 'test/newstest2016-enru-ref.ru.sgm'], + 'en-tr': ['test/newstest2016-entr-src.en.sgm', 'test/newstest2016-entr-ref.tr.sgm'], + 'fi-en': ['test/newstest2016-fien-src.fi.sgm', 'test/newstest2016-fien-ref.en.sgm'], + 'ro-en': ['test/newstest2016-roen-src.ro.sgm', 'test/newstest2016-roen-ref.en.sgm'], + 'ru-en': ['test/newstest2016-ruen-src.ru.sgm', 'test/newstest2016-ruen-ref.en.sgm'], + 'tr-en': ['test/newstest2016-tren-src.tr.sgm', 'test/newstest2016-tren-ref.en.sgm'], }, 'wmt16/B': { 'data': ['http://data.statmt.org/wmt16/translation-task/test.tgz'], 'md5': ['3d809cd0c2c86adb2c67034d15c4e446'], 'description': 'Additional reference for EN-FI.', - 'en-fi': ['test/newstest2016-enfi-src.en.sgm', 'test/newstestB2016-enfi-ref.fi.sgm',], + 'en-fi': ['test/newstest2016-enfi-src.en.sgm', 'test/newstestB2016-enfi-ref.fi.sgm'], }, 'wmt16/tworefs': { 'data': ['http://data.statmt.org/wmt16/translation-task/test.tgz'], @@ -369,10 +369,10 @@ 'data': ['http://data.statmt.org/wmt16/translation-task/dev.tgz'], 'md5': ['4a3dc2760bb077f4308cce96b06e6af6'], 'description': 'Development sets released for new languages in 2016.', - 'en-ro': ['dev/newsdev2016-enro-src.en.sgm', 'dev/newsdev2016-enro-ref.ro.sgm',], - 'en-tr': ['dev/newsdev2016-entr-src.en.sgm', 'dev/newsdev2016-entr-ref.tr.sgm',], - 'ro-en': ['dev/newsdev2016-roen-src.ro.sgm', 'dev/newsdev2016-roen-ref.en.sgm',], - 'tr-en': ['dev/newsdev2016-tren-src.tr.sgm', 'dev/newsdev2016-tren-ref.en.sgm',], + 'en-ro': ['dev/newsdev2016-enro-src.en.sgm', 'dev/newsdev2016-enro-ref.ro.sgm'], + 'en-tr': ['dev/newsdev2016-entr-src.en.sgm', 'dev/newsdev2016-entr-ref.tr.sgm'], + 'ro-en': ['dev/newsdev2016-roen-src.ro.sgm', 'dev/newsdev2016-roen-ref.en.sgm'], + 'tr-en': ['dev/newsdev2016-tren-src.tr.sgm', 'dev/newsdev2016-tren-ref.en.sgm'], }, 'wmt15': { 'data': ['http://statmt.org/wmt15/test.tgz'], @@ -387,16 +387,16 @@ '\n month = {September},\n year = {2015},\n address = {Lisbon, Portugal},' '\n publisher = {Association for Computational Linguistics},\n pages = {1--46},\n url ' ' = {http://aclweb.org/anthology/W15-3001}\n}', - 'en-fr': ['test/newsdiscusstest2015-enfr-src.en.sgm', 'test/newsdiscusstest2015-enfr-ref.fr.sgm',], - 'fr-en': ['test/newsdiscusstest2015-fren-src.fr.sgm', 'test/newsdiscusstest2015-fren-ref.en.sgm',], - 'cs-en': ['test/newstest2015-csen-src.cs.sgm', 'test/newstest2015-csen-ref.en.sgm',], - 'de-en': ['test/newstest2015-deen-src.de.sgm', 'test/newstest2015-deen-ref.en.sgm',], - 'en-cs': ['test/newstest2015-encs-src.en.sgm', 'test/newstest2015-encs-ref.cs.sgm',], - 'en-de': ['test/newstest2015-ende-src.en.sgm', 'test/newstest2015-ende-ref.de.sgm',], - 'en-fi': ['test/newstest2015-enfi-src.en.sgm', 'test/newstest2015-enfi-ref.fi.sgm',], - 'en-ru': ['test/newstest2015-enru-src.en.sgm', 'test/newstest2015-enru-ref.ru.sgm',], - 
'fi-en': ['test/newstest2015-fien-src.fi.sgm', 'test/newstest2015-fien-ref.en.sgm',], - 'ru-en': ['test/newstest2015-ruen-src.ru.sgm', 'test/newstest2015-ruen-ref.en.sgm',], + 'en-fr': ['test/newsdiscusstest2015-enfr-src.en.sgm', 'test/newsdiscusstest2015-enfr-ref.fr.sgm'], + 'fr-en': ['test/newsdiscusstest2015-fren-src.fr.sgm', 'test/newsdiscusstest2015-fren-ref.en.sgm'], + 'cs-en': ['test/newstest2015-csen-src.cs.sgm', 'test/newstest2015-csen-ref.en.sgm'], + 'de-en': ['test/newstest2015-deen-src.de.sgm', 'test/newstest2015-deen-ref.en.sgm'], + 'en-cs': ['test/newstest2015-encs-src.en.sgm', 'test/newstest2015-encs-ref.cs.sgm'], + 'en-de': ['test/newstest2015-ende-src.en.sgm', 'test/newstest2015-ende-ref.de.sgm'], + 'en-fi': ['test/newstest2015-enfi-src.en.sgm', 'test/newstest2015-enfi-ref.fi.sgm'], + 'en-ru': ['test/newstest2015-enru-src.en.sgm', 'test/newstest2015-enru-ref.ru.sgm'], + 'fi-en': ['test/newstest2015-fien-src.fi.sgm', 'test/newstest2015-fien-ref.en.sgm'], + 'ru-en': ['test/newstest2015-ruen-src.ru.sgm', 'test/newstest2015-ruen-ref.en.sgm'], }, 'wmt14': { 'data': ['http://statmt.org/wmt14/test-filtered.tgz'], @@ -410,31 +410,31 @@ 'on Statistical Machine Translation},\n month = {June},\n year = {2014},\n address ' '= {Baltimore, Maryland, USA},\n publisher = {Association for Computational Linguistics},' '\n pages = {12--58},\n url = {http://www.aclweb.org/anthology/W/W14/W14-3302}\n}', - 'cs-en': ['test/newstest2014-csen-src.cs.sgm', 'test/newstest2014-csen-ref.en.sgm',], - 'en-cs': ['test/newstest2014-csen-src.en.sgm', 'test/newstest2014-csen-ref.cs.sgm',], - 'de-en': ['test/newstest2014-deen-src.de.sgm', 'test/newstest2014-deen-ref.en.sgm',], - 'en-de': ['test/newstest2014-deen-src.en.sgm', 'test/newstest2014-deen-ref.de.sgm',], - 'en-fr': ['test/newstest2014-fren-src.en.sgm', 'test/newstest2014-fren-ref.fr.sgm',], - 'fr-en': ['test/newstest2014-fren-src.fr.sgm', 'test/newstest2014-fren-ref.en.sgm',], - 'en-hi': ['test/newstest2014-hien-src.en.sgm', 'test/newstest2014-hien-ref.hi.sgm',], - 'hi-en': ['test/newstest2014-hien-src.hi.sgm', 'test/newstest2014-hien-ref.en.sgm',], - 'en-ru': ['test/newstest2014-ruen-src.en.sgm', 'test/newstest2014-ruen-ref.ru.sgm',], - 'ru-en': ['test/newstest2014-ruen-src.ru.sgm', 'test/newstest2014-ruen-ref.en.sgm',], + 'cs-en': ['test/newstest2014-csen-src.cs.sgm', 'test/newstest2014-csen-ref.en.sgm'], + 'en-cs': ['test/newstest2014-csen-src.en.sgm', 'test/newstest2014-csen-ref.cs.sgm'], + 'de-en': ['test/newstest2014-deen-src.de.sgm', 'test/newstest2014-deen-ref.en.sgm'], + 'en-de': ['test/newstest2014-deen-src.en.sgm', 'test/newstest2014-deen-ref.de.sgm'], + 'en-fr': ['test/newstest2014-fren-src.en.sgm', 'test/newstest2014-fren-ref.fr.sgm'], + 'fr-en': ['test/newstest2014-fren-src.fr.sgm', 'test/newstest2014-fren-ref.en.sgm'], + 'en-hi': ['test/newstest2014-hien-src.en.sgm', 'test/newstest2014-hien-ref.hi.sgm'], + 'hi-en': ['test/newstest2014-hien-src.hi.sgm', 'test/newstest2014-hien-ref.en.sgm'], + 'en-ru': ['test/newstest2014-ruen-src.en.sgm', 'test/newstest2014-ruen-ref.ru.sgm'], + 'ru-en': ['test/newstest2014-ruen-src.ru.sgm', 'test/newstest2014-ruen-ref.en.sgm'], }, 'wmt14/full': { 'data': ['http://statmt.org/wmt14/test-full.tgz'], 'md5': ['a8cd784e006feb32ac6f3d9ec7eb389a'], 'description': 'Evaluation data released after official evaluation for further research.', - 'cs-en': ['test-full/newstest2014-csen-src.cs.sgm', 'test-full/newstest2014-csen-ref.en.sgm',], - 'en-cs': ['test-full/newstest2014-csen-src.en.sgm', 
'test-full/newstest2014-csen-ref.cs.sgm',], - 'de-en': ['test-full/newstest2014-deen-src.de.sgm', 'test-full/newstest2014-deen-ref.en.sgm',], - 'en-de': ['test-full/newstest2014-deen-src.en.sgm', 'test-full/newstest2014-deen-ref.de.sgm',], - 'en-fr': ['test-full/newstest2014-fren-src.en.sgm', 'test-full/newstest2014-fren-ref.fr.sgm',], - 'fr-en': ['test-full/newstest2014-fren-src.fr.sgm', 'test-full/newstest2014-fren-ref.en.sgm',], - 'en-hi': ['test-full/newstest2014-hien-src.en.sgm', 'test-full/newstest2014-hien-ref.hi.sgm',], - 'hi-en': ['test-full/newstest2014-hien-src.hi.sgm', 'test-full/newstest2014-hien-ref.en.sgm',], - 'en-ru': ['test-full/newstest2014-ruen-src.en.sgm', 'test-full/newstest2014-ruen-ref.ru.sgm',], - 'ru-en': ['test-full/newstest2014-ruen-src.ru.sgm', 'test-full/newstest2014-ruen-ref.en.sgm',], + 'cs-en': ['test-full/newstest2014-csen-src.cs.sgm', 'test-full/newstest2014-csen-ref.en.sgm'], + 'en-cs': ['test-full/newstest2014-csen-src.en.sgm', 'test-full/newstest2014-csen-ref.cs.sgm'], + 'de-en': ['test-full/newstest2014-deen-src.de.sgm', 'test-full/newstest2014-deen-ref.en.sgm'], + 'en-de': ['test-full/newstest2014-deen-src.en.sgm', 'test-full/newstest2014-deen-ref.de.sgm'], + 'en-fr': ['test-full/newstest2014-fren-src.en.sgm', 'test-full/newstest2014-fren-ref.fr.sgm'], + 'fr-en': ['test-full/newstest2014-fren-src.fr.sgm', 'test-full/newstest2014-fren-ref.en.sgm'], + 'en-hi': ['test-full/newstest2014-hien-src.en.sgm', 'test-full/newstest2014-hien-ref.hi.sgm'], + 'hi-en': ['test-full/newstest2014-hien-src.hi.sgm', 'test-full/newstest2014-hien-ref.en.sgm'], + 'en-ru': ['test-full/newstest2014-ruen-src.en.sgm', 'test-full/newstest2014-ruen-ref.ru.sgm'], + 'ru-en': ['test-full/newstest2014-ruen-src.ru.sgm', 'test-full/newstest2014-ruen-ref.en.sgm'], }, 'wmt13': { 'data': ['http://statmt.org/wmt13/test.tgz'], @@ -448,16 +448,16 @@ '\n month = {August},\n year = {2013},\n address = {Sofia, Bulgaria},\n publisher ' '= {Association for Computational Linguistics},\n pages = {1--44},\n url = {' 'http://www.aclweb.org/anthology/W13-2201}\n}', - 'cs-en': ['test/newstest2013-src.cs.sgm', 'test/newstest2013-src.en.sgm',], - 'en-cs': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.cs.sgm',], - 'de-en': ['test/newstest2013-src.de.sgm', 'test/newstest2013-src.en.sgm',], - 'en-de': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.de.sgm',], - 'es-en': ['test/newstest2013-src.es.sgm', 'test/newstest2013-src.en.sgm',], - 'en-es': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.es.sgm',], - 'fr-en': ['test/newstest2013-src.fr.sgm', 'test/newstest2013-src.en.sgm',], - 'en-fr': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.fr.sgm',], - 'ru-en': ['test/newstest2013-src.ru.sgm', 'test/newstest2013-src.en.sgm',], - 'en-ru': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.ru.sgm',], + 'cs-en': ['test/newstest2013-src.cs.sgm', 'test/newstest2013-src.en.sgm'], + 'en-cs': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.cs.sgm'], + 'de-en': ['test/newstest2013-src.de.sgm', 'test/newstest2013-src.en.sgm'], + 'en-de': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.de.sgm'], + 'es-en': ['test/newstest2013-src.es.sgm', 'test/newstest2013-src.en.sgm'], + 'en-es': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.es.sgm'], + 'fr-en': ['test/newstest2013-src.fr.sgm', 'test/newstest2013-src.en.sgm'], + 'en-fr': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.fr.sgm'], + 'ru-en': ['test/newstest2013-src.ru.sgm', 
'test/newstest2013-src.en.sgm'], + 'en-ru': ['test/newstest2013-src.en.sgm', 'test/newstest2013-src.ru.sgm'], }, 'wmt12': { 'data': ['http://statmt.org/wmt12/test.tgz'], @@ -470,14 +470,14 @@ '\n month = {June},\n year = {2012},\n address = {Montr{\'e}al, Canada},' '\n publisher = {Association for Computational Linguistics},\n pages = {10--51},' '\n url = {http://www.aclweb.org/anthology/W12-3102}\n}', - 'cs-en': ['test/newstest2012-src.cs.sgm', 'test/newstest2012-src.en.sgm',], - 'en-cs': ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.cs.sgm',], - 'de-en': ['test/newstest2012-src.de.sgm', 'test/newstest2012-src.en.sgm',], - 'en-de': ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.de.sgm',], - 'es-en': ['test/newstest2012-src.es.sgm', 'test/newstest2012-src.en.sgm',], - 'en-es': ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.es.sgm',], - 'fr-en': ['test/newstest2012-src.fr.sgm', 'test/newstest2012-src.en.sgm',], - 'en-fr': ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.fr.sgm',], + 'cs-en': ['test/newstest2012-src.cs.sgm', 'test/newstest2012-src.en.sgm'], + 'en-cs': ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.cs.sgm'], + 'de-en': ['test/newstest2012-src.de.sgm', 'test/newstest2012-src.en.sgm'], + 'en-de': ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.de.sgm'], + 'es-en': ['test/newstest2012-src.es.sgm', 'test/newstest2012-src.en.sgm'], + 'en-es': ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.es.sgm'], + 'fr-en': ['test/newstest2012-src.fr.sgm', 'test/newstest2012-src.en.sgm'], + 'en-fr': ['test/newstest2012-src.en.sgm', 'test/newstest2012-src.fr.sgm'], }, 'wmt11': { 'data': ['http://statmt.org/wmt11/test.tgz'], @@ -510,14 +510,14 @@ '\n address = {Uppsala, Sweden},\n publisher = {Association for Computational Linguistics},' '\n pages = {17--53},\n note = {Revised August 2010},\n url = {' 'http://www.aclweb.org/anthology/W10-1703}\n}', - 'cs-en': ['test/newstest2010-src.cz.sgm', 'test/newstest2010-src.en.sgm',], - 'en-cs': ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.cz.sgm',], - 'de-en': ['test/newstest2010-src.de.sgm', 'test/newstest2010-src.en.sgm',], - 'en-de': ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.de.sgm',], - 'es-en': ['test/newstest2010-src.es.sgm', 'test/newstest2010-src.en.sgm',], - 'en-es': ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.es.sgm',], - 'fr-en': ['test/newstest2010-src.fr.sgm', 'test/newstest2010-src.en.sgm',], - 'en-fr': ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.fr.sgm',], + 'cs-en': ['test/newstest2010-src.cz.sgm', 'test/newstest2010-src.en.sgm'], + 'en-cs': ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.cz.sgm'], + 'de-en': ['test/newstest2010-src.de.sgm', 'test/newstest2010-src.en.sgm'], + 'en-de': ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.de.sgm'], + 'es-en': ['test/newstest2010-src.es.sgm', 'test/newstest2010-src.en.sgm'], + 'en-es': ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.es.sgm'], + 'fr-en': ['test/newstest2010-src.fr.sgm', 'test/newstest2010-src.en.sgm'], + 'en-fr': ['test/newstest2010-src.en.sgm', 'test/newstest2010-src.fr.sgm'], }, 'wmt09': { 'data': ['http://statmt.org/wmt09/test.tgz'], @@ -530,18 +530,18 @@ '2009},\n address = {Athens, Greece},\n publisher = {Association for Computational ' 'Linguistics},\n pages = {1--28},\n url = {' 'http://www.aclweb.org/anthology/W/W09/W09-0401}\n}', - 'cs-en': ['test/newstest2009-src.cz.sgm', 'test/newstest2009-src.en.sgm',], - 'en-cs': 
['test/newstest2009-src.en.sgm', 'test/newstest2009-src.cz.sgm',], - 'de-en': ['test/newstest2009-src.de.sgm', 'test/newstest2009-src.en.sgm',], - 'en-de': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.de.sgm',], - 'es-en': ['test/newstest2009-src.es.sgm', 'test/newstest2009-src.en.sgm',], - 'en-es': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.es.sgm',], - 'fr-en': ['test/newstest2009-src.fr.sgm', 'test/newstest2009-src.en.sgm',], - 'en-fr': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.fr.sgm',], - 'hu-en': ['test/newstest2009-src.hu.sgm', 'test/newstest2009-src.en.sgm',], - 'en-hu': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.hu.sgm',], - 'it-en': ['test/newstest2009-src.it.sgm', 'test/newstest2009-src.en.sgm',], - 'en-it': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.it.sgm',], + 'cs-en': ['test/newstest2009-src.cz.sgm', 'test/newstest2009-src.en.sgm'], + 'en-cs': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.cz.sgm'], + 'de-en': ['test/newstest2009-src.de.sgm', 'test/newstest2009-src.en.sgm'], + 'en-de': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.de.sgm'], + 'es-en': ['test/newstest2009-src.es.sgm', 'test/newstest2009-src.en.sgm'], + 'en-es': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.es.sgm'], + 'fr-en': ['test/newstest2009-src.fr.sgm', 'test/newstest2009-src.en.sgm'], + 'en-fr': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.fr.sgm'], + 'hu-en': ['test/newstest2009-src.hu.sgm', 'test/newstest2009-src.en.sgm'], + 'en-hu': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.hu.sgm'], + 'it-en': ['test/newstest2009-src.it.sgm', 'test/newstest2009-src.en.sgm'], + 'en-it': ['test/newstest2009-src.en.sgm', 'test/newstest2009-src.it.sgm'], }, 'wmt08': { 'data': ['http://statmt.org/wmt08/test.tgz'], @@ -553,23 +553,23 @@ 'Workshop on Statistical Machine Translation},\n month = {June},\n year = {2008},' '\n address = {Columbus, Ohio},\n publisher = {Association for Computational Linguistics},' '\n pages = {70--106},\n url = {http://www.aclweb.org/anthology/W/W08/W08-0309}\n}', - 'cs-en': ['test/newstest2008-src.cz.sgm', 'test/newstest2008-src.en.sgm',], - 'en-cs': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.cz.sgm',], - 'de-en': ['test/newstest2008-src.de.sgm', 'test/newstest2008-src.en.sgm',], - 'en-de': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.de.sgm',], - 'es-en': ['test/newstest2008-src.es.sgm', 'test/newstest2008-src.en.sgm',], - 'en-es': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.es.sgm',], - 'fr-en': ['test/newstest2008-src.fr.sgm', 'test/newstest2008-src.en.sgm',], - 'en-fr': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.fr.sgm',], - 'hu-en': ['test/newstest2008-src.hu.sgm', 'test/newstest2008-src.en.sgm',], - 'en-hu': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.hu.sgm',], + 'cs-en': ['test/newstest2008-src.cz.sgm', 'test/newstest2008-src.en.sgm'], + 'en-cs': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.cz.sgm'], + 'de-en': ['test/newstest2008-src.de.sgm', 'test/newstest2008-src.en.sgm'], + 'en-de': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.de.sgm'], + 'es-en': ['test/newstest2008-src.es.sgm', 'test/newstest2008-src.en.sgm'], + 'en-es': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.es.sgm'], + 'fr-en': ['test/newstest2008-src.fr.sgm', 'test/newstest2008-src.en.sgm'], + 'en-fr': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.fr.sgm'], + 'hu-en': ['test/newstest2008-src.hu.sgm', 
'test/newstest2008-src.en.sgm'], + 'en-hu': ['test/newstest2008-src.en.sgm', 'test/newstest2008-src.hu.sgm'], }, 'wmt08/nc': { 'data': ['http://statmt.org/wmt08/test.tgz'], 'md5': ['0582e4e894a3342044059c894e1aea3d'], 'description': 'Official evaluation data (news commentary).', - 'cs-en': ['test/nc-test2008-src.cz.sgm', 'test/nc-test2008-src.en.sgm',], - 'en-cs': ['test/nc-test2008-src.en.sgm', 'test/nc-test2008-src.cz.sgm',], + 'cs-en': ['test/nc-test2008-src.cz.sgm', 'test/nc-test2008-src.en.sgm'], + 'en-cs': ['test/nc-test2008-src.en.sgm', 'test/nc-test2008-src.cz.sgm'], }, 'wmt08/europarl': { 'data': ['http://statmt.org/wmt08/test.tgz'], @@ -618,12 +618,12 @@ '\n booktitle = {14th International Workshop on Spoken Language Translation},\n month = {' 'December},\n year = {2017},\n address = {Tokyo, Japan},\n pages = {2--14},' '\n url = {http://workshop2017.iwslt.org/downloads/iwslt2017_proceeding_v2.pdf}\n}', - 'en-fr': ['en-fr/IWSLT17.TED.tst2017.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2017.fr-en.fr.xml',], - 'fr-en': ['fr-en/IWSLT17.TED.tst2017.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2017.en-fr.en.xml',], - 'en-de': ['en-de/IWSLT17.TED.tst2017.en-de.en.xml', 'de-en/IWSLT17.TED.tst2017.de-en.de.xml',], - 'de-en': ['de-en/IWSLT17.TED.tst2017.de-en.de.xml', 'en-de/IWSLT17.TED.tst2017.en-de.en.xml',], - 'en-zh': ['en-zh/IWSLT17.TED.tst2017.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2017.zh-en.zh.xml',], - 'zh-en': ['zh-en/IWSLT17.TED.tst2017.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2017.en-zh.en.xml',], + 'en-fr': ['en-fr/IWSLT17.TED.tst2017.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2017.fr-en.fr.xml'], + 'fr-en': ['fr-en/IWSLT17.TED.tst2017.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2017.en-fr.en.xml'], + 'en-de': ['en-de/IWSLT17.TED.tst2017.en-de.en.xml', 'de-en/IWSLT17.TED.tst2017.de-en.de.xml'], + 'de-en': ['de-en/IWSLT17.TED.tst2017.de-en.de.xml', 'en-de/IWSLT17.TED.tst2017.en-de.en.xml'], + 'en-zh': ['en-zh/IWSLT17.TED.tst2017.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2017.zh-en.zh.xml'], + 'zh-en': ['zh-en/IWSLT17.TED.tst2017.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2017.en-zh.en.xml'], }, 'iwslt17/tst2016': { 'data': [ @@ -643,12 +643,12 @@ "cc51d9b7fe1ff2af858c6a0dd80b8815", ], 'description': 'Development data for IWSLT 2017.', - 'en-fr': ['en-fr/IWSLT17.TED.tst2016.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2016.fr-en.fr.xml',], - 'fr-en': ['fr-en/IWSLT17.TED.tst2016.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2016.en-fr.en.xml',], - 'en-de': ['en-de/IWSLT17.TED.tst2016.en-de.en.xml', 'de-en/IWSLT17.TED.tst2016.de-en.de.xml',], - 'de-en': ['de-en/IWSLT17.TED.tst2016.de-en.de.xml', 'en-de/IWSLT17.TED.tst2016.en-de.en.xml',], - 'en-zh': ['en-zh/IWSLT17.TED.tst2016.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2016.zh-en.zh.xml',], - 'zh-en': ['zh-en/IWSLT17.TED.tst2016.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2016.en-zh.en.xml',], + 'en-fr': ['en-fr/IWSLT17.TED.tst2016.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2016.fr-en.fr.xml'], + 'fr-en': ['fr-en/IWSLT17.TED.tst2016.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2016.en-fr.en.xml'], + 'en-de': ['en-de/IWSLT17.TED.tst2016.en-de.en.xml', 'de-en/IWSLT17.TED.tst2016.de-en.de.xml'], + 'de-en': ['de-en/IWSLT17.TED.tst2016.de-en.de.xml', 'en-de/IWSLT17.TED.tst2016.en-de.en.xml'], + 'en-zh': ['en-zh/IWSLT17.TED.tst2016.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2016.zh-en.zh.xml'], + 'zh-en': ['zh-en/IWSLT17.TED.tst2016.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2016.en-zh.en.xml'], }, 'iwslt17/tst2015': { 'data': [ @@ -668,12 +668,12 @@ "1c0ae40171d52593df8a6963d3828116", ], 'description': 'Development data for IWSLT 
2017.', - 'en-fr': ['en-fr/IWSLT17.TED.tst2015.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2015.fr-en.fr.xml',], - 'fr-en': ['fr-en/IWSLT17.TED.tst2015.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2015.en-fr.en.xml',], - 'en-de': ['en-de/IWSLT17.TED.tst2015.en-de.en.xml', 'de-en/IWSLT17.TED.tst2015.de-en.de.xml',], - 'de-en': ['de-en/IWSLT17.TED.tst2015.de-en.de.xml', 'en-de/IWSLT17.TED.tst2015.en-de.en.xml',], - 'en-zh': ['en-zh/IWSLT17.TED.tst2015.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2015.zh-en.zh.xml',], - 'zh-en': ['zh-en/IWSLT17.TED.tst2015.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2015.en-zh.en.xml',], + 'en-fr': ['en-fr/IWSLT17.TED.tst2015.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2015.fr-en.fr.xml'], + 'fr-en': ['fr-en/IWSLT17.TED.tst2015.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2015.en-fr.en.xml'], + 'en-de': ['en-de/IWSLT17.TED.tst2015.en-de.en.xml', 'de-en/IWSLT17.TED.tst2015.de-en.de.xml'], + 'de-en': ['de-en/IWSLT17.TED.tst2015.de-en.de.xml', 'en-de/IWSLT17.TED.tst2015.en-de.en.xml'], + 'en-zh': ['en-zh/IWSLT17.TED.tst2015.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2015.zh-en.zh.xml'], + 'zh-en': ['zh-en/IWSLT17.TED.tst2015.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2015.en-zh.en.xml'], }, 'iwslt17/tst2014': { 'data': [ @@ -693,12 +693,12 @@ "1c0ae40171d52593df8a6963d3828116", ], 'description': 'Development data for IWSLT 2017.', - 'en-fr': ['en-fr/IWSLT17.TED.tst2014.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2014.fr-en.fr.xml',], - 'fr-en': ['fr-en/IWSLT17.TED.tst2014.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2014.en-fr.en.xml',], - 'en-de': ['en-de/IWSLT17.TED.tst2014.en-de.en.xml', 'de-en/IWSLT17.TED.tst2014.de-en.de.xml',], - 'de-en': ['de-en/IWSLT17.TED.tst2014.de-en.de.xml', 'en-de/IWSLT17.TED.tst2014.en-de.en.xml',], - 'en-zh': ['en-zh/IWSLT17.TED.tst2014.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2014.zh-en.zh.xml',], - 'zh-en': ['zh-en/IWSLT17.TED.tst2014.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2014.en-zh.en.xml',], + 'en-fr': ['en-fr/IWSLT17.TED.tst2014.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2014.fr-en.fr.xml'], + 'fr-en': ['fr-en/IWSLT17.TED.tst2014.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2014.en-fr.en.xml'], + 'en-de': ['en-de/IWSLT17.TED.tst2014.en-de.en.xml', 'de-en/IWSLT17.TED.tst2014.de-en.de.xml'], + 'de-en': ['de-en/IWSLT17.TED.tst2014.de-en.de.xml', 'en-de/IWSLT17.TED.tst2014.en-de.en.xml'], + 'en-zh': ['en-zh/IWSLT17.TED.tst2014.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2014.zh-en.zh.xml'], + 'zh-en': ['zh-en/IWSLT17.TED.tst2014.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2014.en-zh.en.xml'], }, 'iwslt17/tst2013': { 'data': [ @@ -718,12 +718,12 @@ "1c0ae40171d52593df8a6963d3828116", ], 'description': 'Development data for IWSLT 2017.', - 'en-fr': ['en-fr/IWSLT17.TED.tst2013.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2013.fr-en.fr.xml',], - 'fr-en': ['fr-en/IWSLT17.TED.tst2013.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2013.en-fr.en.xml',], - 'en-de': ['en-de/IWSLT17.TED.tst2013.en-de.en.xml', 'de-en/IWSLT17.TED.tst2013.de-en.de.xml',], - 'de-en': ['de-en/IWSLT17.TED.tst2013.de-en.de.xml', 'en-de/IWSLT17.TED.tst2013.en-de.en.xml',], - 'en-zh': ['en-zh/IWSLT17.TED.tst2013.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2013.zh-en.zh.xml',], - 'zh-en': ['zh-en/IWSLT17.TED.tst2013.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2013.en-zh.en.xml',], + 'en-fr': ['en-fr/IWSLT17.TED.tst2013.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2013.fr-en.fr.xml'], + 'fr-en': ['fr-en/IWSLT17.TED.tst2013.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2013.en-fr.en.xml'], + 'en-de': ['en-de/IWSLT17.TED.tst2013.en-de.en.xml', 'de-en/IWSLT17.TED.tst2013.de-en.de.xml'], + 'de-en': 
['de-en/IWSLT17.TED.tst2013.de-en.de.xml', 'en-de/IWSLT17.TED.tst2013.en-de.en.xml'], + 'en-zh': ['en-zh/IWSLT17.TED.tst2013.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2013.zh-en.zh.xml'], + 'zh-en': ['zh-en/IWSLT17.TED.tst2013.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2013.en-zh.en.xml'], }, 'iwslt17/tst2012': { 'data': [ @@ -743,12 +743,12 @@ "1c0ae40171d52593df8a6963d3828116", ], 'description': 'Development data for IWSLT 2017.', - 'en-fr': ['en-fr/IWSLT17.TED.tst2012.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2012.fr-en.fr.xml',], - 'fr-en': ['fr-en/IWSLT17.TED.tst2012.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2012.en-fr.en.xml',], - 'en-de': ['en-de/IWSLT17.TED.tst2012.en-de.en.xml', 'de-en/IWSLT17.TED.tst2012.de-en.de.xml',], - 'de-en': ['de-en/IWSLT17.TED.tst2012.de-en.de.xml', 'en-de/IWSLT17.TED.tst2012.en-de.en.xml',], - 'en-zh': ['en-zh/IWSLT17.TED.tst2012.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2012.zh-en.zh.xml',], - 'zh-en': ['zh-en/IWSLT17.TED.tst2012.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2012.en-zh.en.xml',], + 'en-fr': ['en-fr/IWSLT17.TED.tst2012.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2012.fr-en.fr.xml'], + 'fr-en': ['fr-en/IWSLT17.TED.tst2012.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2012.en-fr.en.xml'], + 'en-de': ['en-de/IWSLT17.TED.tst2012.en-de.en.xml', 'de-en/IWSLT17.TED.tst2012.de-en.de.xml'], + 'de-en': ['de-en/IWSLT17.TED.tst2012.de-en.de.xml', 'en-de/IWSLT17.TED.tst2012.en-de.en.xml'], + 'en-zh': ['en-zh/IWSLT17.TED.tst2012.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2012.zh-en.zh.xml'], + 'zh-en': ['zh-en/IWSLT17.TED.tst2012.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2012.en-zh.en.xml'], }, 'iwslt17/tst2011': { 'data': [ @@ -768,12 +768,12 @@ "1c0ae40171d52593df8a6963d3828116", ], 'description': 'Development data for IWSLT 2017.', - 'en-fr': ['en-fr/IWSLT17.TED.tst2011.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2011.fr-en.fr.xml',], - 'fr-en': ['fr-en/IWSLT17.TED.tst2011.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2011.en-fr.en.xml',], - 'en-de': ['en-de/IWSLT17.TED.tst2011.en-de.en.xml', 'de-en/IWSLT17.TED.tst2011.de-en.de.xml',], - 'de-en': ['de-en/IWSLT17.TED.tst2011.de-en.de.xml', 'en-de/IWSLT17.TED.tst2011.en-de.en.xml',], - 'en-zh': ['en-zh/IWSLT17.TED.tst2011.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2011.zh-en.zh.xml',], - 'zh-en': ['zh-en/IWSLT17.TED.tst2011.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2011.en-zh.en.xml',], + 'en-fr': ['en-fr/IWSLT17.TED.tst2011.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2011.fr-en.fr.xml'], + 'fr-en': ['fr-en/IWSLT17.TED.tst2011.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2011.en-fr.en.xml'], + 'en-de': ['en-de/IWSLT17.TED.tst2011.en-de.en.xml', 'de-en/IWSLT17.TED.tst2011.de-en.de.xml'], + 'de-en': ['de-en/IWSLT17.TED.tst2011.de-en.de.xml', 'en-de/IWSLT17.TED.tst2011.en-de.en.xml'], + 'en-zh': ['en-zh/IWSLT17.TED.tst2011.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2011.zh-en.zh.xml'], + 'zh-en': ['zh-en/IWSLT17.TED.tst2011.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2011.en-zh.en.xml'], }, 'iwslt17/tst2010': { 'data': [ @@ -793,12 +793,12 @@ "1c0ae40171d52593df8a6963d3828116", ], 'description': 'Development data for IWSLT 2017.', - 'en-fr': ['en-fr/IWSLT17.TED.tst2010.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2010.fr-en.fr.xml',], - 'fr-en': ['fr-en/IWSLT17.TED.tst2010.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2010.en-fr.en.xml',], - 'en-de': ['en-de/IWSLT17.TED.tst2010.en-de.en.xml', 'de-en/IWSLT17.TED.tst2010.de-en.de.xml',], - 'de-en': ['de-en/IWSLT17.TED.tst2010.de-en.de.xml', 'en-de/IWSLT17.TED.tst2010.en-de.en.xml',], - 'en-zh': ['en-zh/IWSLT17.TED.tst2010.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2010.zh-en.zh.xml',], - 
'zh-en': ['zh-en/IWSLT17.TED.tst2010.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2010.en-zh.en.xml',], + 'en-fr': ['en-fr/IWSLT17.TED.tst2010.en-fr.en.xml', 'fr-en/IWSLT17.TED.tst2010.fr-en.fr.xml'], + 'fr-en': ['fr-en/IWSLT17.TED.tst2010.fr-en.fr.xml', 'en-fr/IWSLT17.TED.tst2010.en-fr.en.xml'], + 'en-de': ['en-de/IWSLT17.TED.tst2010.en-de.en.xml', 'de-en/IWSLT17.TED.tst2010.de-en.de.xml'], + 'de-en': ['de-en/IWSLT17.TED.tst2010.de-en.de.xml', 'en-de/IWSLT17.TED.tst2010.en-de.en.xml'], + 'en-zh': ['en-zh/IWSLT17.TED.tst2010.en-zh.en.xml', 'zh-en/IWSLT17.TED.tst2010.zh-en.zh.xml'], + 'zh-en': ['zh-en/IWSLT17.TED.tst2010.zh-en.zh.xml', 'en-zh/IWSLT17.TED.tst2010.en-zh.en.xml'], }, 'iwslt17/dev2010': { 'data': [ @@ -818,12 +818,12 @@ "1c0ae40171d52593df8a6963d3828116", ], 'description': 'Development data for IWSLT 2017.', - 'en-fr': ['en-fr/IWSLT17.TED.dev2010.en-fr.en.xml', 'fr-en/IWSLT17.TED.dev2010.fr-en.fr.xml',], - 'fr-en': ['fr-en/IWSLT17.TED.dev2010.fr-en.fr.xml', 'en-fr/IWSLT17.TED.dev2010.en-fr.en.xml',], - 'en-de': ['en-de/IWSLT17.TED.dev2010.en-de.en.xml', 'de-en/IWSLT17.TED.dev2010.de-en.de.xml',], - 'de-en': ['de-en/IWSLT17.TED.dev2010.de-en.de.xml', 'en-de/IWSLT17.TED.dev2010.en-de.en.xml',], - 'en-zh': ['en-zh/IWSLT17.TED.dev2010.en-zh.en.xml', 'zh-en/IWSLT17.TED.dev2010.zh-en.zh.xml',], - 'zh-en': ['zh-en/IWSLT17.TED.dev2010.zh-en.zh.xml', 'en-zh/IWSLT17.TED.dev2010.en-zh.en.xml',], + 'en-fr': ['en-fr/IWSLT17.TED.dev2010.en-fr.en.xml', 'fr-en/IWSLT17.TED.dev2010.fr-en.fr.xml'], + 'fr-en': ['fr-en/IWSLT17.TED.dev2010.fr-en.fr.xml', 'en-fr/IWSLT17.TED.dev2010.en-fr.en.xml'], + 'en-de': ['en-de/IWSLT17.TED.dev2010.en-de.en.xml', 'de-en/IWSLT17.TED.dev2010.de-en.de.xml'], + 'de-en': ['de-en/IWSLT17.TED.dev2010.de-en.de.xml', 'en-de/IWSLT17.TED.dev2010.en-de.en.xml'], + 'en-zh': ['en-zh/IWSLT17.TED.dev2010.en-zh.en.xml', 'zh-en/IWSLT17.TED.dev2010.zh-en.zh.xml'], + 'zh-en': ['zh-en/IWSLT17.TED.dev2010.zh-en.zh.xml', 'en-zh/IWSLT17.TED.dev2010.en-zh.en.xml'], }, } @@ -1087,15 +1087,7 @@ def bleu_signature(args, numrefs): """ # Abbreviations for the signature - abbr = { - 'test': 't', - 'lang': 'l', - 'smooth': 's', - 'case': 'c', - 'tok': 'tok', - 'numrefs': '#', - 'version': 'v', - } + abbr = {'test': 't', 'lang': 'l', 'smooth': 's', 'case': 'c', 'tok': 'tok', 'numrefs': '#', 'version': 'v'} signature = { 'tok': args.tokenize, @@ -1124,15 +1116,7 @@ def chrf_signature(args, numrefs): """ # Abbreviations for the signature - abbr = { - 'test': 't', - 'lang': 'l', - 'numchars': 'n', - 'space': 's', - 'case': 'c', - 'numrefs': '#', - 'version': 'v', - } + abbr = {'test': 't', 'lang': 'l', 'numchars': 'n', 'space': 's', 'case': 'c', 'numrefs': '#', 'version': 'v'} signature = { 'tok': args.tokenize, @@ -1225,24 +1209,20 @@ def process_to_text(rawfile, txtfile, field: int = None): with smart_open(rawfile) as fin, smart_open(txtfile, 'wt') as fout: for line in fin: if line.startswith('<seg '): - print( - _clean(re.sub(r'<seg.*?>(.*)</seg>.*?', '\\1', line)), file=fout, - ) + logging.info(_clean(re.sub(r'<seg.*?>(.*)</seg>.*?', '\\1', line)), file=fout) elif rawfile.endswith('.xml'): # IWSLT with smart_open(rawfile) as fin, smart_open(txtfile, 'wt') as fout: for line in fin: if line.startswith('<seg '): - print( - _clean(re.sub(r'<seg.*?>(.*)</seg>.*?', '\\1', line)), file=fout, - ) + logging.info(_clean(re.sub(r'<seg.*?>(.*)</seg>.*?', '\\1', line)), file=fout) elif rawfile.endswith('.txt'): # wmt17/ms with smart_open(rawfile) as fin, smart_open(txtfile, 'wt') as fout: for line in fin: - print(line.rstrip(), file=fout) + logging.info(line.rstrip(), file=fout) elif rawfile.endswith('.tsv'): # MTNT with smart_open(rawfile)
as fin, smart_open(txtfile, 'wt') as fout: for line in fin: - print(line.rstrip().split('\t')[field], file=fout) + logging.info(line.rstrip().split('\t')[field], file=fout) def print_test_set(test_set, langpair, side): @@ -1260,7 +1240,7 @@ def print_test_set(test_set, langpair, side): streams = [smart_open(file) for file in files] for lines in zip(*streams): - print('\t'.join(map(lambda x: x.rstrip(), lines))) + logging.info('\t'.join(map(lambda x: x.rstrip(), lines))) def download_test_set(test_set, langpair=None): @@ -1586,7 +1566,7 @@ def delete_whitespace(text: str) -> str: def get_sentence_statistics( - hypothesis: str, reference: str, order: int = CHRF_ORDER, remove_whitespace: bool = True, + hypothesis: str, reference: str, order: int = CHRF_ORDER, remove_whitespace: bool = True ) -> List[float]: hypothesis = delete_whitespace(hypothesis) if remove_whitespace else hypothesis reference = delete_whitespace(reference) if remove_whitespace else reference @@ -1603,11 +1583,11 @@ def get_sentence_statistics( def get_corpus_statistics( - hypotheses: Iterable[str], references: Iterable[str], order: int = CHRF_ORDER, remove_whitespace: bool = True, + hypotheses: Iterable[str], references: Iterable[str], order: int = CHRF_ORDER, remove_whitespace: bool = True ) -> List[float]: corpus_statistics = [0] * (order * 3) for hypothesis, reference in zip(hypotheses, references): - statistics = get_sentence_statistics(hypothesis, reference, order=order, remove_whitespace=remove_whitespace,) + statistics = get_sentence_statistics(hypothesis, reference, order=order, remove_whitespace=remove_whitespace) for i in range(len(statistics)): corpus_statistics[i] += statistics[i] return corpus_statistics @@ -1656,15 +1636,13 @@ def corpus_chrf( :param beta: Defines importance of recall w.r.t precision. If beta=1, same importance. :return: Chrf score. """ - corpus_statistics = get_corpus_statistics( - hypotheses, references, order=order, remove_whitespace=remove_whitespace, - ) + corpus_statistics = get_corpus_statistics(hypotheses, references, order=order, remove_whitespace=remove_whitespace) avg_precision, avg_recall = _avg_precision_and_recall(corpus_statistics, order) return _chrf(avg_precision, avg_recall, beta=beta) def sentence_chrf( - hypothesis: str, reference: str, order: int = CHRF_ORDER, beta: float = CHRF_BETA, remove_whitespace: bool = True, + hypothesis: str, reference: str, order: int = CHRF_ORDER, beta: float = CHRF_BETA, remove_whitespace: bool = True ) -> float: """ Computes ChrF on a single sentence pair. @@ -1688,10 +1666,10 @@ def main(): ' cat output.detok.de | ./sacreBLEU -t wmt14 -l en-de' ) arg_parser.add_argument( - '--test-set', '-t', type=str, default=None, choices=DATASETS.keys(), help='the test set to use', + '--test-set', '-t', type=str, default=None, choices=DATASETS.keys(), help='the test set to use' ) arg_parser.add_argument( - '-lc', action='store_true', default=False, help='use case-insensitive BLEU (default: actual case)', + '-lc', action='store_true', default=False, help='use case-insensitive BLEU (default: actual case)' ) arg_parser.add_argument( '--smooth', @@ -1709,7 +1687,7 @@ def main(): help='The value to pass to the smoothing technique, when relevant. ' 'Default: %(default)s. 
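# A small usage sketch, not part of the diff above: the chrF helpers whose call sites are being
# reformatted here score hypotheses against references; upstream sacrebleu defaults CHRF_ORDER to 6
# and CHRF_BETA to 2. The import path assumes the module location this patch moves the file to.
from nemo.collections.nlp.metrics.sacrebleu import corpus_chrf, sentence_chrf

hyps = ['the cat sat on the mat']
refs = ['the cat is on the mat']
corpus_score = corpus_chrf(hyps, refs)            # corpus-level chrF over all pairs
sentence_score = sentence_chrf(hyps[0], refs[0])  # chrF for a single hypothesis/reference pair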
', ) arg_parser.add_argument( - '--tokenize', '-tok', choices=TOKENIZERS.keys(), default=None, help='tokenization method to use', + '--tokenize', '-tok', choices=TOKENIZERS.keys(), default=None, help='tokenization method to use' ) arg_parser.add_argument( '--language-pair', @@ -1718,9 +1696,7 @@ def main(): default=None, help='source-target language pair (2-char ISO639-1 codes)', ) - arg_parser.add_argument( - '--download', type=str, default=None, help='download a test set and quit', - ) + arg_parser.add_argument('--download', type=str, default=None, help='download a test set and quit') arg_parser.add_argument( '--echo', choices=['src', 'ref', 'both'], @@ -1728,9 +1704,7 @@ def main(): default=None, help='output the source (src), reference (ref), or both (both, ' 'pasted) to STDOUT and quit ', ) - arg_parser.add_argument( - '--input', '-i', type=str, default='-', help='Read input from a file instead of STDIN', - ) + arg_parser.add_argument('--input', '-i', type=str, default='-', help='Read input from a file instead of STDIN') arg_parser.add_argument( 'refs', nargs='*', @@ -1746,10 +1720,10 @@ def main(): help='metrics to compute (default: bleu)', ) arg_parser.add_argument( - '--chrf-order', type=int, default=CHRF_ORDER, help='chrf character order (default: %(default)s)', + '--chrf-order', type=int, default=CHRF_ORDER, help='chrf character order (default: %(default)s)' ) arg_parser.add_argument( - '--chrf-beta', type=int, default=CHRF_BETA, help='chrf BETA parameter (default: %(default)s)', + '--chrf-beta', type=int, default=CHRF_BETA, help='chrf BETA parameter (default: %(default)s)' ) arg_parser.add_argument( '--chrf-whitespace', @@ -1758,17 +1732,15 @@ def main(): help='include whitespace in chrF calculation (default: %(default)s)', ) arg_parser.add_argument( - '--short', default=False, action='store_true', help='produce a shorter (less human readable) signature', + '--short', default=False, action='store_true', help='produce a shorter (less human readable) signature' ) arg_parser.add_argument( - '--score-only', '-b', default=False, action='store_true', help='output only the BLEU score', + '--score-only', '-b', default=False, action='store_true', help='output only the BLEU score' ) arg_parser.add_argument( - '--force', default=False, action='store_true', help='insist that your tokenized input is actually detokenized', - ) - arg_parser.add_argument( - '--quiet', '-q', default=False, action='store_true', help='suppress informative output', + '--force', default=False, action='store_true', help='insist that your tokenized input is actually detokenized' ) + arg_parser.add_argument('--quiet', '-q', default=False, action='store_true', help='suppress informative output') arg_parser.add_argument( '--encoding', '-e', @@ -1777,18 +1749,14 @@ def main(): help='open text files with specified encoding (default: %(default)s)', ) arg_parser.add_argument( - '--citation', '--cite', default=False, action='store_true', help='dump the bibtex citation and quit.', - ) - arg_parser.add_argument( - '--width', '-w', type=int, default=1, help='floating point width (default: %(default)s)', - ) - arg_parser.add_argument( - '-V', '--version', action='version', version='%(prog)s {}'.format(VERSION), + '--citation', '--cite', default=False, action='store_true', help='dump the bibtex citation and quit.' 
) + arg_parser.add_argument('--width', '-w', type=int, default=1, help='floating point width (default: %(default)s)') + arg_parser.add_argument('-V', '--version', action='version', version='%(prog)s {}'.format(VERSION)) args = arg_parser.parse_args() # Explicitly set the encoding - sys.stdin = open(sys.stdin.fileno(), mode='r', encoding='utf-8', buffering=True, newline="\n",) + sys.stdin = open(sys.stdin.fileno(), mode='r', encoding='utf-8', buffering=True, newline="\n") sys.stdout = open(sys.stdout.fileno(), mode='w', encoding='utf-8', buffering=True) if not args.quiet: @@ -1806,7 +1774,7 @@ def main(): logging.error('No citation found for %s', args.test_set) sys.exit(1) - print(DATASETS[args.test_set]['citation']) + logging.info(DATASETS[args.test_set]['citation']) sys.exit(0) if args.test_set is not None and args.test_set not in DATASETS: @@ -1871,7 +1839,7 @@ def main(): if args.test_set: _, *refs = download_test_set(args.test_set, args.langpair) if len(refs) == 0: - print('No references found for test set {}/{}.'.format(args.test_set, args.langpair)) + logging.info('No references found for test set {}/{}.'.format(args.test_set, args.langpair)) sys.exit(1) else: refs = args.refs @@ -1899,11 +1867,7 @@ def main(): ) if 'chrf' in args.metrics: chrf = corpus_chrf( - system, - refs[0], - beta=args.chrf_beta, - order=args.chrf_order, - remove_whitespace=not args.chrf_whitespace, + system, refs[0], beta=args.chrf_beta, order=args.chrf_order, remove_whitespace=not args.chrf_whitespace ) except EOFError: logging.error('The input and reference stream(s) were of different lengths.\n') @@ -1927,17 +1891,17 @@ def main(): for metric in args.metrics: if metric == 'bleu': if args.score_only: - print('{0:.{1}f}'.format(bleu.score, width)) + logging.info('{0:.{1}f}'.format(bleu.score, width)) else: version_str = bleu_signature(args, len(refs)) - print(bleu.format(width).replace('BLEU', 'BLEU+' + version_str)) + logging.info(bleu.format(width).replace('BLEU', 'BLEU+' + version_str)) elif metric == 'chrf': if args.score_only: - print('{0:.{1}f}'.format(chrf, width)) + logging.info('{0:.{1}f}'.format(chrf, width)) else: version_str = chrf_signature(args, len(refs)) - print('chrF{0:d}+{1} = {2:.{3}f}'.format(args.chrf_beta, version_str, chrf, width)) + logging.info('chrF{0:d}+{1} = {2:.{3}f}'.format(args.chrf_beta, version_str, chrf, width)) if __name__ == '__main__': diff --git a/nemo/collections/nlp/utils/metrics/squad_metrics.py b/nemo/collections/nlp/metrics/squad_metrics.py similarity index 85% rename from nemo/collections/nlp/utils/metrics/squad_metrics.py rename to nemo/collections/nlp/metrics/squad_metrics.py index 13eb29de1931..e5f0af1e2517 100644 --- a/nemo/collections/nlp/utils/metrics/squad_metrics.py +++ b/nemo/collections/nlp/metrics/squad_metrics.py @@ -15,13 +15,27 @@ See the License for the specific language governing permissions and limitations under the License. 
""" + import collections -import math -import re -import string from transformers.tokenization_bert import BasicTokenizer +from nemo import logging +from nemo.collections.nlp.data.datasets.datasets_utils import get_tokens, normalize_answer + +__all__ = [ + 'f1_score', + 'exact_match_score', + 'apply_no_ans_threshold', + 'make_eval_dict', + 'merge_eval', + 'find_all_best_thresh', + 'find_best_thresh', + 'normalize_answer', + '_get_best_indexes', + 'get_final_text', +] + def _get_best_indexes(logits, n_best_size): """Get the n-best logits from a list.""" @@ -35,74 +49,6 @@ def _get_best_indexes(logits, n_best_size): return best_indexes -def _compute_softmax(scores): - """Compute softmax probability over raw logits.""" - if not scores: - return [] - - max_score = None - for score in scores: - if max_score is None or score > max_score: - max_score = score - - exp_scores = [] - total_sum = 0.0 - for score in scores: - x = math.exp(score - max_score) - exp_scores.append(x) - total_sum += x - - probs = [] - for score in exp_scores: - probs.append(score / total_sum) - return probs - - -def get_tokens(s): - if not s: - return [] - return normalize_answer(s).split() - - -def f1_score(prediction, ground_truth): - prediction_tokens = get_tokens(prediction) - ground_truth_tokens = get_tokens(ground_truth) - common = collections.Counter(prediction_tokens) & collections.Counter(ground_truth_tokens) - num_same = sum(common.values()) - if len(ground_truth_tokens) == 0 or len(prediction_tokens) == 0: - # If either is no-answer, then F1 is 1 if they agree, 0 otherwise - return int(ground_truth_tokens == prediction_tokens) - if num_same == 0: - return 0 - precision = 1.0 * num_same / len(prediction_tokens) - recall = 1.0 * num_same / len(ground_truth_tokens) - f1 = (2 * precision * recall) / (precision + recall) - return f1 - - -def exact_match_score(prediction, ground_truth): - return int(normalize_answer(prediction) == normalize_answer(ground_truth)) - - -def normalize_answer(s): - """Lower text and remove punctuation, articles and extra whitespace.""" - - def remove_articles(text): - return re.sub(r'\b(a|an|the)\b', ' ', text) - - def white_space_fix(text): - return ' '.join(text.split()) - - def remove_punc(text): - exclude = set(string.punctuation) - return ''.join(ch for ch in text if ch not in exclude) - - def lower(text): - return text.lower() - - return white_space_fix(remove_articles(remove_punc(lower(s)))) - - def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): """Project the tokenized prediction back to the original text.""" @@ -154,7 +100,7 @@ def _strip_spaces(text): start_position = tok_text.find(pred_text) if start_position == -1: if verbose_logging: - print("Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + logging.warning("Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) return orig_text end_position = start_position + len(pred_text) - 1 @@ -163,7 +109,7 @@ def _strip_spaces(text): if len(orig_ns_text) != len(tok_ns_text): if verbose_logging: - print( + logging.warning( "Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text, ) return orig_text @@ -182,7 +128,7 @@ def _strip_spaces(text): if orig_start_position is None: if verbose_logging: - print("Couldn't map start position") + logging.warning("Couldn't map start position") return orig_text orig_end_position = None @@ -193,13 +139,33 @@ def _strip_spaces(text): if orig_end_position is None: if verbose_logging: - print("Couldn't map end position") + 
logging.warning("Couldn't map end position") return orig_text output_text = orig_text[orig_start_position : (orig_end_position + 1)] return output_text +def f1_score(prediction, ground_truth): + prediction_tokens = get_tokens(prediction) + ground_truth_tokens = get_tokens(ground_truth) + common = collections.Counter(prediction_tokens) & collections.Counter(ground_truth_tokens) + num_same = sum(common.values()) + if len(ground_truth_tokens) == 0 or len(prediction_tokens) == 0: + # If either is no-answer, then F1 is 1 if they agree, 0 otherwise + return int(ground_truth_tokens == prediction_tokens) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def exact_match_score(prediction, ground_truth): + return int(normalize_answer(prediction) == normalize_answer(ground_truth)) + + def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): new_scores = {} for qid, s in scores.items(): @@ -225,7 +191,7 @@ def make_eval_dict(exact_scores, f1_scores, qid_list=None): total = len(qid_list) return collections.OrderedDict( [ - ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total,), + ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total), ("f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total), ("total", total), ] diff --git a/nemo/collections/nlp/modules/__init__.py b/nemo/collections/nlp/modules/__init__.py deleted file mode 100644 index 97328a8b6cbf..000000000000 --- a/nemo/collections/nlp/modules/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .classifiers import * -from .losses import * -from .transformer_nm import * diff --git a/nemo/collections/nlp/modules/classifiers.py b/nemo/collections/nlp/modules/classifiers.py deleted file mode 100644 index 0d6259cdd31a..000000000000 --- a/nemo/collections/nlp/modules/classifiers.py +++ /dev/null @@ -1,363 +0,0 @@ -__all__ = [ - 'TokenClassifier', - 'BertTokenClassifier', - 'SequenceClassifier', - 'JointIntentSlotClassifier', - 'SequenceRegression', -] - -import torch.nn as nn - -from ..transformer.utils import transformer_weights_init -from nemo.backends.pytorch.common import MultiLayerPerceptron -from nemo.backends.pytorch.nm import LossNM, TrainableNM -from nemo.collections.nlp.transformer.utils import gelu -from nemo.core.neural_types import * - -ACT2FN = {"gelu": gelu, "relu": nn.functional.relu} - - -class BertTokenClassifier(TrainableNM): - """ - Neural module which consists of MLP followed by softmax classifier for each - token in the sequence. - - Args: - hidden_size (int): hidden size (d_model) of the Transformer - num_classes (int): number of classes in softmax classifier, e.g. size - of the vocabulary in language modeling objective - num_layers (int): number of layers in classifier MLP - activation (str): activation function applied in classifier MLP layers - log_softmax (bool): whether to apply log_softmax to MLP output - dropout (float): dropout ratio applied to MLP - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - hidden_states: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - """ - return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} - - @property - def output_ports(self): - """Returns definitions of module output ports. 
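# A minimal usage sketch, not part of the patch: the relocated SQuAD metrics above compare a
# prediction to a ground-truth answer after normalize_answer lower-cases the text and strips
# punctuation and articles; the expected values below follow directly from that definition.
from nemo.collections.nlp.metrics.squad_metrics import exact_match_score, f1_score

assert exact_match_score('The Cat sat.', 'the cat sat') == 1    # identical after normalization
assert abs(f1_score('black cat sat', 'cat sat') - 0.8) < 1e-6   # precision 2/3, recall 2/2 -> F1 0.8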
- - logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - """ - return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} - - def __init__( - self, - hidden_size, - num_classes, - activation='relu', - log_softmax=True, - dropout=0.0, - use_transformer_pretrained=True, - ): - super().__init__() - if activation not in ACT2FN: - raise ValueError(f'activation "{activation}" not found') - self.dense = nn.Linear(hidden_size, hidden_size) - self.act = ACT2FN[activation] - self.norm = nn.LayerNorm(hidden_size, eps=1e-12) - self.mlp = MultiLayerPerceptron( - hidden_size, num_classes, self._device, num_layers=1, activation=activation, log_softmax=log_softmax, - ) - self.dropout = nn.Dropout(dropout) - if use_transformer_pretrained: - self.apply(lambda module: transformer_weights_init(module, xavier=False)) - self.to(self._device) - - def forward(self, hidden_states): - hidden_states = self.dropout(hidden_states) - hidden_states = self.dense(hidden_states) - hidden_states = self.act(hidden_states) - transform = self.norm(hidden_states) - logits = self.mlp(transform) - return logits - - -class TokenClassifier(TrainableNM): - """ - Neural module which consists of MLP followed by softmax classifier for each - token in the sequence. - - Args: - hidden_size (int): hidden size (d_model) of the Transformer - num_classes (int): number of classes in softmax classifier, e.g. size - of the vocabulary in language modeling objective - num_layers (int): number of layers in classifier MLP - activation (str): activation function applied in classifier MLP layers - log_softmax (bool): whether to apply log_softmax to MLP output - dropout (float): dropout ratio applied to MLP - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - hidden_states: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - """ - return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} - - @property - def output_ports(self): - """Returns definitions of module output ports. - - logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - """ - return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} - - def __init__( - self, - hidden_size, - num_classes, - name=None, - num_layers=2, - activation='relu', - log_softmax=True, - dropout=0.0, - use_transformer_pretrained=True, - ): - super().__init__() - - self.name = name - self.mlp = MultiLayerPerceptron(hidden_size, num_classes, self._device, num_layers, activation, log_softmax,) - self.dropout = nn.Dropout(dropout) - if use_transformer_pretrained: - self.apply(lambda module: transformer_weights_init(module, xavier=False)) - # self.to(self._device) # sometimes this is necessary - - def __str__(self): - name = TrainableNM.__str__(self) - - if self.name: - name = self.name + name - return name - - def forward(self, hidden_states): - hidden_states = self.dropout(hidden_states) - logits = self.mlp(hidden_states) - return logits - - -class SequenceClassifier(TrainableNM): - """ - Neural module which consists of MLP followed by softmax classifier for each - sequence in the batch. - - Args: - hidden_size (int): hidden size (d_model) of the Transformer - num_classes (int): number of classes in softmax classifier, e.g. 
number - of different sentiments - num_layers (int): number of layers in classifier MLP - activation (str): activation function applied in classifier MLP layers - log_softmax (bool): whether to apply log_softmax to MLP output - dropout (float): dropout ratio applied to MLP - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - hidden_states: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - """ - return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} - - @property - def output_ports(self): - """Returns definitions of module output ports. - - logits: - 0: AxisType(BatchTag) - - 1: AxisType(ChannelTag) - """ - return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)})} - - def __init__( - self, - hidden_size, - num_classes, - num_layers=2, - activation='relu', - log_softmax=True, - dropout=0.0, - use_transformer_pretrained=True, - ): - super().__init__() - self.mlp = MultiLayerPerceptron(hidden_size, num_classes, self._device, num_layers, activation, log_softmax,) - self.dropout = nn.Dropout(dropout) - if use_transformer_pretrained: - self.apply(lambda module: transformer_weights_init(module, xavier=False)) - # self.to(self._device) # sometimes this is necessary - - def forward(self, hidden_states, idx_conditioned_on=0): - hidden_states = self.dropout(hidden_states) - logits = self.mlp(hidden_states[:, idx_conditioned_on]) - return logits - - -class JointIntentSlotClassifier(TrainableNM): - """ - The softmax classifier for the joint intent classification and slot - filling task which consists of a dense layer + relu + softmax for - predicting the slots and similar for predicting the intents. - - Args: - hidden_size (int): the size of the hidden state for the dense layer - num_intents (int): number of intents - num_slots (int): number of slots - dropout (float): dropout to be applied to the layer - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - hidden_states: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - """ - return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} - - @property - def output_ports(self): - """Returns definitions of module output ports. 
- - intent_logits: - 0: AxisType(BatchTag) - - 1: AxisType(ChannelTag) - - slot_logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - """ - return { - "intent_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), - "slot_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), - } - - def __init__(self, hidden_size, num_intents, num_slots, dropout=0.0, use_transformer_pretrained=True): - super().__init__() - self.dropout = nn.Dropout(dropout) - self.slot_mlp = MultiLayerPerceptron( - hidden_size, - num_classes=num_slots, - device=self._device, - num_layers=2, - activation='relu', - log_softmax=False, - ) - self.intent_mlp = MultiLayerPerceptron( - hidden_size, - num_classes=num_intents, - device=self._device, - num_layers=2, - activation='relu', - log_softmax=False, - ) - if use_transformer_pretrained: - self.apply(lambda module: transformer_weights_init(module, xavier=False)) - # self.to(self._device) - - def forward(self, hidden_states): - hidden_states = self.dropout(hidden_states) - intent_logits = self.intent_mlp(hidden_states[:, 0]) - slot_logits = self.slot_mlp(hidden_states) - return intent_logits, slot_logits - - -class SequenceRegression(TrainableNM): - """ - Neural module which consists of MLP, generates a single number prediction - that could be used for a regression task. An example of this task would be - semantic textual similatity task, for example, STS-B (from GLUE tasks). - - Args: - hidden_size (int): the size of the hidden state for the dense layer - num_layers (int): number of layers in classifier MLP - activation (str): activation function applied in classifier MLP layers - dropout (float): dropout ratio applied to MLP - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - hidden_states: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - """ - return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} - - @property - def output_ports(self): - """Returns definitions of module output ports. 
- - preds: - 0: AxisType(RegressionTag) - """ - return { - "preds": NeuralType({0: AxisType(RegressionTag)}), - } - - def __init__( - self, hidden_size, num_layers=2, activation='relu', dropout=0.0, use_transformer_pretrained=True, - ): - super().__init__() - self.mlp = MultiLayerPerceptron( - hidden_size, - num_classes=1, - device=self._device, - num_layers=num_layers, - activation=activation, - log_softmax=False, - ) - self.dropout = nn.Dropout(dropout) - if use_transformer_pretrained: - self.apply(lambda module: transformer_weights_init(module, xavier=False)) - # self.to(self._device) # sometimes this is necessary - - def forward(self, hidden_states, idx_conditioned_on=0): - hidden_states = self.dropout(hidden_states) - preds = self.mlp(hidden_states[:, idx_conditioned_on]) - return preds.view(-1) diff --git a/nemo/collections/nlp/modules/losses.py b/nemo/collections/nlp/modules/losses.py deleted file mode 100644 index 34912f609fa4..000000000000 --- a/nemo/collections/nlp/modules/losses.py +++ /dev/null @@ -1,422 +0,0 @@ -import torch -from torch import nn - -from ..utils.nlp_utils import mask_padded_tokens -from .pytorch_utils import SmoothedCrossEntropyLoss -from nemo.backends.pytorch.nm import LossNM -from nemo.core.neural_types import * - -__all__ = [ - 'JointIntentSlotLoss', - 'LossAggregatorNM', - 'MaskedLanguageModelingLossNM', - 'PaddedSmoothedCrossEntropyLossNM', - 'QuestionAnsweringLoss', - 'TokenClassificationLoss', -] - - -class QuestionAnsweringLoss(LossNM): - """ - Neural module which implements QuestionAnswering loss. - Args: - logits: Output of question answering head, which is a token classfier. - start_positions: Ground truth start positions of the answer w.r.t. - input sequence. If question is unanswerable, this will be - pointing to start token, e.g. [CLS], of the input sequence. - end_positions: Ground truth end positions of the answer w.r.t. - input sequence. If question is unanswerable, this will be - pointing to start token, e.g. [CLS], of the input sequence. - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - - start_positions: - 0: AxisType(BatchTag) - - end_positions: - 0: AxisType(BatchTag) - """ - return { - "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), - "start_positions": NeuralType({0: AxisType(BatchTag)}), - "end_positions": NeuralType({0: AxisType(BatchTag)}), - } - - @property - def output_ports(self): - """Returns definitions of module output ports. 
- - loss: - NeuralType(None) - - start_logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - end_logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - """ - return { - "loss": NeuralType(None), - "start_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "end_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - def __init__(self): - super().__init__() - - def _loss_function(self, **kwargs): - logits = kwargs['logits'] - start_positions = kwargs['start_positions'] - end_positions = kwargs['end_positions'] - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - ignored_index = start_logits.size(1) - start_positions.clamp_(0, ignored_index) - end_positions.clamp_(0, ignored_index) - - loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 - return total_loss, start_logits, end_logits - - -class MaskedLanguageModelingLossNM(LossNM): - """ - Neural module which implements Masked Language Modeling (MLM) loss. - - Args: - label_smoothing (float): label smoothing regularization coefficient - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - - output_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - output_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - """ - return { - "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), - "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - @property - def output_ports(self): - """Returns definitions of module output ports. - - loss: - NeuralType(None) - """ - return {"loss": NeuralType(None)} - - def __init__(self, label_smoothing=0.0): - super().__init__() - self._criterion = SmoothedCrossEntropyLoss(label_smoothing) - - def _loss_function(self, logits, output_ids, output_mask): - loss = self._criterion(logits, output_ids, output_mask) - return loss - - -class LossAggregatorNM(LossNM): - """ - Neural module which combines sums several losses into one. - - Args: - num_inputs (int): number of input losses - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - """ - input_ports = {} - for i in range(self.num_losses): - input_ports["loss_" + str(i + 1)] = NeuralType(None) - - return input_ports - - @property - def output_ports(self): - """Returns definitions of module output ports. - - loss: - NeuralType(None) - """ - return {"loss": NeuralType(None)} - - def __init__(self, num_inputs=2): - super().__init__() - # Store number of inputs/losses. - self.num_losses = num_inputs - - def _loss_function(self, **kwargs): - values = [kwargs[x] for x in sorted(kwargs.keys())] - loss = values[0] - for loss_i in values[1:]: - loss = loss.add(loss_i) - return loss - - -class TokenClassificationLoss(LossNM): - """ - Neural module which implements Token Classification loss. - - Args: - num_classes (int): number of classes in a classifier, e.g. 
size - of the vocabulary in language modeling objective - logits (float): output of the classifier - labels (long): ground truth labels - loss_mask (long): to differentiate from original tokens and paddings - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - - labels: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - loss_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - """ - return { - "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), - "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - @property - def output_ports(self): - """Returns definitions of module output ports. - - loss: - NeuralType(None) - """ - return {"loss": NeuralType(None)} - - def __init__(self, num_classes, class_weights=None): - super().__init__() - if class_weights: - class_weights = torch.FloatTensor(class_weights).to(self._device) - - self._criterion = nn.CrossEntropyLoss(weight=class_weights) - self.num_classes = num_classes - - def _loss_function(self, logits, labels, loss_mask): - active_loss = loss_mask.view(-1) > 0.5 - active_logits = logits.view(-1, self.num_classes)[active_loss] - active_labels = labels.view(-1)[active_loss] - - loss = self._criterion(active_logits, active_labels) - return loss - - -class JointIntentSlotLoss(LossNM): - """ - Loss function for the joint intent classification and slot - filling task. - - The loss is a joint loss of both tasks, aim to maximize: - p(y^i | x)P(y^s1, y^s2, ..., y^sn | x) - - with y^i being the predicted intent and y^s1, y^s2, ..., y^sn - are the predicted slots corresponding to x1, x2, ..., xn. - - Args: - hidden_states: output of the hidden layers - intents: ground truth intents, - slots: ground truth slots. - input_mask: to differentiate from original tokens and paddings - intent_loss_weight: the loss is the sum of: - intent_loss_weight * intent_loss + - (1 - intent_loss_weight) * slot_loss - - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - intent_logits: - 0: AxisType(BatchTag) - - 1: AxisType(ChannelTag) - - slot_logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - - loss_mask: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - intents: - 0: AxisType(BatchTag) - - slots: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - """ - return { - "intent_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), - "slot_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), - "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "intents": NeuralType({0: AxisType(BatchTag),}), - "slots": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - @property - def output_ports(self): - """Returns definitions of module output ports. 
- - loss: - NeuralType(None) - """ - return {"loss": NeuralType(None)} - - def __init__( - self, num_slots, slot_classes_loss_weights=None, intent_classes_loss_weights=None, intent_loss_weight=0.6 - ): - super().__init__() - self.num_slots = num_slots - self.intent_loss_weight = intent_loss_weight - self.slot_classes_loss_weights = slot_classes_loss_weights - self.intent_classes_loss_weights = intent_classes_loss_weights - - # For weighted loss to tackle class imbalance - if slot_classes_loss_weights: - self.slot_classes_loss_weights = torch.FloatTensor(slot_classes_loss_weights).to(self._device) - - if intent_classes_loss_weights: - self.intent_classes_loss_weights = torch.FloatTensor(intent_classes_loss_weights).to(self._device) - - self._criterion_intent = nn.CrossEntropyLoss(weight=self.intent_classes_loss_weights) - self._criterion_slot = nn.CrossEntropyLoss(weight=self.slot_classes_loss_weights) - - def _loss_function(self, intent_logits, slot_logits, loss_mask, intents, slots): - intent_loss = self._criterion_intent(intent_logits, intents) - - active_loss = loss_mask.view(-1) > 0.5 - active_logits = slot_logits.view(-1, self.num_slots)[active_loss] - active_labels = slots.view(-1)[active_loss] - - # To support empty active_labels - if len(active_labels) == 0: - slot_loss = 0.0 - else: - slot_loss = self._criterion_slot(active_logits, active_labels) - loss = intent_loss * self.intent_loss_weight + slot_loss * (1 - self.intent_loss_weight) - - return loss - - -class PaddedSmoothedCrossEntropyLossNM(LossNM): - """ - Neural module which calculates CrossEntropyLoss and - 1) excludes padding tokens from loss calculation - 2) allows to use label smoothing regularization - 3) allows to calculate loss for the desired number of last tokens - - Args: - label_smoothing (float): label smoothing regularization coefficient - predict_last_k (int): how many last tokens to use for the loss - calculation, important for fast evaluation of LM perplexity - """ - - @property - def input_ports(self): - """Returns definitions of module input ports. - - logits: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - - 2: AxisType(ChannelTag) - - target_ids: - 0: AxisType(BatchTag) - - 1: AxisType(TimeTag) - """ - return { - "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), - "target_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - } - - @property - def output_ports(self): - """Returns definitions of module output ports. - - loss: - NeuralType(None) - """ - return {"loss": NeuralType(None)} - - def __init__(self, pad_id, label_smoothing=0, predict_last_k=0): - super().__init__() - - # Create the loss function object. - loss_params = {"label_smoothing": label_smoothing, "predict_last_k": predict_last_k} - self._loss_fn = SmoothedCrossEntropyLoss(**loss_params) - # Store padding. - self._pad_id = pad_id - - def _loss_function(self, logits, target_ids): - target_mask = mask_padded_tokens(target_ids, self._pad_id).to(logits.dtype) - loss = self._loss_fn(logits, target_ids, target_mask) - return loss diff --git a/nemo/collections/nlp/nm/__init__.py b/nemo/collections/nlp/nm/__init__.py new file mode 100644 index 000000000000..88ccabb8a58a --- /dev/null +++ b/nemo/collections/nlp/nm/__init__.py @@ -0,0 +1,19 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import nemo.collections.nlp.nm.data_layers +import nemo.collections.nlp.nm.losses +import nemo.collections.nlp.nm.trainables diff --git a/nemo/collections/nlp/nm/data_layers/__init__.py b/nemo/collections/nlp/nm/data_layers/__init__.py new file mode 100644 index 000000000000..897974506fae --- /dev/null +++ b/nemo/collections/nlp/nm/data_layers/__init__.py @@ -0,0 +1,26 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.nm.data_layers.glue_benchmark_datalayer import * +from nemo.collections.nlp.nm.data_layers.joint_intent_slot_datalayer import * +from nemo.collections.nlp.nm.data_layers.lm_bert_datalayer import * +from nemo.collections.nlp.nm.data_layers.lm_transformer_datalayer import * +from nemo.collections.nlp.nm.data_layers.machine_translation_datalayer import * +from nemo.collections.nlp.nm.data_layers.punctuation_capitalization_datalayer import * +from nemo.collections.nlp.nm.data_layers.qa_squad_datalayer import * +from nemo.collections.nlp.nm.data_layers.text_classification_datalayer import * +from nemo.collections.nlp.nm.data_layers.text_datalayer import * +from nemo.collections.nlp.nm.data_layers.token_classification_datalayer import * diff --git a/nemo/collections/nlp/nm/data_layers/glue_benchmark_datalayer.py b/nemo/collections/nlp/nm/data_layers/glue_benchmark_datalayer.py new file mode 100644 index 000000000000..baf55f55c047 --- /dev/null +++ b/nemo/collections/nlp/nm/data_layers/glue_benchmark_datalayer.py @@ -0,0 +1,152 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data import GLUEDataset +from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer +from nemo.core import AxisType, BatchTag, CategoricalTag, NeuralType, RegressionTag, TimeTag + +__all__ = ['GlueClassificationDataLayer', 'GlueRegressionDataLayer'] + + +class GlueClassificationDataLayer(TextDataLayer): + """ + Creates the data layer to use for the GLUE classification tasks, + more details here: https://gluebenchmark.com/tasks + + All the data processing is done in GLUEDataset. + + Args: + dataset_type (GLUEDataset): + the dataset that needs to be converted to DataLayerNM + """ + + @property + def output_ports(self): + """Returns definitions of module output ports. + + input_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_type_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + labels: + 0: AxisType(CategoricalTag) + """ + return { + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "labels": NeuralType({0: AxisType(CategoricalTag)}), + } + + def __init__( + self, + data_dir, + tokenizer, + max_seq_length, + processor, + evaluate=False, + token_params={}, + shuffle=False, + batch_size=64, + dataset_type=GLUEDataset, + ): + dataset_params = { + 'data_dir': data_dir, + 'output_mode': 'classification', + 'processor': processor, + 'evaluate': evaluate, + 'token_params': token_params, + 'tokenizer': tokenizer, + 'max_seq_length': max_seq_length, + } + super().__init__(dataset_type, dataset_params, batch_size, shuffle) + + +class GlueRegressionDataLayer(TextDataLayer): + """ + Creates the data layer to use for the GLUE STS-B regression task, + more details here: https://gluebenchmark.com/tasks + + All the data processing is done in GLUEDataset. + + Args: + dataset_type (GLUEDataset): + the dataset that needs to be converted to DataLayerNM + """ + + @property + def output_ports(self): + """Returns definitions of module output ports. 
+ + input_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_type_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + labels: + 0: AxisType(RegressionTag) + """ + return { + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "labels": NeuralType({0: AxisType(RegressionTag)}), + } + + def __init__( + self, + data_dir, + tokenizer, + max_seq_length, + processor, + evaluate=False, + token_params={}, + shuffle=False, + batch_size=64, + dataset_type=GLUEDataset, + ): + dataset_params = { + 'data_dir': data_dir, + 'output_mode': 'regression', + 'processor': processor, + 'evaluate': evaluate, + 'token_params': token_params, + 'tokenizer': tokenizer, + 'max_seq_length': max_seq_length, + } + + super().__init__(dataset_type, dataset_params, batch_size, shuffle) diff --git a/nemo/collections/nlp/nm/data_layers/joint_intent_slot_datalayer.py b/nemo/collections/nlp/nm/data_layers/joint_intent_slot_datalayer.py new file mode 100644 index 000000000000..354be6b32a5f --- /dev/null +++ b/nemo/collections/nlp/nm/data_layers/joint_intent_slot_datalayer.py @@ -0,0 +1,177 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data import BertJointIntentSlotDataset, BertJointIntentSlotInferDataset +from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer +from nemo.core import AxisType, BatchTag, NeuralType, TimeTag + +__all__ = ['BertJointIntentSlotDataLayer', 'BertJointIntentSlotInferDataLayer'] + + +class BertJointIntentSlotDataLayer(TextDataLayer): + """ + Creates the data layer to use for the task of joint intent + and slot classification with pretrained model. + + All the data processing is done in BertJointIntentSlotDataset. + + input_mask: used to ignore some of the input tokens like paddings + + loss_mask: used to mask and ignore tokens in the loss function + + subtokens_mask: used to ignore the outputs of unwanted tokens in + the inference and evaluation like the start and end tokens + + Args: + dataset (BertJointIntentSlotDataset): + the dataset that needs to be converted to DataLayerNM + """ + + @property + def output_ports(self): + """Returns definitions of module output ports. 
+
+        input_ids:
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        input_type_ids:
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        input_mask:
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        loss_mask:
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        subtokens_mask:
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        intents:
+            0: AxisType(BatchTag)
+
+        slots:
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+        """
+        return {
+            "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "intents": NeuralType({0: AxisType(BatchTag)}),
+            "slots": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+        }
+
+    def __init__(
+        self,
+        input_file,
+        slot_file,
+        pad_label,
+        tokenizer,
+        max_seq_length,
+        num_samples=-1,
+        shuffle=False,
+        batch_size=64,
+        ignore_extra_tokens=False,
+        ignore_start_end=False,
+        dataset_type=BertJointIntentSlotDataset,
+    ):
+        dataset_params = {
+            'input_file': input_file,
+            'slot_file': slot_file,
+            'pad_label': pad_label,
+            'tokenizer': tokenizer,
+            'max_seq_length': max_seq_length,
+            'num_samples': num_samples,
+            'shuffle': shuffle,
+            'ignore_extra_tokens': ignore_extra_tokens,
+            'ignore_start_end': ignore_start_end,
+        }
+        super().__init__(dataset_type, dataset_params, batch_size, shuffle)
+
+
+class BertJointIntentSlotInferDataLayer(TextDataLayer):
+    """
+    Creates the data layer to use for the task of joint intent
+    and slot classification with a pretrained model. This data layer
+    is intended for inference.
+
+    All the data processing is done in BertJointIntentSlotInferDataset.
+
+    input_mask: used to ignore some of the input tokens like paddings
+
+    loss_mask: used to mask and ignore tokens in the loss function
+
+    subtokens_mask: used to ignore the outputs of unwanted tokens in
+    the inference and evaluation like the start and end tokens
+
+    Args:
+        dataset (BertJointIntentSlotInferDataset):
+            the dataset that needs to be converted to DataLayerNM
+    """
+
+    @property
+    def output_ports(self):
+        """Returns definitions of module output ports.
+
+        input_ids:
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        input_type_ids:
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        input_mask:
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        loss_mask:
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        subtokens_mask:
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        """
+        return {
+            "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+        }
+
+    def __init__(self, queries, tokenizer, max_seq_length, batch_size=1, dataset_type=BertJointIntentSlotInferDataset):
+        dataset_params = {'queries': queries, 'tokenizer': tokenizer, 'max_seq_length': max_seq_length}
+        super().__init__(dataset_type, dataset_params, batch_size, shuffle=False)
diff --git a/nemo/collections/nlp/nm/data_layers/lm_bert_datalayer.py b/nemo/collections/nlp/nm/data_layers/lm_bert_datalayer.py
new file mode 100644
index 000000000000..7034c7c18c38
--- /dev/null
+++ b/nemo/collections/nlp/nm/data_layers/lm_bert_datalayer.py
@@ -0,0 +1,225 @@
+# =============================================================================
+# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+import os
+import random
+
+import h5py
+import numpy as np
+import torch
+from torch.utils import data as pt_data
+
+from nemo.backends.pytorch import DataLayerNM
+from nemo.collections.nlp.data import BertPretrainingDataset, BertPretrainingPreprocessedDataset
+from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer
+from nemo.core import AxisType, BatchTag, NeuralType, TimeTag
+
+__all__ = ['BertPretrainingDataLayer', 'BertPretrainingPreprocessedDataLayer']
+
+
+class BertPretrainingDataLayer(TextDataLayer):
+    """
+    Data layer for masked language modeling task.
+
+    Args:
+        tokenizer (TokenizerSpec): tokenizer
+        dataset (str): directory or a single file with dataset documents
+        max_seq_length (int): maximum allowed length of the text segments
+        mask_probability (float): probability of masking input sequence tokens
+        batch_size (int): batch size in segments
+        short_seq_prob (float): probability of creating sequences which are
+            shorter than the maximum length.
+            Defaults to 0.1.
+    """
+
+    @property
+    def output_ports(self):
+        """Returns definitions of module output ports.
+
+        input_ids: indices of tokens which constitute batches of text segments
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        input_type_ids: indices of token types (e.g., sentences A & B in BERT)
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        input_mask: bool tensor with 0s in place of tokens to be masked
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        output_ids: indices of output tokens which should be predicted
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        output_mask: bool tensor with 0s in place of tokens to be excluded
+            from loss calculation
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        labels: indices of classes to be predicted from [CLS] token of text
+            segments (e.g., 0 or 1 in next sentence prediction task)
+            0: AxisType(BatchTag)
+
+        """
+        return {
+            "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "labels": NeuralType({0: AxisType(BatchTag)}),
+        }
+
+    def __init__(self, tokenizer, dataset, max_seq_length, mask_probability, short_seq_prob=0.1, batch_size=64):
+        dataset_params = {
+            'tokenizer': tokenizer,
+            'dataset': dataset,
+            'max_seq_length': max_seq_length,
+            'mask_probability': mask_probability,
+            'short_seq_prob': short_seq_prob,
+        }
+        super().__init__(BertPretrainingDataset, dataset_params, batch_size, shuffle=False)
+
+
+class BertPretrainingPreprocessedDataLayer(DataLayerNM):
+    """
+    Data layer for the masked language modeling task that reads
+    preprocessed (HDF5) training shards.
+
+    Args:
+        dataset (str): directory with preprocessed data files or a single
+            preprocessed file
+        max_pred_length (int): maximum number of masked tokens to predict
+            per segment
+        batch_size (int): batch size in segments. Defaults to 64.
+        training (bool): whether the data layer is used for training, in
+            which case the data files are shuffled between passes.
+            Defaults to True.
+    """
+
+    @property
+    def output_ports(self):
+        """Returns definitions of module output ports.
+ + input_ids: indices of tokens which constitute batches of text segments + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_type_ids: indices of token types (e.g., sentences A & B in BERT) + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_mask: bool tensor with 0s in place of tokens to be masked + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + output_ids: indices of output tokens which should be predicted + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + output_mask: bool tensor with 0s in place of tokens to be excluded + from loss calculation + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + labels: indices of classes to be predicted from [CLS] token of text + segments (e.g, 0 or 1 in next sentence prediction task) + 0: AxisType(BatchTag) + + """ + return { + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "labels": NeuralType({0: AxisType(BatchTag)}), + } + + def __init__(self, dataset, max_pred_length, batch_size=64, training=True): + + if os.path.isdir(dataset): + self.files = [ + os.path.join(dataset, f) for f in os.listdir(dataset) if os.path.isfile(os.path.join(dataset, f)) + ] + else: + self.files = [dataset] + self.files.sort() + self.num_files = len(self.files) + self._batch_size = batch_size + self.max_pred_length = max_pred_length + self.training = training + total_length = 0 + for f in self.files: + fp = h5py.File(f, 'r') + total_length += len(fp['input_ids']) + fp.close() + self.total_length = total_length + super().__init__() + + def _collate_fn(self, x): + num_components = len(x[0]) + components = [[] for _ in range(num_components)] + batch_size = len(x) + for i in range(batch_size): + for j in range(num_components): + components[j].append(x[i][j]) + src_ids, src_segment_ids, src_mask, tgt_ids, tgt_mask, sent_ids = [np.stack(x, axis=0) for x in components] + src_ids = torch.Tensor(src_ids).long().to(self._device) + src_segment_ids = torch.Tensor(src_segment_ids).long().to(self._device) + src_mask = torch.Tensor(src_mask).long().to(self._device) + tgt_ids = torch.Tensor(tgt_ids).long().to(self._device) + tgt_mask = torch.Tensor(tgt_mask).long().to(self._device) + sent_ids = torch.Tensor(sent_ids).long().to(self._device) + return src_ids, src_segment_ids, src_mask, tgt_ids, tgt_mask, sent_ids + + def __len__(self): + return self.total_length + + @property + def dataset(self): + return None + + @property + def data_iterator(self): + while True: + if self.training: + random.shuffle(self.files) + for f_id in range(self.num_files): + data_file = self.files[f_id] + train_data = BertPretrainingPreprocessedDataset( + input_file=data_file, max_pred_length=self.max_pred_length + ) + train_sampler = pt_data.RandomSampler(train_data) + train_dataloader = pt_data.DataLoader( + dataset=train_data, + batch_size=self._batch_size, + collate_fn=self._collate_fn, + shuffle=False, + sampler=train_sampler, + ) + for x in train_dataloader: + yield x diff --git a/nemo/collections/nlp/nm/data_layers/lm_transformer_datalayer.py b/nemo/collections/nlp/nm/data_layers/lm_transformer_datalayer.py new file mode 100644 index 000000000000..64e79ffea9f1 --- /dev/null +++ b/nemo/collections/nlp/nm/data_layers/lm_transformer_datalayer.py @@ -0,0 +1,72 @@ 
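+# --- Editor's illustrative note (annotation, not part of the original patch) ---
+# The data layers introduced by this refactoring are constructed directly and
+# wired into a NeMo training graph by calling the module. A minimal sketch for
+# the LanguageModelingDataLayer defined below; the tokenizer object and corpus
+# path are hypothetical placeholders, not values confirmed by this patch:
+#
+#     from nemo.collections.nlp.nm.data_layers import LanguageModelingDataLayer
+#
+#     dl = LanguageModelingDataLayer(
+#         dataset='/path/to/train.txt',   # assumed path to a plain-text corpus
+#         tokenizer=tokenizer,            # any TokenizerSpec implementation
+#         max_seq_length=256,
+#         batch_size=32,
+#         batch_step=128,                 # overlap between successive segments
+#     )
+#     # Calling the module is assumed to yield its output ports in order:
+#     input_ids, input_mask, labels = dl()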
+# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data import LanguageModelingDataset +from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer +from nemo.core import AxisType, BatchTag, NeuralType, TimeTag + +__all__ = ['LanguageModelingDataLayer'] + + +class LanguageModelingDataLayer(TextDataLayer): + """ + Data layer for standard language modeling task. + + Args: + dataset (str): path to text document with data + tokenizer (TokenizerSpec): tokenizer + max_seq_length (int): maximum allowed length of the text segments + batch_step (int): how many tokens to skip between two successive + segments of text when constructing batches + """ + + @property + def output_ports(self): + """Returns definitions of module output ports. + + input_ids: indices of tokens which constitute batches of text segments + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_mask: bool tensor with 0s in place of tokens to be masked + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + labels: indices of tokens which should be predicted from each of the + corresponding tokens in input_ids; for left-to-right language + modeling equals to input_ids shifted by 1 to the right + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + """ + return { + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + } + + def __init__( + self, dataset, tokenizer, max_seq_length, batch_size, batch_step=128, dataset_type=LanguageModelingDataset + ): + dataset_params = { + 'dataset': dataset, + 'tokenizer': tokenizer, + 'max_seq_length': max_seq_length, + 'batch_step': batch_step, + } + super().__init__(dataset_type, dataset_params, batch_size, shuffle=False) diff --git a/nemo/collections/nlp/nm/data_layers/machine_translation_datalayer.py b/nemo/collections/nlp/nm/data_layers/machine_translation_datalayer.py new file mode 100644 index 000000000000..23aa1c54e913 --- /dev/null +++ b/nemo/collections/nlp/nm/data_layers/machine_translation_datalayer.py @@ -0,0 +1,137 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import torch +from torch.utils import data as pt_data + +import nemo +from nemo.collections.nlp.data import TranslationDataset +from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer +from nemo.core import AxisType, BatchTag, NeuralType, TimeTag + +__all__ = ['TranslationDataLayer'] + + +class TranslationDataLayer(TextDataLayer): + """ + Data layer for neural machine translation from source (src) language to + target (tgt) language. + + Args: + tokenizer_src (TokenizerSpec): source language tokenizer + tokenizer_tgt (TokenizerSpec): target language tokenizer + dataset_src (str): path to source data + dataset_tgt (str): path to target data + tokens_in_batch (int): maximum allowed number of tokens in batches, + batches will be constructed to minimize the use of tokens + clean (bool): whether to use parallel data cleaning such as removing + pairs with big difference in sentences length, removing pairs with + the same tokens in src and tgt, etc; useful for training data layer + and should not be used in evaluation data layer + """ + + @property + def output_ports(self): + """Returns definitions of module output ports. + + src_ids: indices of tokens which correspond to source sentences + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + src_mask: bool tensor with 0s in place of source tokens to be masked + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + tgt_ids: indices of tokens which correspond to target sentences + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + tgt_mask: bool tensor with 0s in place of target tokens to be masked + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + labels: indices of tokens which should be predicted from each of the + corresponding target tokens in tgt_ids; for standard neural + machine translation equals to tgt_ids shifted by 1 to the right + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + sent_ids: indices of the sentences in a batch; important for + evaluation with external metrics, such as SacreBLEU + 0: AxisType(BatchTag) + + """ + return { + "src_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "src_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "tgt_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "tgt_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "sent_ids": NeuralType({0: AxisType(BatchTag)}), + } + + def __init__( + self, + tokenizer_src, + tokenizer_tgt, + dataset_src, + dataset_tgt, + tokens_in_batch=1024, + shuffle=False, + clean=False, + dataset_type=TranslationDataset, + ): + dataset_params = { + 'tokenizer_src': tokenizer_src, + 'tokenizer_tgt': tokenizer_tgt, + 'dataset_src': dataset_src, + 'dataset_tgt': dataset_tgt, + 'tokens_in_batch': tokens_in_batch, + 'clean': clean, + } + super().__init__(dataset_type, dataset_params, batch_size=1, shuffle=shuffle) + + if self._placement == nemo.core.DeviceType.AllGpu: + sampler = pt_data.distributed.DistributedSampler(self._dataset) + else: + sampler = None + + self._dataloader = pt_data.DataLoader( + dataset=self._dataset, batch_size=1, collate_fn=self._collate_fn, shuffle=sampler is None, sampler=sampler + ) + + def _collate_fn(self, x): + src_ids, src_mask, tgt_ids, tgt_mask, labels, sent_ids = x[0] + src_ids = 
torch.Tensor(src_ids).long().to(self._device) + src_mask = torch.Tensor(src_mask).float().to(self._device) + tgt_ids = torch.Tensor(tgt_ids).long().to(self._device) + tgt_mask = torch.Tensor(tgt_mask).float().to(self._device) + labels = torch.Tensor(labels).long().to(self._device) + sent_ids = torch.Tensor(sent_ids).long().to(self._device) + return src_ids, src_mask, tgt_ids, tgt_mask, labels, sent_ids + + @property + def dataset(self): + return None + + @property + def data_iterator(self): + return self._dataloader diff --git a/nemo/collections/nlp/nm/data_layers/punctuation_capitalization_datalayer.py b/nemo/collections/nlp/nm/data_layers/punctuation_capitalization_datalayer.py new file mode 100644 index 000000000000..41b952827043 --- /dev/null +++ b/nemo/collections/nlp/nm/data_layers/punctuation_capitalization_datalayer.py @@ -0,0 +1,106 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data import BertPunctuationCapitalizationDataset +from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer +from nemo.core import AxisType, BatchTag, NeuralType, TimeTag + +__all__ = ['PunctuationCapitalizationDataLayer'] + + +class PunctuationCapitalizationDataLayer(TextDataLayer): + @property + def output_ports(self): + """Returns definitions of module output ports. 
+ + input_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_type_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + loss_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + subtokens_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + punct_labels: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + capit_labels: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + """ + return { + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "punct_labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "capit_labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + } + + def __init__( + self, + text_file, + label_file, + tokenizer, + max_seq_length, + pad_label='O', + punct_label_ids=None, + capit_label_ids=None, + num_samples=-1, + shuffle=False, + batch_size=64, + ignore_extra_tokens=False, + ignore_start_end=False, + use_cache=False, + dataset_type=BertPunctuationCapitalizationDataset, + ): + dataset_params = { + 'text_file': text_file, + 'label_file': label_file, + 'max_seq_length': max_seq_length, + 'tokenizer': tokenizer, + 'num_samples': num_samples, + 'shuffle': shuffle, + 'pad_label': pad_label, + 'punct_label_ids': punct_label_ids, + 'capit_label_ids': capit_label_ids, + 'ignore_extra_tokens': ignore_extra_tokens, + 'ignore_start_end': ignore_start_end, + 'use_cache': use_cache, + } + super().__init__(dataset_type, dataset_params, batch_size, shuffle) diff --git a/nemo/collections/nlp/nm/data_layers/qa_squad_datalayer.py b/nemo/collections/nlp/nm/data_layers/qa_squad_datalayer.py new file mode 100644 index 000000000000..56d912a35a6d --- /dev/null +++ b/nemo/collections/nlp/nm/data_layers/qa_squad_datalayer.py @@ -0,0 +1,108 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data import SquadDataset +from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer +from nemo.core import AxisType, BatchTag, NeuralType, TimeTag + +__all__ = ['BertQuestionAnsweringDataLayer'] + + +class BertQuestionAnsweringDataLayer(TextDataLayer): + """ + Creates the data layer to use for Question Answering classification task. + + Args: + data_dir (str): Directory that contains train.*.json and dev.*.json. + tokenizer (obj): Tokenizer object, e.g. NemoBertTokenizer. + version_2_with_negative (bool): True if training should allow + unanswerable questions. 
+        doc_stride (int): When splitting up a long document into chunks,
+            how much stride to take between chunks.
+        max_query_length (int): Maximum number of tokens in the question
+            after tokenization. Longer questions are truncated to this
+            length.
+        max_seq_length (int): Maximum total number of tokens in the
+            tokenized question plus context. Longer contexts are split
+            into chunks using `doc_stride`.
+        mode (str): Use "train" or "dev" to switch between
+            training and evaluation.
+        batch_size (int): Batch size. Defaults to 64.
+        dataset_type (class): Question Answering class.
+            Defaults to SquadDataset.
+    """
+
+    @property
+    def output_ports(self):
+        """Returns definitions of module output ports.
+
+        input_ids:
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        input_type_ids:
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        input_mask:
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        start_positions:
+            0: AxisType(BatchTag)
+
+        end_positions:
+            0: AxisType(BatchTag)
+
+        unique_ids:
+            0: AxisType(BatchTag)
+
+        """
+        return {
+            "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "start_positions": NeuralType({0: AxisType(BatchTag)}),
+            "end_positions": NeuralType({0: AxisType(BatchTag)}),
+            "unique_ids": NeuralType({0: AxisType(BatchTag)}),
+        }
+
+    def __init__(
+        self,
+        data_dir,
+        tokenizer,
+        version_2_with_negative,
+        doc_stride,
+        max_query_length,
+        max_seq_length,
+        mode="train",
+        batch_size=64,
+        dataset_type=SquadDataset,
+    ):
+        dataset_params = {
+            'data_dir': data_dir,
+            'mode': mode,
+            'tokenizer': tokenizer,
+            'version_2_with_negative': version_2_with_negative,
+            'max_query_length': max_query_length,
+            'max_seq_length': max_seq_length,
+            'doc_stride': doc_stride,
+        }
+
+        super().__init__(dataset_type, dataset_params, batch_size, shuffle=False)
diff --git a/nemo/collections/nlp/nm/data_layers/text_classification_datalayer.py b/nemo/collections/nlp/nm/data_layers/text_classification_datalayer.py
new file mode 100644
index 000000000000..738144586dd5
--- /dev/null
+++ b/nemo/collections/nlp/nm/data_layers/text_classification_datalayer.py
@@ -0,0 +1,83 @@
+# =============================================================================
+# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+from nemo.collections.nlp.data import BertTextClassificationDataset
+from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer
+from nemo.core import AxisType, BatchTag, NeuralType, TimeTag
+
+__all__ = ['BertSentenceClassificationDataLayer']
+
+
+class BertSentenceClassificationDataLayer(TextDataLayer):
+    """
+    Creates the data layer to use for the task of sentence classification
+    with a pretrained model.
+
+    All the data processing is done in BertTextClassificationDataset.
+
+    Args:
+        dataset (BertTextClassificationDataset):
+            the dataset that needs to be converted to DataLayerNM
+    """
+
+    @property
+    def output_ports(self):
+        """Returns definitions of module output ports.
+
+        input_ids:
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        input_type_ids:
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        input_mask:
+            0: AxisType(BatchTag)
+
+            1: AxisType(TimeTag)
+
+        labels:
+            0: AxisType(BatchTag)
+
+        """
+        return {
+            "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}),
+            "labels": NeuralType({0: AxisType(BatchTag)}),
+        }
+
+    def __init__(
+        self,
+        input_file,
+        tokenizer,
+        max_seq_length,
+        num_samples=-1,
+        shuffle=False,
+        batch_size=64,
+        dataset_type=BertTextClassificationDataset,
+    ):
+        dataset_params = {
+            'input_file': input_file,
+            'tokenizer': tokenizer,
+            'max_seq_length': max_seq_length,
+            'num_samples': num_samples,
+            'shuffle': shuffle,
+        }
+        super().__init__(dataset_type, dataset_params, batch_size, shuffle)
diff --git a/nemo/collections/nlp/nm/data_layers/text_datalayer.py b/nemo/collections/nlp/nm/data_layers/text_datalayer.py
new file mode 100644
index 000000000000..a2f2cccf2a64
--- /dev/null
+++ b/nemo/collections/nlp/nm/data_layers/text_datalayer.py
@@ -0,0 +1,47 @@
+# =============================================================================
+# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================= + +from nemo.backends.pytorch import DataLayerNM +from nemo.collections.nlp.data.datasets import * + +__all__ = ['TextDataLayer'] + + +class TextDataLayer(DataLayerNM): + """ + Generic Text Data Layer NM which wraps PyTorch's dataset + + Args: + dataset_type: type of dataset used for this datalayer + dataset_params (dict): all the params for the dataset + """ + + def __init__(self, dataset_type, dataset_params, batch_size, shuffle=False): + super().__init__() + self._dataset = dataset_type(**dataset_params) + self._batch_size = batch_size + self._shuffle = shuffle + + def __len__(self): + return len(self._dataset) + + @property + def dataset(self): + return self._dataset + + @property + def data_iterator(self): + return None diff --git a/nemo/collections/nlp/nm/data_layers/token_classification_datalayer.py b/nemo/collections/nlp/nm/data_layers/token_classification_datalayer.py new file mode 100644 index 000000000000..b4e0d6ecc51a --- /dev/null +++ b/nemo/collections/nlp/nm/data_layers/token_classification_datalayer.py @@ -0,0 +1,143 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.data import BertTokenClassificationDataset, BertTokenClassificationInferDataset +from nemo.collections.nlp.nm.data_layers.text_datalayer import TextDataLayer +from nemo.core import AxisType, BatchTag, NeuralType, TimeTag + +__all__ = ['BertTokenClassificationDataLayer', 'BertTokenClassificationInferDataLayer'] + + +class BertTokenClassificationDataLayer(TextDataLayer): + @property + def output_ports(self): + """Returns definitions of module output ports. 
+ + input_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_type_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + loss_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + subtokens_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + labels: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + """ + return { + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + } + + def __init__( + self, + text_file, + label_file, + tokenizer, + max_seq_length, + pad_label='O', + label_ids=None, + num_samples=-1, + shuffle=False, + batch_size=64, + ignore_extra_tokens=False, + ignore_start_end=False, + use_cache=False, + dataset_type=BertTokenClassificationDataset, + ): + dataset_params = { + 'text_file': text_file, + 'label_file': label_file, + 'max_seq_length': max_seq_length, + 'tokenizer': tokenizer, + 'num_samples': num_samples, + 'shuffle': shuffle, + 'pad_label': pad_label, + 'label_ids': label_ids, + 'ignore_extra_tokens': ignore_extra_tokens, + 'ignore_start_end': ignore_start_end, + 'use_cache': use_cache, + } + super().__init__(dataset_type, dataset_params, batch_size, shuffle) + + +class BertTokenClassificationInferDataLayer(TextDataLayer): + @property + def output_ports(self): + """Returns definitions of module output ports. + + input_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_type_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + input_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + loss_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + subtokens_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + """ + return { + "input_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_type_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "input_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "subtokens_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + } + + def __init__( + self, queries, tokenizer, max_seq_length, batch_size=1, dataset_type=BertTokenClassificationInferDataset, + ): + dataset_params = {'queries': queries, 'tokenizer': tokenizer, 'max_seq_length': max_seq_length} + super().__init__(dataset_type, dataset_params, batch_size, shuffle=False) diff --git a/nemo/collections/nlp/nm/losses/__init__.py b/nemo/collections/nlp/nm/losses/__init__.py new file mode 100644 index 000000000000..76e04131232f --- /dev/null +++ b/nemo/collections/nlp/nm/losses/__init__.py @@ -0,0 +1,23 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+from nemo.collections.nlp.nm.losses.aggregator_loss import *
+from nemo.collections.nlp.nm.losses.joint_intent_slot_loss import *
+from nemo.collections.nlp.nm.losses.masked_language_modeling_loss import *
+from nemo.collections.nlp.nm.losses.padded_smoothed_cross_entropy_loss import *
+from nemo.collections.nlp.nm.losses.qa_squad_loss import *
+from nemo.collections.nlp.nm.losses.smoothed_cross_entropy_loss import *
+from nemo.collections.nlp.nm.losses.token_classification_loss import *
diff --git a/nemo/collections/nlp/nm/losses/aggregator_loss.py b/nemo/collections/nlp/nm/losses/aggregator_loss.py
new file mode 100644
index 000000000000..7a66c3cb85f1
--- /dev/null
+++ b/nemo/collections/nlp/nm/losses/aggregator_loss.py
@@ -0,0 +1,61 @@
+# =============================================================================
+# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+from nemo.backends.pytorch import LossNM
+from nemo.core import NeuralType
+
+__all__ = ['LossAggregatorNM']
+
+
+class LossAggregatorNM(LossNM):
+    """
+    Neural module which sums several losses into one.
+
+    Args:
+        num_inputs (int): number of input losses
+    """
+
+    @property
+    def input_ports(self):
+        """Returns definitions of module input ports.
+
+        """
+        input_ports = {}
+        for i in range(self.num_losses):
+            input_ports["loss_" + str(i + 1)] = NeuralType(None)
+
+        return input_ports
+
+    @property
+    def output_ports(self):
+        """Returns definitions of module output ports.
+
+        loss:
+            NeuralType(None)
+        """
+        return {"loss": NeuralType(None)}
+
+    def __init__(self, num_inputs=2):
+        # Store number of inputs/losses.
+        self.num_losses = num_inputs
+        LossNM.__init__(self)
+
+    def _loss_function(self, **kwargs):
+        values = [kwargs[x] for x in sorted(kwargs.keys())]
+        loss = values[0]
+        for loss_i in values[1:]:
+            loss = loss.add(loss_i)
+        return loss
diff --git a/nemo/collections/nlp/nm/losses/joint_intent_slot_loss.py b/nemo/collections/nlp/nm/losses/joint_intent_slot_loss.py
new file mode 100644
index 000000000000..1eb8b7e5610f
--- /dev/null
+++ b/nemo/collections/nlp/nm/losses/joint_intent_slot_loss.py
@@ -0,0 +1,127 @@
+# =============================================================================
+# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import torch +from torch import nn + +from nemo.backends.pytorch import LossNM +from nemo.core import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag + +__all__ = ['JointIntentSlotLoss'] + + +class JointIntentSlotLoss(LossNM): + """ + Loss function for the joint intent classification and slot + filling task. + + The loss is a joint loss of both tasks, aim to maximize: + p(y^i | x)P(y^s1, y^s2, ..., y^sn | x) + + with y^i being the predicted intent and y^s1, y^s2, ..., y^sn + are the predicted slots corresponding to x1, x2, ..., xn. + + Args: + hidden_states: output of the hidden layers + intents: ground truth intents, + slots: ground truth slots. + input_mask: to differentiate from original tokens and paddings + intent_loss_weight: the loss is the sum of: + intent_loss_weight * intent_loss + + (1 - intent_loss_weight) * slot_loss + + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + intent_logits: + 0: AxisType(BatchTag) + + 1: AxisType(ChannelTag) + + slot_logits: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + + loss_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + intents: + 0: AxisType(BatchTag) + + slots: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + """ + return { + "intent_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), + "slot_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "intents": NeuralType({0: AxisType(BatchTag)}), + "slots": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + } + + @property + def output_ports(self): + """Returns definitions of module output ports. 
+ + loss: + NeuralType(None) + """ + return {"loss": NeuralType(None)} + + def __init__( + self, num_slots, slot_classes_loss_weights=None, intent_classes_loss_weights=None, intent_loss_weight=0.6, + ): + LossNM.__init__(self) + self.num_slots = num_slots + self.intent_loss_weight = intent_loss_weight + self.slot_classes_loss_weights = slot_classes_loss_weights + self.intent_classes_loss_weights = intent_classes_loss_weights + + # For weighted loss to tackle class imbalance + if slot_classes_loss_weights: + self.slot_classes_loss_weights = torch.FloatTensor(slot_classes_loss_weights).to(self._device) + + if intent_classes_loss_weights: + self.intent_classes_loss_weights = torch.FloatTensor(intent_classes_loss_weights).to(self._device) + + self._criterion_intent = nn.CrossEntropyLoss(weight=self.intent_classes_loss_weights) + self._criterion_slot = nn.CrossEntropyLoss(weight=self.slot_classes_loss_weights) + + def _loss_function(self, intent_logits, slot_logits, loss_mask, intents, slots): + intent_loss = self._criterion_intent(intent_logits, intents) + + active_loss = loss_mask.view(-1) > 0.5 + active_logits = slot_logits.view(-1, self.num_slots)[active_loss] + active_labels = slots.view(-1)[active_loss] + + # To support empty active_labels + if len(active_labels) == 0: + slot_loss = 0.0 + else: + slot_loss = self._criterion_slot(active_logits, active_labels) + loss = intent_loss * self.intent_loss_weight + slot_loss * (1 - self.intent_loss_weight) + + return loss diff --git a/nemo/collections/nlp/nm/losses/masked_language_modeling_loss.py b/nemo/collections/nlp/nm/losses/masked_language_modeling_loss.py new file mode 100644 index 000000000000..e5516d9f33c7 --- /dev/null +++ b/nemo/collections/nlp/nm/losses/masked_language_modeling_loss.py @@ -0,0 +1,74 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.backends.pytorch import LossNM +from nemo.collections.nlp.nm.losses.smoothed_cross_entropy_loss import SmoothedCrossEntropyLoss +from nemo.core import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag + +__all__ = ['MaskedLanguageModelingLossNM'] + + +class MaskedLanguageModelingLossNM(LossNM): + """ + Neural module which implements Masked Language Modeling (MLM) loss. + + Args: + label_smoothing (float): label smoothing regularization coefficient + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. 
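For illustration, a minimal sketch of the joint intent/slot loss wiring; `intent_logits`, `slot_logits`, `loss_mask`, `intents`, `slots` stand for NmTensors coming from the classifier and data layer, and `num_slot_labels` is a placeholder:

    from nemo.collections.nlp.nm.losses import JointIntentSlotLoss

    is_loss_fn = JointIntentSlotLoss(num_slots=num_slot_labels, intent_loss_weight=0.6)
    total_loss = is_loss_fn(
        intent_logits=intent_logits,
        slot_logits=slot_logits,
        loss_mask=loss_mask,
        intents=intents,
        slots=slots,
    )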
+ + logits: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + + output_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + output_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + """ + return { + "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), + "output_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "output_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + } + + @property + def output_ports(self): + """Returns definitions of module output ports. + + loss: + NeuralType(None) + """ + return {"loss": NeuralType(None)} + + def __init__(self, label_smoothing=0.0): + LossNM.__init__(self) + self._criterion = SmoothedCrossEntropyLoss(label_smoothing) + + def _loss_function(self, logits, output_ids, output_mask): + loss = self._criterion(logits, output_ids, output_mask) + return loss diff --git a/nemo/collections/nlp/nm/losses/padded_smoothed_cross_entropy_loss.py b/nemo/collections/nlp/nm/losses/padded_smoothed_cross_entropy_loss.py new file mode 100644 index 000000000000..0ad66e21106d --- /dev/null +++ b/nemo/collections/nlp/nm/losses/padded_smoothed_cross_entropy_loss.py @@ -0,0 +1,77 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.backends.pytorch import LossNM +from nemo.collections.nlp.nm.losses.smoothed_cross_entropy_loss import SmoothedCrossEntropyLoss +from nemo.collections.nlp.utils.common_nlp_utils import mask_padded_tokens +from nemo.core import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag + +__all__ = ['PaddedSmoothedCrossEntropyLossNM'] + + +class PaddedSmoothedCrossEntropyLossNM(LossNM): + """ + Neural module which calculates CrossEntropyLoss and + 1) excludes padding tokens from loss calculation + 2) allows to use label smoothing regularization + 3) allows to calculate loss for the desired number of last tokens + + Args: + label_smoothing (float): label smoothing regularization coefficient + predict_last_k (int): how many last tokens to use for the loss + calculation, important for fast evaluation of LM perplexity + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + logits: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + + target_ids: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + """ + return { + "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), + "target_ids": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + } + + @property + def output_ports(self): + """Returns definitions of module output ports. 
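For illustration, a minimal sketch of the MLM loss wiring; `mlm_logits`, `output_ids` and `output_mask` stand for NmTensors from the BERT language-model head and the pretraining data layer:

    from nemo.collections.nlp.nm.losses import MaskedLanguageModelingLossNM

    mlm_loss_fn = MaskedLanguageModelingLossNM(label_smoothing=0.1)
    mlm_loss = mlm_loss_fn(logits=mlm_logits, output_ids=output_ids, output_mask=output_mask)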
+ + loss: + NeuralType(None) + """ + return {"loss": NeuralType(None)} + + def __init__(self, pad_id, label_smoothing=0, predict_last_k=0): + LossNM.__init__(self) + + self._loss_fn = SmoothedCrossEntropyLoss(label_smoothing, predict_last_k) + self._pad_id = pad_id + + def _loss_function(self, logits, target_ids): + target_mask = mask_padded_tokens(target_ids, self._pad_id).to(logits.dtype) + loss = self._loss_fn(logits, target_ids, target_mask) + return loss diff --git a/nemo/collections/nlp/nm/losses/qa_squad_loss.py b/nemo/collections/nlp/nm/losses/qa_squad_loss.py new file mode 100644 index 000000000000..5f60871d4ebb --- /dev/null +++ b/nemo/collections/nlp/nm/losses/qa_squad_loss.py @@ -0,0 +1,107 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from torch import nn + +from nemo.backends.pytorch import LossNM +from nemo.core import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag + +__all__ = ['QuestionAnsweringLoss'] + + +class QuestionAnsweringLoss(LossNM): + """ + Neural module which implements QuestionAnswering loss. + Args: + logits: Output of question answering head, which is a token classfier. + start_positions: Ground truth start positions of the answer w.r.t. + input sequence. If question is unanswerable, this will be + pointing to start token, e.g. [CLS], of the input sequence. + end_positions: Ground truth end positions of the answer w.r.t. + input sequence. If question is unanswerable, this will be + pointing to start token, e.g. [CLS], of the input sequence. + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + logits: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + + start_positions: + 0: AxisType(BatchTag) + + end_positions: + 0: AxisType(BatchTag) + """ + return { + "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), + "start_positions": NeuralType({0: AxisType(BatchTag)}), + "end_positions": NeuralType({0: AxisType(BatchTag)}), + } + + @property + def output_ports(self): + """Returns definitions of module output ports. 
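For illustration, a minimal sketch of the padded smoothed cross-entropy loss used for translation and language modeling; `log_probs` and `tgt_ids` stand for NmTensors from the decoder/log-softmax and the data layer, and the pad id should match the target tokenizer:

    from nemo.collections.nlp.nm.losses import PaddedSmoothedCrossEntropyLossNM

    nmt_loss_fn = PaddedSmoothedCrossEntropyLossNM(pad_id=0, label_smoothing=0.1)
    nmt_loss = nmt_loss_fn(logits=log_probs, target_ids=tgt_ids)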
+ + loss: + NeuralType(None) + + start_logits: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + end_logits: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + """ + return { + "loss": NeuralType(None), + "start_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "end_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + } + + def __init__(self): + LossNM.__init__(self) + + def _loss_function(self, **kwargs): + logits = kwargs['logits'] + start_positions = kwargs['start_positions'] + end_positions = kwargs['end_positions'] + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + return total_loss, start_logits, end_logits diff --git a/nemo/collections/nlp/modules/pytorch_utils.py b/nemo/collections/nlp/nm/losses/smoothed_cross_entropy_loss.py similarity index 72% rename from nemo/collections/nlp/modules/pytorch_utils.py rename to nemo/collections/nlp/nm/losses/smoothed_cross_entropy_loss.py index 58af90a6b595..b28e63e54059 100644 --- a/nemo/collections/nlp/modules/pytorch_utils.py +++ b/nemo/collections/nlp/nm/losses/smoothed_cross_entropy_loss.py @@ -1,7 +1,23 @@ -__all__ = ['SmoothedCrossEntropyLoss'] +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= import torch +__all__ = ['SmoothedCrossEntropyLoss'] + class SmoothedCrossEntropyLoss(torch.nn.Module): """ diff --git a/nemo/collections/nlp/nm/losses/token_classification_loss.py b/nemo/collections/nlp/nm/losses/token_classification_loss.py new file mode 100644 index 000000000000..5c3c3adcad22 --- /dev/null +++ b/nemo/collections/nlp/nm/losses/token_classification_loss.py @@ -0,0 +1,88 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
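For illustration, a minimal sketch of the SQuAD loss wiring; `qa_logits`, `starts` and `ends` stand for NmTensors from the token-classifier head and the SQuAD data layer:

    from nemo.collections.nlp.nm.losses import QuestionAnsweringLoss

    squad_loss = QuestionAnsweringLoss()
    loss, start_logits, end_logits = squad_loss(
        logits=qa_logits, start_positions=starts, end_positions=ends
    )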
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import torch +from torch import nn + +from nemo.backends.pytorch import LossNM +from nemo.core import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag + +__all__ = ['TokenClassificationLoss'] + + +class TokenClassificationLoss(LossNM): + """ + Neural module which implements Token Classification loss. + + Args: + num_classes (int): number of classes in a classifier, e.g. size + of the vocabulary in language modeling objective + logits (float): output of the classifier + labels (long): ground truth labels + loss_mask (long): to differentiate from original tokens and paddings + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + logits: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + + labels: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + loss_mask: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + """ + return { + "logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), + "labels": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + "loss_mask": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), + } + + @property + def output_ports(self): + """Returns definitions of module output ports. + + loss: + NeuralType(None) + """ + return {"loss": NeuralType(None)} + + def __init__(self, num_classes, class_weights=None): + LossNM.__init__(self) + if class_weights: + class_weights = torch.FloatTensor(class_weights).to(self._device) + + self._criterion = nn.CrossEntropyLoss(weight=class_weights) + self.num_classes = num_classes + + def _loss_function(self, logits, labels, loss_mask): + active_loss = loss_mask.view(-1) > 0.5 + active_logits = logits.view(-1, self.num_classes)[active_loss] + active_labels = labels.view(-1)[active_loss] + + loss = self._criterion(active_logits, active_labels) + return loss diff --git a/nemo/collections/nlp/nm/trainables/__init__.py b/nemo/collections/nlp/nm/trainables/__init__.py new file mode 100644 index 000000000000..7114bdda312f --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/__init__.py @@ -0,0 +1,18 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
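For illustration, a minimal sketch of the token-classification loss wiring; `tc_logits`, `labels` and `loss_mask` stand for NmTensors from TokenClassifier and the data layer, and `num_labels` is a placeholder:

    from nemo.collections.nlp.nm.losses import TokenClassificationLoss

    tc_loss_fn = TokenClassificationLoss(num_classes=num_labels)
    tc_loss = tc_loss_fn(logits=tc_logits, labels=labels, loss_mask=loss_mask)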
+# ============================================================================= + +from nemo.collections.nlp.nm.trainables.common import * +from nemo.collections.nlp.nm.trainables.joint_intent_slot import * diff --git a/nemo/collections/nlp/nm/trainables/common/__init__.py b/nemo/collections/nlp/nm/trainables/common/__init__.py new file mode 100644 index 000000000000..57f80bcbcae1 --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/common/__init__.py @@ -0,0 +1,21 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import nemo.collections.nlp.nm.trainables.common.huggingface +from nemo.collections.nlp.nm.trainables.common.sequence_classification_nm import * +from nemo.collections.nlp.nm.trainables.common.sequence_regression_nm import * +from nemo.collections.nlp.nm.trainables.common.token_classification_nm import * +from nemo.collections.nlp.nm.trainables.common.transformer import * diff --git a/nemo/collections/nlp/nm/trainables/common/huggingface/__init__.py b/nemo/collections/nlp/nm/trainables/common/huggingface/__init__.py new file mode 100644 index 000000000000..48c9a2228ee8 --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/common/huggingface/__init__.py @@ -0,0 +1,17 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.nm.trainables.common.huggingface.bert_nm import * diff --git a/nemo/collections/nlp/huggingface/bert.py b/nemo/collections/nlp/nm/trainables/common/huggingface/bert_nm.py similarity index 84% rename from nemo/collections/nlp/huggingface/bert.py rename to nemo/collections/nlp/nm/trainables/common/huggingface/bert_nm.py index 616c07f60ce0..1f91576be60a 100644 --- a/nemo/collections/nlp/huggingface/bert.py +++ b/nemo/collections/nlp/nm/trainables/common/huggingface/bert_nm.py @@ -1,4 +1,19 @@ -# Copyright (c) 2019 NVIDIA Corporation +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + from typing import List, Optional from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, BertConfig, BertModel @@ -7,6 +22,8 @@ from nemo.core.neural_modules import PretrainedModelInfo from nemo.core.neural_types import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag +__all__ = ['BERT'] + class BERT(TrainableNM): """ @@ -16,6 +33,7 @@ class BERT(TrainableNM): Args: pretrained_model_name (str): If using a pretrained model, this should be the model's name. Otherwise, should be left as None. + config_filename (str): path to model configuration file. Optional. vocab_size (int): Size of the vocabulary file, if not using a pretrained model. hidden_size (int): Size of the encoder and pooler layers. @@ -64,7 +82,7 @@ def output_ports(self): 2: AxisType(ChannelTag) """ - return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} def __init__( self, @@ -156,4 +174,4 @@ def list_pretrained_models() -> Optional[List[PretrainedModelInfo]]: return pretrained_models def forward(self, input_ids, token_type_ids, attention_mask): - return self.bert(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask,)[0] + return self.bert(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0] diff --git a/nemo/collections/nlp/nm/trainables/common/sequence_classification_nm.py b/nemo/collections/nlp/nm/trainables/common/sequence_classification_nm.py new file mode 100644 index 000000000000..7e0c81c65388 --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/common/sequence_classification_nm.py @@ -0,0 +1,85 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================= + +from torch import nn as nn + +from nemo.backends.pytorch import MultiLayerPerceptron, TrainableNM +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import transformer_weights_init +from nemo.core import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag + +__all__ = ['SequenceClassifier'] + + +class SequenceClassifier(TrainableNM): + """ + Neural module which consists of MLP followed by softmax classifier for each + sequence in the batch. + + Args: + hidden_size (int): hidden size (d_model) of the Transformer + num_classes (int): number of classes in softmax classifier, e.g. number + of different sentiments + num_layers (int): number of layers in classifier MLP + activation (str): activation function applied in classifier MLP layers + log_softmax (bool): whether to apply log_softmax to MLP output + dropout (float): dropout ratio applied to MLP + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + hidden_states: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + """ + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} + + @property + def output_ports(self): + """Returns definitions of module output ports. + + logits: + 0: AxisType(BatchTag) + + 1: AxisType(ChannelTag) + """ + return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)})} + + def __init__( + self, + hidden_size, + num_classes, + num_layers=2, + activation='relu', + log_softmax=True, + dropout=0.0, + use_transformer_pretrained=True, + ): + super().__init__() + self.mlp = MultiLayerPerceptron(hidden_size, num_classes, self._device, num_layers, activation, log_softmax) + self.dropout = nn.Dropout(dropout) + if use_transformer_pretrained: + self.apply(lambda module: transformer_weights_init(module, xavier=False)) + # self.to(self._device) # sometimes this is necessary + + def forward(self, hidden_states, idx_conditioned_on=0): + hidden_states = self.dropout(hidden_states) + logits = self.mlp(hidden_states[:, idx_conditioned_on]) + return logits diff --git a/nemo/collections/nlp/nm/trainables/common/sequence_regression_nm.py b/nemo/collections/nlp/nm/trainables/common/sequence_regression_nm.py new file mode 100644 index 000000000000..1032a1f2c43d --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/common/sequence_regression_nm.py @@ -0,0 +1,79 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
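For illustration, a minimal sketch of attaching the sequence classifier to BERT output; `bert_hidden_states` stands for the hidden-states NmTensor produced by the BERT module, and the sizes are placeholders:

    from nemo.collections.nlp.nm.trainables import SequenceClassifier

    classifier = SequenceClassifier(hidden_size=768, num_classes=2, dropout=0.1)
    logits = classifier(hidden_states=bert_hidden_states)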
+# ============================================================================= + +from torch import nn as nn + +from nemo.backends.pytorch import MultiLayerPerceptron, TrainableNM +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import transformer_weights_init +from nemo.core import AxisType, BatchTag, ChannelTag, NeuralType, RegressionTag, TimeTag + +__all__ = ['SequenceRegression'] + + +class SequenceRegression(TrainableNM): + """ + Neural module which consists of MLP, generates a single number prediction + that could be used for a regression task. An example of this task would be + semantic textual similatity task, for example, STS-B (from GLUE tasks). + + Args: + hidden_size (int): the size of the hidden state for the dense layer + num_layers (int): number of layers in classifier MLP + activation (str): activation function applied in classifier MLP layers + dropout (float): dropout ratio applied to MLP + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + hidden_states: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + """ + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} + + @property + def output_ports(self): + """Returns definitions of module output ports. + + preds: + 0: AxisType(RegressionTag) + """ + return {"preds": NeuralType({0: AxisType(RegressionTag)})} + + def __init__(self, hidden_size, num_layers=2, activation='relu', dropout=0.0, use_transformer_pretrained=True): + super().__init__() + self.mlp = MultiLayerPerceptron( + hidden_size, + num_classes=1, + device=self._device, + num_layers=num_layers, + activation=activation, + log_softmax=False, + ) + self.dropout = nn.Dropout(dropout) + if use_transformer_pretrained: + self.apply(lambda module: transformer_weights_init(module, xavier=False)) + # self.to(self._device) # sometimes this is necessary + + def forward(self, hidden_states, idx_conditioned_on=0): + hidden_states = self.dropout(hidden_states) + preds = self.mlp(hidden_states[:, idx_conditioned_on]) + return preds.view(-1) diff --git a/nemo/collections/nlp/nm/trainables/common/token_classification_nm.py b/nemo/collections/nlp/nm/trainables/common/token_classification_nm.py new file mode 100644 index 000000000000..ba848f247eb3 --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/common/token_classification_nm.py @@ -0,0 +1,171 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
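For illustration, a minimal sketch of the regression head for STS-B-style tasks, using the same `bert_hidden_states` placeholder as above:

    from nemo.collections.nlp.nm.trainables import SequenceRegression

    regressor = SequenceRegression(hidden_size=768, dropout=0.1)
    preds = regressor(hidden_states=bert_hidden_states)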
+# ============================================================================= + +from torch import nn as nn + +from nemo.backends.pytorch import MultiLayerPerceptron, TrainableNM +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import gelu, transformer_weights_init +from nemo.core import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag + +__all__ = ['BertTokenClassifier', 'TokenClassifier'] + +ACT2FN = {"gelu": gelu, "relu": nn.functional.relu} + + +class BertTokenClassifier(TrainableNM): + """ + Neural module which consists of MLP followed by softmax classifier for each + token in the sequence. + + Args: + hidden_size (int): hidden size (d_model) of the Transformer + num_classes (int): number of classes in softmax classifier, e.g. size + of the vocabulary in language modeling objective + activation (str): activation function applied in classifier MLP layers + log_softmax (bool): whether to apply log_softmax to MLP output + dropout (float): dropout ratio applied to MLP + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + hidden_states: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + """ + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} + + @property + def output_ports(self): + """Returns definitions of module output ports. + + logits: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + """ + return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} + + def __init__( + self, + hidden_size, + num_classes, + activation='relu', + log_softmax=True, + dropout=0.0, + use_transformer_pretrained=True, + ): + super().__init__() + if activation not in ACT2FN: + raise ValueError(f'activation "{activation}" not found') + self.dense = nn.Linear(hidden_size, hidden_size) + self.act = ACT2FN[activation] + self.norm = nn.LayerNorm(hidden_size, eps=1e-12) + self.mlp = MultiLayerPerceptron( + hidden_size, num_classes, self._device, num_layers=1, activation=activation, log_softmax=log_softmax + ) + self.dropout = nn.Dropout(dropout) + if use_transformer_pretrained: + self.apply(lambda module: transformer_weights_init(module, xavier=False)) + self.to(self._device) + + def forward(self, hidden_states): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = self.act(hidden_states) + transform = self.norm(hidden_states) + logits = self.mlp(transform) + return logits + + +class TokenClassifier(TrainableNM): + """ + Neural module which consists of MLP followed by softmax classifier for each + token in the sequence. + + Args: + hidden_size (int): hidden size (d_model) of the Transformer + num_classes (int): number of classes in softmax classifier, e.g. size + of the vocabulary in language modeling objective + num_layers (int): number of layers in classifier MLP + activation (str): activation function applied in classifier MLP layers + log_softmax (bool): whether to apply log_softmax to MLP output + dropout (float): dropout ratio applied to MLP + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. 
+ + hidden_states: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + """ + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} + + @property + def output_ports(self): + """Returns definitions of module output ports. + + logits: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + """ + return {"logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} + + def __init__( + self, + hidden_size, + num_classes, + name=None, + num_layers=2, + activation='relu', + log_softmax=True, + dropout=0.0, + use_transformer_pretrained=True, + ): + super().__init__() + + self.name = name + self.mlp = MultiLayerPerceptron(hidden_size, num_classes, self._device, num_layers, activation, log_softmax) + self.dropout = nn.Dropout(dropout) + if use_transformer_pretrained: + self.apply(lambda module: transformer_weights_init(module, xavier=False)) + # self.to(self._device) # sometimes this is necessary + + def __str__(self): + name = TrainableNM.__str__(self) + + if self.name: + name = self.name + name + return name + + def forward(self, hidden_states): + hidden_states = self.dropout(hidden_states) + logits = self.mlp(hidden_states) + return logits diff --git a/nemo/collections/nlp/nm/trainables/common/transformer/__init__.py b/nemo/collections/nlp/nm/trainables/common/transformer/__init__.py new file mode 100644 index 000000000000..4e0a87804d4d --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/common/transformer/__init__.py @@ -0,0 +1,17 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
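For illustration, a minimal sketch of the per-token classifier used for NER and punctuation; `bert_hidden_states` and `num_labels` are placeholders as above:

    from nemo.collections.nlp.nm.trainables import TokenClassifier

    tok_classifier = TokenClassifier(hidden_size=768, num_classes=num_labels, dropout=0.1)
    tok_logits = tok_classifier(hidden_states=bert_hidden_states)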
+# ============================================================================= + +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_nm import * diff --git a/nemo/collections/nlp/transformer/decoders.py b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_decoders.py similarity index 86% rename from nemo/collections/nlp/transformer/decoders.py rename to nemo/collections/nlp/nm/trainables/common/transformer/transformer_decoders.py index ccd1b26d2f38..1f3cbf0e4f44 100644 --- a/nemo/collections/nlp/transformer/decoders.py +++ b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_decoders.py @@ -1,12 +1,15 @@ -__all__ = ['TransformerDecoderBlock', 'TransformerDecoder'] - import copy import torch import torch.nn as nn -from .modules import MultiHeadAttention, PositionWiseFF -from .utils import form_attention_mask +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_modules import ( + MultiHeadAttention, + PositionWiseFF, +) +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import form_attention_mask + +__all__ = [] class TransformerDecoderBlock(nn.Module): @@ -38,16 +41,14 @@ def __init__( super().__init__() self.first_sub_layer = MultiHeadAttention( - hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout, + hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout ) self.second_sub_layer = MultiHeadAttention( - hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout, + hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout ) self.third_sub_layer = PositionWiseFF(hidden_size, inner_size, ffn_dropout, hidden_act) - def forward( - self, decoder_query, decoder_mask, decoder_keys, encoder_states, encoder_mask, - ): + def forward(self, decoder_query, decoder_mask, decoder_keys, encoder_states, encoder_mask): self_attn_output = self.first_sub_layer(decoder_query, decoder_keys, decoder_keys, decoder_mask) enc_dec_attn_output = self.second_sub_layer(self_attn_output, encoder_states, encoder_states, encoder_mask) output_states = self.third_sub_layer(enc_dec_attn_output) @@ -69,7 +70,7 @@ def _get_memory_states(self, decoder_states, decoder_mems_list=None, i=0): return memory_states def forward( - self, decoder_states, decoder_mask, encoder_states, encoder_mask, decoder_mems_list=None, return_mems=False, + self, decoder_states, decoder_mask, encoder_states, encoder_mask, decoder_mems_list=None, return_mems=False ): """ Args: @@ -91,9 +92,7 @@ def forward( cached_mems_list = [memory_states] for i, layer in enumerate(self.layers): - decoder_states = layer( - decoder_states, decoder_attn_mask, memory_states, encoder_states, encoder_attn_mask, - ) + decoder_states = layer(decoder_states, decoder_attn_mask, memory_states, encoder_states, encoder_attn_mask) memory_states = self._get_memory_states(decoder_states, decoder_mems_list, i + 1) cached_mems_list.append(memory_states) diff --git a/nemo/collections/nlp/transformer/encoders.py b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_encoders.py similarity index 91% rename from nemo/collections/nlp/transformer/encoders.py rename to nemo/collections/nlp/nm/trainables/common/transformer/transformer_encoders.py index 1eb63eb55124..24c6afce55ad 100644 --- a/nemo/collections/nlp/transformer/encoders.py +++ b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_encoders.py @@ -1,17 +1,16 @@ -__all__ = [ - 'TransformerEncoderBlock', - 'TransformerEncoder', - 
'XLNetEncoderBlock', - 'XLNetEncoder', -] - import copy import torch import torch.nn as nn -from .modules import MultiHeadAttention, PositionWiseFF, TwoStreamSelfAttention -from .utils import form_attention_mask +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_modules import ( + MultiHeadAttention, + PositionWiseFF, + TwoStreamSelfAttention, +) +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import form_attention_mask + +__all__ = [] class TransformerEncoderBlock(nn.Module): @@ -43,7 +42,7 @@ def __init__( super().__init__() self.first_sub_layer = MultiHeadAttention( - hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout, + hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout ) self.second_sub_layer = PositionWiseFF(hidden_size, inner_size, ffn_dropout, hidden_act) @@ -68,9 +67,7 @@ def _get_memory_states(self, encoder_states, encoder_mems_list=None, i=0): memory_states = encoder_states return memory_states - def forward( - self, encoder_states, encoder_mask, encoder_mems_list=None, return_mems=False, - ): + def forward(self, encoder_states, encoder_mask, encoder_mems_list=None, return_mems=False): """ Args: encoder_states: output of the embedding_layer (B x L_enc x H) @@ -112,7 +109,7 @@ def __init__( super().__init__() self.first_sub_layer = TwoStreamSelfAttention( - hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout, + hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout ) self.second_sub_layer = PositionWiseFF(hidden_size, inner_size, ffn_dropout, hidden_act) @@ -135,5 +132,5 @@ def forward(self, query_states, content_states, input_mask): query_attn_mask = form_attention_mask(input_mask, diagonal=-1) content_attn_mask = form_attention_mask(input_mask, diagonal=0) for layer in self.layers: - query_states, content_states = layer(query_states, content_states, query_attn_mask, content_attn_mask,) + query_states, content_states = layer(query_states, content_states, query_attn_mask, content_attn_mask) return query_states, content_states diff --git a/nemo/collections/nlp/transformer/generators.py b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_generators.py similarity index 95% rename from nemo/collections/nlp/transformer/generators.py rename to nemo/collections/nlp/nm/trainables/common/transformer/transformer_generators.py index 9e427a54db61..d878ccd17655 100644 --- a/nemo/collections/nlp/transformer/generators.py +++ b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_generators.py @@ -1,14 +1,10 @@ -__all__ = [ - 'GreedySequenceGenerator', - 'TopKSequenceGenerator', - 'BeamSearchSequenceGenerator', -] +__all__ = [] import torch import torch.nn as nn -from ..utils.nlp_utils import mask_padded_tokens -from .utils import NEG_INF +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import NEG_INF +from nemo.collections.nlp.utils.common_nlp_utils import mask_padded_tokens class GreedySequenceGenerator(nn.Module): @@ -92,7 +88,7 @@ def _forward( ) else: decoder_mems_list = self.decoder.forward( - decoder_hidden_states, decoder_input_mask, decoder_mems_list, return_mems=True, + decoder_hidden_states, decoder_input_mask, decoder_mems_list, return_mems=True ) log_probs = self.log_softmax.forward(decoder_mems_list[-1]) return log_probs, decoder_mems_list @@ -124,9 +120,7 @@ def _prepare_for_search(self, decoder_input_ids=None, encoder_hidden_states=None return tgt, batch_size, max_generation_length - def 
forward( - self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None, - ): + def forward(self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None): tgt, batch_size, max_generation_length = self._prepare_for_search(decoder_input_ids, encoder_hidden_states) @@ -138,7 +132,7 @@ def forward( for i in range(max_generation_length): log_probs, decoder_mems_list = self._forward( - tgt[:, -1:], encoder_hidden_states, encoder_input_mask, decoder_mems_list, i, + tgt[:, -1:], encoder_hidden_states, encoder_input_mask, decoder_mems_list, i ) next_tokens = torch.argmax(log_probs[:, -1], dim=-1, keepdim=True) @@ -182,7 +176,7 @@ def _forward( pos=0, ): log_probs, decoder_mems_list = super()._forward( - decoder_input_ids, encoder_hidden_states, encoder_input_mask, decoder_mems_list, pos, + decoder_input_ids, encoder_hidden_states, encoder_input_mask, decoder_mems_list, pos ) batch_size, seq_len, vocab_size = log_probs.size() @@ -220,9 +214,7 @@ def __init__(self, embedding, decoder, log_softmax, beam_size=1, len_pen=0, **kw self.beam_size = beam_size self.len_pen = len_pen - def forward( - self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None, - ): + def forward(self, decoder_input_ids=None, encoder_hidden_states=None, encoder_input_mask=None): tgt, batch_size, max_generation_length = self._prepare_for_search(decoder_input_ids, encoder_hidden_states) @@ -261,7 +253,7 @@ def forward( # generate and score candidates for prefixes continuation log_probs, decoder_mems_list = self._forward( - prefixes[:, -1:], encoder_hidden_states, encoder_input_mask, decoder_mems_list, i + 1, + prefixes[:, -1:], encoder_hidden_states, encoder_input_mask, decoder_mems_list, i + 1 ) scores_i, prefixes_i = torch.topk(log_probs[:, -1, :], self.beam_size, dim=-1) diff --git a/nemo/collections/nlp/transformer/modules.py b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_modules.py similarity index 92% rename from nemo/collections/nlp/transformer/modules.py rename to nemo/collections/nlp/nm/trainables/common/transformer/transformer_modules.py index e958c1951c6c..153843e1aad0 100644 --- a/nemo/collections/nlp/transformer/modules.py +++ b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_modules.py @@ -22,27 +22,25 @@ http://nlp.seas.harvard.edu/2018/04/03/attention.html Copyright by the HuggingFace and Annotated Transformer authors. """ -__all__ = [ - 'FixedPositionalEncoding', - 'TransformerEmbedding', - 'MultiHeadAttention', - 'LightweightConv1d', - 'TwoStreamSelfAttention', - 'PositionWiseFF', -] import math import torch from torch import nn -from .utils import gelu +from nemo import logging +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import gelu + +__all__ = [] + try: from apex.normalization import FusedLayerNorm except (AttributeError, ModuleNotFoundError): # this is lie - it isn't fused in this case - print("Unable to import APEX. Mixed precision, distributed training and " "FusedLayerNorm are not available.") + logging.warning( + "Unable to import APEX. Mixed precision, distributed training and " "FusedLayerNorm are not available." 
+ ) from torch.nn import LayerNorm as FusedLayerNorm @@ -114,7 +112,7 @@ def forward(self, input_ids, token_type_ids=None, start_pos=0): "Input sequence is longer than maximum allowed" " sequence length for positional encoding" ) position_ids = torch.arange( - start=start_pos, end=start_pos + seq_length, dtype=torch.long, device=input_ids.device, + start=start_pos, end=start_pos + seq_length, dtype=torch.long, device=input_ids.device ) position_ids = position_ids.unsqueeze(0).expand_as(input_ids) @@ -144,9 +142,7 @@ class MultiHeadAttention(nn.Module): whole layer, but before layer normalization """ - def __init__( - self, hidden_size, num_attention_heads, attn_score_dropout=0.0, attn_layer_dropout=0.0, - ): + def __init__(self, hidden_size, num_attention_heads, attn_score_dropout=0.0, attn_layer_dropout=0.0): super().__init__() if hidden_size % num_attention_heads != 0: raise ValueError( @@ -168,7 +164,7 @@ def __init__( self.layer_norm = FusedLayerNorm(hidden_size, eps=1e-5) def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attn_head_size,) + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attn_head_size) x = x.view(*new_x_shape) return x.permute(0, 2, 1, 3) @@ -219,9 +215,7 @@ class LightweightConv1d(nn.Module): whole layer, but before layer normalization """ - def __init__( - self, hidden_size, num_attention_heads, kernel_size, conv_weight_dropout=0.0, conv_layer_dropout=0.0, - ): + def __init__(self, hidden_size, num_attention_heads, kernel_size, conv_weight_dropout=0.0, conv_layer_dropout=0.0): super().__init__() self.num_heads = num_attention_heads self.kernel_size = kernel_size @@ -246,7 +240,7 @@ def forward(self, hidden_states, attention_mask): weight[:, :, pivot:] = 0 output_states = output_states.contiguous().view(-1, self.num_heads, seq_len) - output_states = torch.conv1d(output_states, weight, padding=self.kernel_size // 2, groups=self.num_heads,) + output_states = torch.conv1d(output_states, weight, padding=self.kernel_size // 2, groups=self.num_heads) output_states = output_states.view(batch_size, hidden_size, seq_len) output_states = output_states.permute(0, 2, 1) @@ -270,23 +264,19 @@ class TwoStreamSelfAttention(nn.Module): whole layer, but before layer normalization """ - def __init__( - self, hidden_size, num_attention_heads, attn_score_dropout=0.0, attn_layer_dropout=0.0, - ): + def __init__(self, hidden_size, num_attention_heads, attn_score_dropout=0.0, attn_layer_dropout=0.0): super().__init__() self.query_stream = MultiHeadAttention( - hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout, + hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout ) self.content_stream = MultiHeadAttention( - hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout, + hidden_size, num_attention_heads, attn_score_dropout, attn_layer_dropout ) - def forward( - self, query_states, content_states, query_attention_mask, content_attention_mask, - ): + def forward(self, query_states, content_states, query_attention_mask, content_attention_mask): output_query_states = self.query_stream(query_states, content_states, content_states, query_attention_mask) output_content_states = self.content_stream( - query_states, content_states, content_states, content_attention_mask, + query_states, content_states, content_states, content_attention_mask ) return output_query_states, output_content_states diff --git a/nemo/collections/nlp/modules/transformer_nm.py 
b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_nm.py similarity index 92% rename from nemo/collections/nlp/modules/transformer_nm.py rename to nemo/collections/nlp/nm/trainables/common/transformer/transformer_nm.py index e8e9897a825b..b736588a3d33 100644 --- a/nemo/collections/nlp/modules/transformer_nm.py +++ b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_nm.py @@ -2,26 +2,22 @@ """ This package contains Transformer for translation Neural Module """ -__all__ = [ - 'TransformerEncoderNM', - 'TransformerDecoderNM', - 'GreedyLanguageGeneratorNM', - 'BeamSearchTranslatorNM', -] import math -from ..transformer import ( +from nemo.backends.pytorch.nm import TrainableNM +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_decoders import TransformerDecoder +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_encoders import TransformerEncoder +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_generators import ( BeamSearchSequenceGenerator, GreedySequenceGenerator, - TransformerDecoder, - TransformerEmbedding, - TransformerEncoder, ) -from ..transformer.utils import transformer_weights_init -from nemo.backends.pytorch.nm import LossNM, TrainableNM +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_modules import TransformerEmbedding +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import transformer_weights_init from nemo.core.neural_types import * +__all__ = ['TransformerEncoderNM', 'TransformerDecoderNM', 'GreedyLanguageGeneratorNM', 'BeamSearchTranslatorNM'] + class TransformerEncoderNM(TrainableNM): """ @@ -78,7 +74,7 @@ def output_ports(self): 2: AxisType(ChannelTag) """ - return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} def __init__( self, @@ -178,7 +174,7 @@ def input_ports(self): """ return { "input_ids_tgt": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), - "hidden_states_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), + "hidden_states_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), "input_mask_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), "input_mask_tgt": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag)}), } @@ -194,7 +190,7 @@ def output_ports(self): 2: AxisType(ChannelTag) """ - return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),})} + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} def __init__( self, @@ -237,7 +233,7 @@ def __init__( def forward(self, input_ids_tgt, hidden_states_src, input_mask_src, input_mask_tgt): hidden_states_tgt = self.embedding_layer(input_ids_tgt) - hidden_states = self.decoder(hidden_states_tgt, input_mask_tgt, hidden_states_src, input_mask_src,) + hidden_states = self.decoder(hidden_states_tgt, input_mask_tgt, hidden_states_src, input_mask_src) return hidden_states @@ -337,7 +333,7 @@ def input_ports(self): 1: AxisType(TimeTag) """ return { - "hidden_states_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}), + "hidden_states_src": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), "input_mask_src": NeuralType({0: AxisType(BatchTag), 
1: AxisType(TimeTag)}), } @@ -386,5 +382,5 @@ def __init__( ) def forward(self, hidden_states_src, input_mask_src): - output_ids = self.generator(encoder_hidden_states=hidden_states_src, encoder_input_mask=input_mask_src,) + output_ids = self.generator(encoder_hidden_states=hidden_states_src, encoder_input_mask=input_mask_src) return output_ids diff --git a/nemo/collections/nlp/transformer/utils.py b/nemo/collections/nlp/nm/trainables/common/transformer/transformer_utils.py similarity index 100% rename from nemo/collections/nlp/transformer/utils.py rename to nemo/collections/nlp/nm/trainables/common/transformer/transformer_utils.py diff --git a/nemo/collections/nlp/nm/trainables/joint_intent_slot/__init__.py b/nemo/collections/nlp/nm/trainables/joint_intent_slot/__init__.py new file mode 100644 index 000000000000..600a32ece82d --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/joint_intent_slot/__init__.py @@ -0,0 +1,17 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from nemo.collections.nlp.nm.trainables.joint_intent_slot.joint_intent_slot_nm import * diff --git a/nemo/collections/nlp/nm/trainables/joint_intent_slot/joint_intent_slot_nm.py b/nemo/collections/nlp/nm/trainables/joint_intent_slot/joint_intent_slot_nm.py new file mode 100644 index 000000000000..b8707646f746 --- /dev/null +++ b/nemo/collections/nlp/nm/trainables/joint_intent_slot/joint_intent_slot_nm.py @@ -0,0 +1,95 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +from torch import nn as nn + +from nemo.backends.pytorch import MultiLayerPerceptron, TrainableNM +from nemo.collections.nlp.nm.trainables.common.transformer.transformer_utils import transformer_weights_init +from nemo.core import AxisType, BatchTag, ChannelTag, NeuralType, TimeTag + +__all__ = ['JointIntentSlotClassifier'] + + +class JointIntentSlotClassifier(TrainableNM): + """ + The softmax classifier for the joint intent classification and slot + filling task which consists of a dense layer + relu + softmax for + predicting the slots and similar for predicting the intents. 
+ + Args: + hidden_size (int): the size of the hidden state for the dense layer + num_intents (int): number of intents + num_slots (int): number of slots + dropout (float): dropout to be applied to the layer + """ + + @property + def input_ports(self): + """Returns definitions of module input ports. + + hidden_states: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + """ + return {"hidden_states": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)})} + + @property + def output_ports(self): + """Returns definitions of module output ports. + + intent_logits: + 0: AxisType(BatchTag) + + 1: AxisType(ChannelTag) + + slot_logits: + 0: AxisType(BatchTag) + + 1: AxisType(TimeTag) + + 2: AxisType(ChannelTag) + """ + return { + "intent_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(ChannelTag)}), + "slot_logits": NeuralType({0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}), + } + + def __init__(self, hidden_size, num_intents, num_slots, dropout=0.0, use_transformer_pretrained=True, **kwargs): + super().__init__(**kwargs) + self.dropout = nn.Dropout(dropout) + self.slot_mlp = MultiLayerPerceptron( + hidden_size, num_classes=num_slots, device=self._device, num_layers=2, activation='relu', log_softmax=False + ) + self.intent_mlp = MultiLayerPerceptron( + hidden_size, + num_classes=num_intents, + device=self._device, + num_layers=2, + activation='relu', + log_softmax=False, + ) + if use_transformer_pretrained: + self.apply(lambda module: transformer_weights_init(module, xavier=False)) + # self.to(self._device) + + def forward(self, hidden_states): + hidden_states = self.dropout(hidden_states) + intent_logits = self.intent_mlp(hidden_states[:, 0]) + slot_logits = self.slot_mlp(hidden_states) + return intent_logits, slot_logits diff --git a/nemo/collections/nlp/transformer/__init__.py b/nemo/collections/nlp/transformer/__init__.py deleted file mode 100644 index 1f91c6035a59..000000000000 --- a/nemo/collections/nlp/transformer/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) 2019 NVIDIA Corporation -from .decoders import * -from .encoders import * -from .generators import * -from .modules import * diff --git a/nemo/collections/nlp/utils/__init__.py b/nemo/collections/nlp/utils/__init__.py index 894348fc3114..49948c01f0c6 100644 --- a/nemo/collections/nlp/utils/__init__.py +++ b/nemo/collections/nlp/utils/__init__.py @@ -1 +1,3 @@ -from . import callbacks, metrics, nlp_utils +from nemo.collections.nlp.utils.callback_utils import * +from nemo.collections.nlp.utils.common_nlp_utils import * +from nemo.collections.nlp.utils.loss_utils import * diff --git a/nemo/collections/nlp/utils/callback_utils.py b/nemo/collections/nlp/utils/callback_utils.py new file mode 100644 index 000000000000..a3da1106d5c9 --- /dev/null +++ b/nemo/collections/nlp/utils/callback_utils.py @@ -0,0 +1,97 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import os +import time + +import numpy as np +from matplotlib import pyplot as plt +from sklearn.metrics import confusion_matrix + +from nemo import logging + +__all__ = ['list2str', 'tensor2list', 'plot_confusion_matrix'] + + +def list2str(l): + return ' '.join([str(x) for x in l]) + + +def tensor2list(tensor): + return tensor.detach().cpu().tolist() + + +def plot_confusion_matrix(labels, preds, graph_fold, label_ids=None, normalize=False, prefix=''): + ''' + Plot confusion matrix. + Args: + label_ids (dict): label to id map, for example: {'O': 0, 'LOC': 1} + labels (list of ints): list of true labels + preds (list of ints): list of predicted labels + graph_fold (str): path to output folder + normalize (bool): flag to indicate whether to normalize confusion matrix + prefix (str): prefix for the plot name + + ''' + if label_ids is None: + _plot_confusion_matrix(labels, preds, graph_fold) + + else: + # remove labels from label_ids that don't appear in the dev set + used_labels = set(labels) | set(preds) + label_ids = {k: label_ids[k] for k, v in label_ids.items() if v in used_labels} + + ids_to_labels = {label_ids[k]: k for k in label_ids} + classes = [ids_to_labels[id] for id in sorted(label_ids.values())] + + title = 'Confusion matrix' + cm = confusion_matrix(labels, preds) + if normalize: + sums = cm.sum(axis=1)[:, np.newaxis] + sums = np.where(sums == 0, 1, sums) + cm = cm.astype('float') / sums + title = 'Normalized ' + title + + fig = plt.figure() + ax = fig.add_subplot(111) + + cax = ax.matshow(cm) + ax.set_xticks(np.arange(-1, len(classes) + 1)) + ax.set_yticks(np.arange(-1, len(classes) + 1)) + ax.set_xticklabels([''] + classes, rotation=90) + ax.set_yticklabels([''] + classes) + ax.set_ylabel('True') + ax.set_xlabel('Predicted') + + os.makedirs(graph_fold, exist_ok=True) + fig.colorbar(cax) + + title = (prefix + ' ' + title).strip() + plt.savefig(os.path.join(graph_fold, title + '_' + time.strftime('%Y%m%d-%H%M%S'))) + + +def _plot_confusion_matrix(labels, preds, graph_fold): + cm = confusion_matrix(labels, preds) + logging.info(f'Confusion matrix:\n{cm}') + fig = plt.figure() + ax = fig.add_subplot(111) + cax = ax.matshow(cm) + plt.title('Confusion matrix of the classifier') + fig.colorbar(cax) + plt.xlabel('Predicted') + plt.ylabel('True') + os.makedirs(graph_fold, exist_ok=True) + plt.savefig(os.path.join(graph_fold, time.strftime('%Y%m%d-%H%M%S'))) diff --git a/nemo/collections/nlp/utils/callbacks/__init__.py b/nemo/collections/nlp/utils/callbacks/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/nemo/collections/nlp/utils/callbacks/language_modeling.py b/nemo/collections/nlp/utils/callbacks/language_modeling.py deleted file mode 100644 index daffe2c64d2d..000000000000 --- a/nemo/collections/nlp/utils/callbacks/language_modeling.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2019 NVIDIA Corporation -__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] - -import numpy as np - -import nemo - -GLOBAL_KEYS = ["eval_loss", "sys"] - - -def eval_iter_callback(tensors, global_vars): - for key in GLOBAL_KEYS: - if key not in global_vars.keys(): - global_vars[key] = [] - - for kv, v in tensors.items(): - if "loss" in kv: - for eval_loss in v: - global_vars["eval_loss"].append(eval_loss.item()) - - -def eval_epochs_done_callback(global_vars): - eval_loss = 
np.mean(global_vars["eval_loss"]) - eval_ppl = np.exp(eval_loss) - - nemo.logging.info("------------------------------------------------------") - nemo.logging.info("Eval loss: {0}".format(np.round(eval_loss, 3))) - nemo.logging.info("Eval ppl: {0}".format(np.round(eval_ppl, 3))) - nemo.logging.info("------------------------------------------------------") - for key in GLOBAL_KEYS: - global_vars[key] = [] - return dict({"Eval_loss": eval_loss, "Eval_ppl": eval_ppl}) diff --git a/nemo/collections/nlp/utils/callbacks/sentence_classification.py b/nemo/collections/nlp/utils/callbacks/sentence_classification.py deleted file mode 100644 index 4810bab9dde1..000000000000 --- a/nemo/collections/nlp/utils/callbacks/sentence_classification.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2019 NVIDIA Corporation -__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] - -import os -import random -import time - -import numpy as np # nopep8 -from matplotlib import pyplot as plt # nopep8 -from sklearn.metrics import classification_report, confusion_matrix # nopep8 - -import nemo - -__all__ = ['eval_iter_callback', 'eval_epochs_done_callback'] - - -def eval_iter_callback(tensors, global_vars, eval_data_layer): - if "all_preds" not in global_vars.keys(): - global_vars["all_preds"] = [] - if "all_labels" not in global_vars.keys(): - global_vars["all_labels"] = [] - - logits_lists = [] - labels_lists = [] - - for kv, v in tensors.items(): - if 'logits' in kv: - for v_tensor in v: - for logit_tensor in v_tensor: - logits_lists.append(logit_tensor.detach().cpu().tolist()) - - if 'labels' in kv: - for v_tensor in v: - for label_tensor in v_tensor: - labels_lists.append(label_tensor.detach().cpu().tolist()) - - preds = list(np.argmax(np.asarray(logits_lists), 1)) - global_vars["all_preds"].extend(preds) - global_vars["all_labels"].extend(labels_lists) - - -def list2str(l): - return ' '.join([str(j) for j in l]) - - -def eval_epochs_done_callback(global_vars, graph_fold): - labels = np.asarray(global_vars['all_labels']) - preds = np.asarray(global_vars['all_preds']) - accuracy = sum(labels == preds) / labels.shape[0] - nemo.logging.info(f'Accuracy: {accuracy}') - i = 0 - if preds.shape[0] > 21: - i = random.randint(0, preds.shape[0] - 21) - nemo.logging.info("Sampled preds: [%s]" % list2str(preds[i : i + 20])) - nemo.logging.info("Sampled labels: [%s]" % list2str(labels[i : i + 20])) - cm = confusion_matrix(labels, preds) - fig = plt.figure() - ax = fig.add_subplot(111) - cax = ax.matshow(cm) - plt.title('Confusion matrix of the classifier') - fig.colorbar(cax) - plt.xlabel('Predicted') - plt.ylabel('True') - os.makedirs(graph_fold, exist_ok=True) - plt.savefig(os.path.join(graph_fold, time.strftime('%Y%m%d-%H%M%S'))) - - nemo.logging.info(classification_report(labels, preds)) - - return dict({"accuracy": accuracy}) diff --git a/nemo/collections/nlp/utils/common_nlp_utils.py b/nemo/collections/nlp/utils/common_nlp_utils.py new file mode 100644 index 000000000000..47634ae71e83 --- /dev/null +++ b/nemo/collections/nlp/utils/common_nlp_utils.py @@ -0,0 +1,144 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import os +import re +import string + +import numpy as np + +from nemo import logging + +__all__ = [ + '_is_whitespace', + 'mask_padded_tokens', + 'read_intent_slot_outputs', + 'get_vocab', + 'write_vocab', + 'label2idx', + 'write_vocab_in_order', + 'if_exist', + 'remove_punctuation_from_sentence', + 'ids2text', + 'calc_class_weights', +] + + +def _is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + +def mask_padded_tokens(tokens, pad_id): + mask = tokens != pad_id + return mask + + +def read_intent_slot_outputs( + queries, intent_file, slot_file, intent_logits, slot_logits, slot_masks, intents=None, slots=None +): + intent_dict = get_vocab(intent_file) + slot_dict = get_vocab(slot_file) + pred_intents = np.argmax(intent_logits, 1) + pred_slots = np.argmax(slot_logits, axis=2) + slot_masks = slot_masks > 0.5 + for i, query in enumerate(queries): + logging.info(f'Query: {query}') + pred = pred_intents[i] + logging.info(f'Predicted intent:\t{pred}\t{intent_dict[pred]}') + if intents is not None: + logging.info(f'True intent:\t{intents[i]}\t{intent_dict[intents[i]]}') + + pred_slot = pred_slots[i][slot_masks[i]] + tokens = query.strip().split() + + if len(pred_slot) != len(tokens): + raise ValueError('Pred_slot and tokens must be of the same length') + + for j, token in enumerate(tokens): + output = f'{token}\t{slot_dict[pred_slot[j]]}' + if slots is not None: + output = f'{output}\t{slot_dict[slots[i][j]]}' + logging.info(output) + + +def get_vocab(file): + lines = open(file, 'r').readlines() + lines = [line.strip() for line in lines if line.strip()] + labels = {i: lines[i] for i in range(len(lines))} + return labels + + +def write_vocab(items, outfile): + vocab = {} + idx = 0 + with open(outfile, 'w') as f: + for item in items: + f.write(item + '\n') + vocab[item] = idx + idx += 1 + return vocab + + +def label2idx(file): + lines = open(file, 'r').readlines() + lines = [line.strip() for line in lines if line.strip()] + labels = {lines[i]: i for i in range(len(lines))} + return labels + + +def write_vocab_in_order(vocab, outfile): + with open(outfile, 'w') as f: + for key in sorted(vocab.keys()): + f.write(f'{vocab[key]}\n') + + +def if_exist(outfold, files): + if not os.path.exists(outfold): + return False + for file in files: + if not os.path.exists(f'{outfold}/{file}'): + return False + return True + + +def remove_punctuation_from_sentence(sentence): + sentence = re.sub('[' + string.punctuation + ']', '', sentence) + sentence = sentence.lower() + return sentence + + +def ids2text(ids, vocab): + return ' '.join([vocab[int(id_)] for id_ in ids]) + + +def calc_class_weights(label_freq): + """ + The goal is to give more weight to the classes with fewer samples + so as to match the one with the highest frequency. We achieve this by + dividing the highest frequency by the freq of each label. + Example - + [12, 5, 3] -> [12/12, 12/5, 12/3] -> [1, 2.4, 4] + + Here label_freq is assumed to be sorted by the frequency. I.e.
+ label_freq[0] is the most frequent element. + + """ + + most_common_label_freq = label_freq[0] + weighted_slots = sorted([(index, most_common_label_freq[1] / freq) for (index, freq) in label_freq]) + return [weight for (_, weight) in weighted_slots] diff --git a/nemo/collections/nlp/utils/loss_utils.py b/nemo/collections/nlp/utils/loss_utils.py new file mode 100644 index 000000000000..f491f7d43fa6 --- /dev/null +++ b/nemo/collections/nlp/utils/loss_utils.py @@ -0,0 +1,42 @@ +# ============================================================================= +# Copyright 2019 AI Applications Design Team at NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= + +import math + +__all__ = ['_compute_softmax'] + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs diff --git a/nemo/collections/nlp/utils/metrics/__init__.py b/nemo/collections/nlp/utils/metrics/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/nemo/collections/nlp/utils/nlp_utils.py b/nemo/collections/nlp/utils/nlp_utils.py deleted file mode 100644 index 1b1ef57bb27a..000000000000 --- a/nemo/collections/nlp/utils/nlp_utils.py +++ /dev/null @@ -1,123 +0,0 @@ -import os -import time - -import numpy as np -from matplotlib import pyplot as plt -from sklearn.metrics import confusion_matrix - -import nemo - - -def _is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False - - -def mask_padded_tokens(tokens, pad_id): - mask = tokens != pad_id - return mask - - -def read_intent_slot_outputs( - queries, intent_file, slot_file, intent_logits, slot_logits, slot_masks, intents=None, slots=None, -): - intent_dict = get_vocab(intent_file) - slot_dict = get_vocab(slot_file) - pred_intents = np.argmax(intent_logits, 1) - pred_slots = np.argmax(slot_logits, axis=2) - slot_masks = slot_masks > 0.5 - for i, query in enumerate(queries): - nemo.logging.info(f'Query: {query}') - pred = pred_intents[i] - nemo.logging.info(f'Predicted intent:\t{pred}\t{intent_dict[pred]}') - if intents is not None: - nemo.logging.info(f'True intent:\t{intents[i]}\t{intent_dict[intents[i]]}') - - pred_slot = pred_slots[i][slot_masks[i]] - tokens = query.strip().split() - - if len(pred_slot) != len(tokens): - raise ValueError('Pred_slot and tokens must be of the same length') - - for j, token in enumerate(tokens): - output = f'{token}\t{slot_dict[pred_slot[j]]}' - if slots is not None: - output = f'{output}\t{slot_dict[slots[i][j]]}' - nemo.logging.info(output) - - -def get_vocab(file): - 
lines = open(file, 'r').readlines() - lines = [line.strip() for line in lines if line.strip()] - labels = {i: lines[i] for i in range(len(lines))} - return labels - - -def write_vocab(items, outfile): - vocab = {} - idx = 0 - with open(outfile, 'w') as f: - for item in items: - f.write(item + '\n') - vocab[item] = idx - idx += 1 - return vocab - - -def label2idx(file): - lines = open(file, 'r').readlines() - lines = [line.strip() for line in lines if line.strip()] - labels = {lines[i]: i for i in range(len(lines))} - return labels - - -def write_vocab_in_order(vocab, outfile): - with open(outfile, 'w') as f: - for key in sorted(vocab.keys()): - f.write(f'{vocab[key]}\n') - - -def plot_confusion_matrix(label_ids, labels, preds, graph_fold, normalize=False, prefix=''): - ''' - Plot confusion matrix. - Args: - label_ids (dict): label to id map, for example: {'O': 0, 'LOC': 1} - labels (list of ints): list of true labels - preds (list of ints): list of predicted labels - graph_fold (str): path to output folder - normalize (bool): flag to indicate whether to normalize confusion matrix - prefix (str): prefix for the plot name - - ''' - # remove labels from label_ids that don't appear in the dev set - used_labels = set(labels) | set(preds) - label_ids = {k: label_ids[k] for k, v in label_ids.items() if v in used_labels} - - ids_to_labels = {label_ids[k]: k for k in label_ids} - classes = [ids_to_labels[id] for id in sorted(label_ids.values())] - - title = 'Confusion matrix' - cm = confusion_matrix(labels, preds) - if normalize: - sums = cm.sum(axis=1)[:, np.newaxis] - sums = np.where(sums == 0, 1, sums) - cm = cm.astype('float') / sums - title = 'Normalized ' + title - - fig = plt.figure() - ax = fig.add_subplot(111) - - cax = ax.matshow(cm) - ax.set_xticks(np.arange(-1, len(classes) + 1)) - ax.set_yticks(np.arange(-1, len(classes) + 1)) - ax.set_xticklabels([''] + classes, rotation=90) - ax.set_yticklabels([''] + classes) - ax.set_ylabel('True') - ax.set_xlabel('Predicted') - - os.makedirs(graph_fold, exist_ok=True) - fig.colorbar(cax) - - title = (prefix + ' ' + title).strip() - plt.savefig(os.path.join(graph_fold, title + '_' + time.strftime('%Y%m%d-%H%M%S'))) diff --git a/tests/nlp/test_bert.py b/tests/nlp/test_bert.py index ced011720b19..b15e040d7e1f 100644 --- a/tests/nlp/test_bert.py +++ b/tests/nlp/test_bert.py @@ -22,5 +22,5 @@ class TestBert(NeMoUnitTest): def test_list_pretrained_models(self): - pretrained_models = nemo_nlp.huggingface.BERT.list_pretrained_models() + pretrained_models = nemo_nlp.nm.trainables.huggingface.BERT.list_pretrained_models() self.assertTrue(len(pretrained_models) > 0) diff --git a/tests/nlp/test_spc_tokenizer.py b/tests/nlp/test_spc_tokenizer.py index ac8363a507d7..fa0259fbc120 100644 --- a/tests/nlp/test_spc_tokenizer.py +++ b/tests/nlp/test_spc_tokenizer.py @@ -16,7 +16,7 @@ # limitations under the License. 
# ============================================================================= -from nemo.collections.nlp import SentencePieceTokenizer +from nemo.collections.nlp.data import SentencePieceTokenizer from tests.common_setup import NeMoUnitTest diff --git a/tests/nlp/test_squad.py b/tests/nlp/test_squad.py index f6c7fe0580f1..8d1c460798ca 100644 --- a/tests/nlp/test_squad.py +++ b/tests/nlp/test_squad.py @@ -20,22 +20,53 @@ import os import shutil +from examples.nlp.scripts.get_squad import SquadDownloader + import nemo import nemo.collections.nlp as nemo_nlp -from nemo.collections.nlp.utils.callbacks.squad import eval_epochs_done_callback, eval_iter_callback -from nemo.collections.nlp.utils.download_squad import SquadDownloader +import nemo.collections.nlp.nm.data_layers.qa_squad_datalayer +import nemo.collections.nlp.nm.trainables.common.token_classification_nm +from nemo.collections.nlp.callbacks.qa_squad_callback import eval_epochs_done_callback, eval_iter_callback from nemo.utils.lr_policies import get_lr_policy from tests.common_setup import NeMoUnitTest +logging = nemo.logging + class TestSquad(NeMoUnitTest): @classmethod def setUpClass(cls) -> None: super().setUpClass() + data_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/nlp/squad")) + squad_v1_folder = os.path.join(data_folder, "v1.1") + for f in os.listdir(squad_v1_folder): + ff = os.path.join(squad_v1_folder, f) + if f.startswith("cache"): + logging.info(f"remove {ff}") + os.remove(ff) + squad_v2_folder = os.path.join(data_folder, "v2.0") + for f in os.listdir(squad_v2_folder): + ff = os.path.join(squad_v2_folder, f) + if f.startswith("cache"): + logging.info(f"remove {ff}") + os.remove(ff) @classmethod def tearDownClass(cls) -> None: super().tearDownClass() + data_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/nlp/squad")) + squad_v1_folder = os.path.join(data_folder, "v1.1") + for f in os.listdir(squad_v1_folder): + ff = os.path.join(squad_v1_folder, f) + if f.startswith("cache"): + logging.info(f"remove {ff}") + os.remove(ff) + squad_v2_folder = os.path.join(data_folder, "v2.0") + for f in os.listdir(squad_v2_folder): + ff = os.path.join(squad_v2_folder, f) + if f.startswith("cache"): + logging.info(f"remove {ff}") + os.remove(ff) def test_squad_v1(self): version_2_with_negative = False @@ -54,16 +85,18 @@ def test_squad_v1(self): max_answer_length = 20 null_score_diff_threshold = 0.0 - tokenizer = nemo_nlp.NemoBertTokenizer(pretrained_bert_model) + tokenizer = nemo_nlp.data.NemoBertTokenizer(pretrained_bert_model) neural_factory = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False, + backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False ) - model = nemo_nlp.huggingface.BERT(pretrained_model_name=pretrained_bert_model) + model = nemo.collections.nlp.nm.trainables.common.huggingface.BERT(pretrained_model_name=pretrained_bert_model) hidden_size = model.hidden_size - qa_head = nemo_nlp.TokenClassifier(hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False,) - squad_loss = nemo_nlp.QuestionAnsweringLoss() + qa_head = nemo.collections.nlp.nm.trainables.common.token_classification_nm.TokenClassifier( + hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False + ) + squad_loss = nemo_nlp.nm.losses.QuestionAnsweringLoss() - data_layer = nemo_nlp.BertQuestionAnsweringDataLayer( + data_layer =
nemo.collections.nlp.nm.data_layers.qa_squad_datalayer.BertQuestionAnsweringDataLayer( mode='train', version_2_with_negative=version_2_with_negative, batch_size=batch_size, @@ -74,14 +107,14 @@ def test_squad_v1(self): doc_stride=doc_stride, ) - (input_ids, input_type_ids, input_mask, start_positions, end_positions, _,) = data_layer() + (input_ids, input_type_ids, input_mask, start_positions, end_positions, _) = data_layer() - hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) + hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) qa_output = qa_head(hidden_states=hidden_states) - loss, _, _ = squad_loss(logits=qa_output, start_positions=start_positions, end_positions=end_positions,) + loss, _, _ = squad_loss(logits=qa_output, start_positions=start_positions, end_positions=end_positions) - data_layer_eval = nemo_nlp.BertQuestionAnsweringDataLayer( + data_layer_eval = nemo.collections.nlp.nm.data_layers.qa_squad_datalayer.BertQuestionAnsweringDataLayer( mode='dev', version_2_with_negative=version_2_with_negative, batch_size=batch_size, @@ -101,12 +134,12 @@ def test_squad_v1(self): ) = data_layer_eval() hidden_states_eval = model( - input_ids=input_ids_eval, token_type_ids=input_type_ids_eval, attention_mask=input_mask_eval, + input_ids=input_ids_eval, token_type_ids=input_type_ids_eval, attention_mask=input_mask_eval ) qa_output_eval = qa_head(hidden_states=hidden_states_eval) _, start_logits_eval, end_logits_eval = squad_loss( - logits=qa_output_eval, start_positions=start_positions_eval, end_positions=end_positions_eval, + logits=qa_output_eval, start_positions=start_positions_eval, end_positions=end_positions_eval ) eval_output = [start_logits_eval, end_logits_eval, unique_ids_eval] @@ -134,7 +167,7 @@ def test_squad_v1(self): eval_step=eval_step_freq, ) - lr_policy_fn = get_lr_policy('WarmupAnnealing', total_steps=max_steps, warmup_ratio=lr_warmup_proportion,) + lr_policy_fn = get_lr_policy('WarmupAnnealing', total_steps=max_steps, warmup_ratio=lr_warmup_proportion) neural_factory.train( tensors_to_optimize=[loss], @@ -161,16 +194,20 @@ def test_squad_v2(self): max_answer_length = 20 null_score_diff_threshold = 0.0 - tokenizer = nemo_nlp.NemoBertTokenizer(pretrained_bert_model) + tokenizer = nemo_nlp.data.NemoBertTokenizer(pretrained_bert_model) neural_factory = nemo.core.NeuralModuleFactory( - backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False, + backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False ) - model = nemo_nlp.huggingface.BERT(pretrained_model_name=pretrained_bert_model) + model = nemo.collections.nlp.nm.trainables.common.huggingface.BERT(pretrained_model_name=pretrained_bert_model) + hidden_size = model.hidden_size - qa_head = nemo_nlp.TokenClassifier(hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False,) - squad_loss = nemo_nlp.QuestionAnsweringLoss() - data_layer = nemo_nlp.BertQuestionAnsweringDataLayer( + qa_head = nemo.collections.nlp.nm.trainables.common.token_classification_nm.TokenClassifier( + hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False + ) + squad_loss = nemo_nlp.nm.losses.QuestionAnsweringLoss() + + data_layer = nemo.collections.nlp.nm.data_layers.qa_squad_datalayer.BertQuestionAnsweringDataLayer( mode='train', version_2_with_negative=version_2_with_negative, batch_size=batch_size, @@ -181,14 +218,14 @@ def test_squad_v2(self): doc_stride=doc_stride, ) - (input_ids, 
input_type_ids, input_mask, start_positions, end_positions, _,) = data_layer() + (input_ids, input_type_ids, input_mask, start_positions, end_positions, _) = data_layer() - hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask,) + hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask) qa_output = qa_head(hidden_states=hidden_states) - loss, _, _ = squad_loss(logits=qa_output, start_positions=start_positions, end_positions=end_positions,) + loss, _, _ = squad_loss(logits=qa_output, start_positions=start_positions, end_positions=end_positions) - data_layer_eval = nemo_nlp.BertQuestionAnsweringDataLayer( + data_layer_eval = nemo.collections.nlp.nm.data_layers.qa_squad_datalayer.BertQuestionAnsweringDataLayer( mode='dev', version_2_with_negative=version_2_with_negative, batch_size=batch_size, @@ -208,12 +245,12 @@ def test_squad_v2(self): ) = data_layer_eval() hidden_states_eval = model( - input_ids=input_ids_eval, token_type_ids=input_type_ids_eval, attention_mask=input_mask_eval, + input_ids=input_ids_eval, token_type_ids=input_type_ids_eval, attention_mask=input_mask_eval ) qa_output_eval = qa_head(hidden_states=hidden_states_eval) _, start_logits_eval, end_logits_eval = squad_loss( - logits=qa_output_eval, start_positions=start_positions_eval, end_positions=end_positions_eval, + logits=qa_output_eval, start_positions=start_positions_eval, end_positions=end_positions_eval ) eval_output = [start_logits_eval, end_logits_eval, unique_ids_eval] @@ -241,7 +278,7 @@ def test_squad_v2(self): eval_step=eval_step_freq, ) - lr_policy_fn = get_lr_policy('WarmupAnnealing', total_steps=max_steps, warmup_ratio=lr_warmup_proportion,) + lr_policy_fn = get_lr_policy('WarmupAnnealing', total_steps=max_steps, warmup_ratio=lr_warmup_proportion) neural_factory.train( tensors_to_optimize=[loss], diff --git a/tests/test_deploy_export.py b/tests/test_deploy_export.py index 5cde3cbbb10e..be6a1a39573c 100644 --- a/tests/test_deploy_export.py +++ b/tests/test_deploy_export.py @@ -31,6 +31,7 @@ import nemo import nemo.collections.asr as nemo_asr import nemo.collections.nlp as nemo_nlp +import nemo.collections.nlp.nm.trainables.common.token_classification_nm from tests.common_setup import NeMoUnitTest @@ -47,9 +48,7 @@ def __test_export_route(self, module, out_name, mode, input_example=None): if out.exists(): os.remove(out) - self.nf.deployment_export( - module=module, output=out_name, input_example=input_example, d_format=mode, - ) + self.nf.deployment_export(module=module, output=out_name, input_example=input_example, d_format=mode) self.assertTrue(out.exists()) if mode == nemo.core.DeploymentFormat.ONNX: @@ -89,7 +88,9 @@ def test_simple_module_export(self): ) def test_TokenClassifier_module_export(self): - t_class = nemo_nlp.TokenClassifier(hidden_size=512, num_classes=16, use_transformer_pretrained=False) + t_class = nemo.collections.nlp.nm.trainables.common.token_classification_nm.TokenClassifier( + hidden_size=512, num_classes=16, use_transformer_pretrained=False + ) self.__test_export_route( module=t_class, out_name="t_class.pt", @@ -98,7 +99,9 @@ def test_TokenClassifier_module_export(self): ) def test_TokenClassifier_module_onnx_export(self): - t_class = nemo_nlp.TokenClassifier(hidden_size=512, num_classes=16, use_transformer_pretrained=False) + t_class = nemo.collections.nlp.nm.trainables.common.token_classification_nm.TokenClassifier( + hidden_size=512, num_classes=16, use_transformer_pretrained=False + ) 
self.__test_export_route( module=t_class, out_name="t_class.onnx", @@ -109,25 +112,23 @@ def test_TokenClassifier_module_onnx_export(self): def test_jasper_decoder_export_ts(self): j_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=33) self.__test_export_route( - module=j_decoder, out_name="j_decoder.ts", mode=nemo.core.DeploymentFormat.TORCHSCRIPT, input_example=None, + module=j_decoder, out_name="j_decoder.ts", mode=nemo.core.DeploymentFormat.TORCHSCRIPT, input_example=None ) def test_hf_bert_ts(self): - bert = nemo_nlp.huggingface.BERT(pretrained_model_name="bert-base-uncased") + bert = nemo.collections.nlp.nm.trainables.common.huggingface.BERT(pretrained_model_name="bert-base-uncased") input_example = ( torch.randint(low=0, high=16, size=(2, 16)).cuda(), torch.randint(low=0, high=1, size=(2, 16)).cuda(), torch.randint(low=0, high=1, size=(2, 16)).cuda(), ) self.__test_export_route( - module=bert, out_name="bert.ts", mode=nemo.core.DeploymentFormat.TORCHSCRIPT, input_example=input_example, + module=bert, out_name="bert.ts", mode=nemo.core.DeploymentFormat.TORCHSCRIPT, input_example=input_example ) def test_hf_bert_pt(self): - bert = nemo_nlp.huggingface.BERT(pretrained_model_name="bert-base-uncased") - self.__test_export_route( - module=bert, out_name="bert.pt", mode=nemo.core.DeploymentFormat.PYTORCH, - ) + bert = nemo.collections.nlp.nm.trainables.common.huggingface.BERT(pretrained_model_name="bert-base-uncased") + self.__test_export_route(module=bert, out_name="bert.pt", mode=nemo.core.DeploymentFormat.PYTORCH) def test_jasper_encoder_to_onnx(self): with open("tests/data/jasper_smaller.yaml") as file: @@ -144,5 +145,5 @@ def test_jasper_encoder_to_onnx(self): module=jasper_encoder, out_name="jasper_encoder.onnx", mode=nemo.core.DeploymentFormat.ONNX, - input_example=(torch.randn(16, 64, 256).cuda(), torch.randn(256).cuda(),), + input_example=(torch.randn(16, 64, 256).cuda(), torch.randn(256).cuda()), ) diff --git a/tests/test_infer.py b/tests/test_infer.py index c6faeb8cdcec..05cec60c6fb9 100644 --- a/tests/test_infer.py +++ b/tests/test_infer.py @@ -105,22 +105,20 @@ def test_infer_errors(self): with self.assertRaisesRegex(ValueError, "use_cache was set, but cache was empty"): evaluated_tensors = neural_factory.infer( - tensors=[twenty_tensor, thirty_tensor], verbose=False, use_cache=True, + tensors=[twenty_tensor, thirty_tensor], verbose=False, use_cache=True ) new_ten_tensor = minusten(mod_in=twenty_tensor) evaluated_tensors = neural_factory.infer(tensors=[new_ten_tensor], verbose=False, cache=True) with self.assertRaisesRegex(ValueError, "cache was set but was not empty"): - evaluated_tensors = neural_factory.infer( - tensors=[twenty_tensor, thirty_tensor], verbose=False, cache=True, - ) + evaluated_tensors = neural_factory.infer(tensors=[twenty_tensor, thirty_tensor], verbose=False, cache=True) neural_factory.clear_cache() evaluated_tensors = neural_factory.infer(tensors=[new_ten_tensor], verbose=False, cache=True) with self.assertRaisesRegex(ValueError, "cache and use_cache were both set."): evaluated_tensors = neural_factory.infer( - tensors=[twenty_tensor, thirty_tensor], verbose=False, cache=True, use_cache=True, + tensors=[twenty_tensor, thirty_tensor], verbose=False, cache=True, use_cache=True ) self.assertEqual(evaluated_tensors[0][0].squeeze().data, 10) diff --git a/tests/test_neural_types.py b/tests/test_neural_types.py index eb52abcffd7b..c2741ca3d7c6 100644 --- a/tests/test_neural_types.py +++ b/tests/test_neural_types.py @@ -43,13 +43,13 @@ def 
setUp(self) -> None: logging.info("ASR data found in: {0}".format(data_folder + "asr")) def test_same(self): - btc = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}) - btc2 = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}) + btc = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}) + btc2 = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}) self.assertEqual(btc2.compare(btc), NeuralTypeComparisonResult.SAME) def test_transpose_same(self): - btc = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}) - tbc = NeuralType(axis2type={1: AxisType(BatchTag), 0: AxisType(TimeTag), 2: AxisType(ChannelTag),}) + btc = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}) + tbc = NeuralType(axis2type={1: AxisType(BatchTag), 0: AxisType(TimeTag), 2: AxisType(ChannelTag)}) self.assertEqual(btc.compare(tbc), NeuralTypeComparisonResult.TRANSPOSE_SAME) self.assertEqual(tbc.compare(btc), NeuralTypeComparisonResult.TRANSPOSE_SAME) @@ -74,9 +74,9 @@ def test_dim_incompatible(self): self.assertEqual(nchw1.compare(nchw2), NeuralTypeComparisonResult.DIM_INCOMPATIBLE) def test_rank_incompatible(self): - btc = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}) + btc = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}) nchw = NeuralType( - axis2type={0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(HeightTag), 3: AxisType(WidthTag),} + axis2type={0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(HeightTag), 3: AxisType(WidthTag)} ) self.assertEqual(nchw.compare(btc), NeuralTypeComparisonResult.INCOMPATIBLE) @@ -91,10 +91,10 @@ def test_axis_type(self): def test_semantic_incompatible(self): nchw = NeuralType( - axis2type={0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(HeightTag), 3: AxisType(WidthTag),} + axis2type={0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(HeightTag), 3: AxisType(WidthTag)} ) badd = NeuralType( - axis2type={0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(ChannelTag), 3: AxisType(WidthTag),} + axis2type={0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(ChannelTag), 3: AxisType(WidthTag)} ) self.assertEqual(nchw.compare(badd), NeuralTypeComparisonResult.INCOMPATIBLE) self.assertEqual(badd.compare(nchw), NeuralTypeComparisonResult.INCOMPATIBLE) @@ -102,9 +102,9 @@ def test_semantic_incompatible(self): def test_root(self): root = NeuralType({}) non_tensor = NeuralType(None) - btc = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag),}) + btc = NeuralType(axis2type={0: AxisType(BatchTag), 1: AxisType(TimeTag), 2: AxisType(ChannelTag)}) nchw = NeuralType( - axis2type={0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(HeightTag), 3: AxisType(WidthTag),} + axis2type={0: AxisType(BatchTag), 1: AxisType(ChannelTag), 2: AxisType(HeightTag), 3: AxisType(WidthTag)} ) self.assertEqual(root.compare(btc), NeuralTypeComparisonResult.SAME) self.assertEqual(root.compare(nchw), NeuralTypeComparisonResult.SAME) @@ -117,10 +117,10 @@ def test_root(self): def test_combiner_type_infer(self): combiner = nemo.backends.pytorch.common.SimpleCombiner(mode="add") x_tg = nemo.core.NmTensor( - producer=None, producer_args=None, name=None, ntype=NeuralType({0: 
AxisType(BatchTag),}), + producer=None, producer_args=None, name=None, ntype=NeuralType({0: AxisType(BatchTag)}) ) y_tg = nemo.core.NmTensor( - producer=None, producer_args=None, name=None, ntype=NeuralType({0: AxisType(BatchTag),}), + producer=None, producer_args=None, name=None, ntype=NeuralType({0: AxisType(BatchTag)}) ) res = combiner(x1=y_tg, x2=x_tg) self.assertEqual(res.compare(x_tg), NeuralTypeComparisonResult.SAME) @@ -157,7 +157,7 @@ def test_optional_input_no_input(self): optimizer = nemo.backends.pytorch.actions.PtActions() optimizer.train( - tensors_to_optimize=[loss_tensor], optimizer="sgd", optimization_params={"lr": 0.0003, "num_epochs": 1}, + tensors_to_optimize=[loss_tensor], optimizer="sgd", optimization_params={"lr": 0.0003, "num_epochs": 1} ) def test_optional_input_no_with_input(self): @@ -169,7 +169,7 @@ def test_optional_input_no_with_input(self): loss_tensor = loss(predictions=y_pred, target=y) optimizer = nemo.backends.pytorch.actions.PtActions() optimizer.train( - tensors_to_optimize=[loss_tensor], optimizer="sgd", optimization_params={"lr": 0.0003, "num_epochs": 1}, + tensors_to_optimize=[loss_tensor], optimizer="sgd", optimization_params={"lr": 0.0003, "num_epochs": 1} ) def test_optional_input_no_with_wrong_input(self): @@ -188,9 +188,7 @@ def wrong_fn(): loss_tensor = loss(predictions=y_pred, target=y) optimizer = nemo.backends.pytorch.actions.PtActions() optimizer.train( - tensors_to_optimize=[loss_tensor], - optimizer="sgd", - optimization_params={"lr": 0.0003, "num_epochs": 1}, + tensors_to_optimize=[loss_tensor], optimizer="sgd", optimization_params={"lr": 0.0003, "num_epochs": 1} ) self.assertRaises(NeuralPortNmTensorMismatchError, wrong_fn) @@ -202,7 +200,7 @@ def test_simple_dags(self): labels = jasper_model_definition['labels'] data_layer = nemo_asr.AudioToTextDataLayer( - manifest_filepath=self.manifest_filepath, labels=labels, batch_size=4, + manifest_filepath=self.manifest_filepath, labels=labels, batch_size=4 ) data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor( **jasper_model_definition['AudioToMelSpectrogramPreprocessor'] @@ -216,7 +214,7 @@ def test_simple_dags(self): greedy_decoder = nemo_asr.GreedyCTCDecoder() # DAG definition - (audio_signal, audio_signal_len, transcript, transcript_len,) = data_layer() + (audio_signal, audio_signal_len, transcript, transcript_len) = data_layer() processed_signal, processed_signal_len = data_preprocessor(input_signal=audio_signal, length=audio_signal_len) spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5) @@ -226,7 +224,7 @@ def test_simple_dags(self): log_probs = jasper_decoder(encoder_output=encoded) predictions = greedy_decoder(log_probs=log_probs) loss = ctc_loss( - log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len, + log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len ) def wrong(): @@ -235,7 +233,7 @@ def wrong(): labels = jasper_config['labels'] data_layer = nemo_asr.AudioToTextDataLayer( - manifest_filepath=self.manifest_filepath, labels=labels, batch_size=4, + manifest_filepath=self.manifest_filepath, labels=labels, batch_size=4 ) data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor( **jasper_config['AudioToMelSpectrogramPreprocessor'] @@ -246,7 +244,7 @@ def wrong(): ) jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(labels)) # DAG definition - (audio_signal, audio_signal_len, transcript, transcript_len,) = data_layer() + (audio_signal, 
audio_signal_len, transcript, transcript_len) = data_layer() processed_signal, processed_signal_len = data_preprocessor( input_signal=audio_signal, length=audio_signal_len )