Commit
nlp refactoring (#316)
Signed-off-by: VahidooX <[email protected]>
Co-authored-by: Yang Zhang <[email protected]>
Co-authored-by: Tomasz Kornuta <[email protected]>
Co-authored-by: Evelina <[email protected]>
4 people authored Feb 4, 2020
1 parent b4aba85 commit 42a5f20
Showing 123 changed files with 6,616 additions and 5,657 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -84,6 +84,10 @@ To release a new version, please update the changelog as follows:
([PR #286](https://github.com/NVIDIA/NeMo/pull/286)) - @stasbel
- Major cleanup of Neural Module constructors (init), aiming to increase framework robustness: cleanup of NeuralModule initialization logic, refactoring of trainer/actions (getting rid of local_params), fixes to several examples and unit tests, and extraction and storage of initial parameters (init_params).
([PR #309](https://github.com/NVIDIA/NeMo/pull/309)) - @tkornuta-nvidia
- Refactoring of the `nemo_nlp` collection:
([PR #316](https://github.com/NVIDIA/NeMo/pull/316)) - @VahidooX, @yzhang123, @ekmb
- renamed files and restructured folders in `nemo_nlp` (see the import sketch below this diff)
- updated licenses


### Dependencies Update
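In practice, the `nemo_nlp` refactoring above boils down to a namespace reorganization. A minimal before/after sketch, inferred from the notebook diffs in this commit; only names that appear in those diffs are used, and exact signatures may differ:

```python
import nemo.collections.nlp as nemo_nlp

# Before PR #316, modules hung directly off the collection root:
# bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name="bert-base-cased")
# ner_loss = nemo_nlp.TokenClassificationLoss(...)

# After PR #316, modules are grouped by role under nemo_nlp.nm,
# and tokenizers and callbacks live in dedicated subpackages:
from nemo.collections.nlp.data import NemoBertTokenizer
from nemo.collections.nlp.nm.trainables import TokenClassifier
from nemo.collections.nlp.nm.losses import TokenClassificationLoss

bert_model = nemo_nlp.nm.trainables.huggingface.BERT(
    pretrained_model_name="bert-base-cased")
```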
2 changes: 1 addition & 1 deletion Jenkinsfile
@@ -60,7 +60,7 @@ pipeline {
}
stage ('NMT test') {
steps {
-sh 'cd examples/nlp && CUDA_VISIBLE_DEVICES=0 python nmt_tutorial.py'
+sh 'cd examples/nlp && CUDA_VISIBLE_DEVICES=0 python machine_translation_tutorial.py'
}
}
}
1 change: 0 additions & 1 deletion examples/asr/notebooks/2_Online_ASR_Microphone_Demo.ipynb
@@ -173,7 +173,6 @@
"data_layer = AudioDataLayer()\n",
"\n",
"data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(\n",
" factory=neural_factory,\n",
" **model_definition['AudioToMelSpectrogramPreprocessor'])\n",
"\n",
"jasper_encoder = nemo_asr.JasperEncoder(\n",
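The only change to this notebook is dropping the explicit `factory` argument. Presumably, after the constructor cleanup in PR #309 noted in the changelog above, modules bind to the active neural factory implicitly, so the call reduces to the following sketch (the `model_definition` dict is the notebook's own):

```python
import nemo.collections.asr as nemo_asr

# factory=neural_factory is no longer passed explicitly; the module is
# assumed to register with the current NeuralModuleFactory on its own.
data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
    **model_definition['AudioToMelSpectrogramPreprocessor'])
```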
26 changes: 17 additions & 9 deletions examples/nlp/BERTPretrainingTutorial.ipynb
@@ -58,8 +58,8 @@
"from nemo.utils.lr_policies import CosineAnnealing\n",
"\n",
"import nemo.collections.nlp as nemo_nlp\n",
"from nemo.collections.nlp import NemoBertTokenizer, SentencePieceTokenizer\n",
"from nemo.collections.nlp.utils.callbacks.bert_pretraining import eval_iter_callback, \\\n",
"from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer\n",
"from nemo.collections.nlp.callbacks.lm_bert_callback import eval_iter_callback, \\\n",
" eval_epochs_done_callback\n",
"\n",
"BATCHES_PER_STEP = 1\n",
@@ -126,7 +126,7 @@
"metadata": {},
"outputs": [],
"source": [
"bert_model = nemo_nlp.huggingface.BERT(\n",
"bert_model = nemo_nlp.nm.trainables.huggingface.BERT(\n",
" vocab_size=tokenizer.vocab_size,\n",
" num_hidden_layers=NUM_LAYERS,\n",
" hidden_size=D_MODEL,\n",
@@ -144,21 +144,21 @@
"outputs": [],
"source": [
"# Masked Language Modeling Loss\n",
"mlm_classifier = nemo_nlp.BertTokenClassifier(D_MODEL,\n",
"mlm_classifier = nemo_nlp.nm.trainables.BertTokenClassifier(D_MODEL,\n",
" num_classes=tokenizer.vocab_size,\n",
" activation=HIDDEN_ACT,\n",
" log_softmax=True)\n",
"mlm_loss = nemo_nlp.MaskedLanguageModelingLossNM()\n",
"mlm_loss = nemo_nlp.nm.losses.MaskedLanguageModelingLossNM()\n",
"\n",
"# Next Sentence Prediciton Loss\n",
"nsp_classifier = nemo_nlp.SequenceClassifier(D_MODEL,\n",
"nsp_classifier = nemo_nlp.nm.trainables.SequenceClassifier(D_MODEL,\n",
" num_classes=2,\n",
" num_layers=2,\n",
" activation='tanh',\n",
" log_softmax=False)\n",
"nsp_loss = nemo.backends.pytorch.common.CrossEntropyLoss()\n",
"\n",
"bert_loss = nemo_nlp.LossAggregatorNM(num_inputs=2)"
"bert_loss = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2)"
]
},
{
@@ -167,15 +167,16 @@
"metadata": {},
"outputs": [],
"source": [
"train_data_layer = nemo_nlp.BertPretrainingDataLayer(\n",
"import os\n",
"train_data_layer = nemo_nlp.nm.data_layers.BertPretrainingDataLayer(\n",
" tokenizer=tokenizer,\n",
" dataset=os.path.join(\"data/lm/wikitext-2\", \"train.txt\"),\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
" mask_probability=MASK_PROBABILITY,\n",
" batch_size=BATCH_SIZE\n",
")\n",
"\n",
"eval_data_layer = nemo_nlp.BertPretrainingDataLayer(\n",
"eval_data_layer = nemo_nlp.nm.data_layers.BertPretrainingDataLayer(\n",
" tokenizer=tokenizer,\n",
" dataset=os.path.join(\"data/lm/wikitext-2\", \"valid.txt\"),\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
@@ -282,6 +283,13 @@
" \"grad_norm_clip\": None\n",
" })"
]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": []
}
],
"metadata": {
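Condensed, the cells above wire the pretraining graph as follows under the new layout. This is a sketch, not a runnable recipe: the hyperparameter values stand in for the notebook's own definitions.

```python
import os
import nemo
import nemo.collections.nlp as nemo_nlp
from nemo.collections.nlp.data import NemoBertTokenizer

tokenizer = NemoBertTokenizer(pretrained_model="bert-base-cased")
D_MODEL, MAX_SEQ_LENGTH, MASK_PROBABILITY, BATCH_SIZE = 768, 128, 0.15, 64

# Data layers now live under nm.data_layers.
train_data_layer = nemo_nlp.nm.data_layers.BertPretrainingDataLayer(
    tokenizer=tokenizer,
    dataset=os.path.join("data/lm/wikitext-2", "train.txt"),
    max_seq_length=MAX_SEQ_LENGTH,
    mask_probability=MASK_PROBABILITY,
    batch_size=BATCH_SIZE)

# Two heads over the shared encoder, both under nm.trainables.
mlm_classifier = nemo_nlp.nm.trainables.BertTokenClassifier(
    D_MODEL, num_classes=tokenizer.vocab_size,
    activation='gelu', log_softmax=True)  # 'gelu' stands in for the notebook's HIDDEN_ACT
nsp_classifier = nemo_nlp.nm.trainables.SequenceClassifier(
    D_MODEL, num_classes=2, num_layers=2,
    activation='tanh', log_softmax=False)

# Losses under nm.losses; the aggregator sums the two task losses.
mlm_loss = nemo_nlp.nm.losses.MaskedLanguageModelingLossNM()
nsp_loss = nemo.backends.pytorch.common.CrossEntropyLoss()
bert_loss = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2)
```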
27 changes: 14 additions & 13 deletions examples/nlp/NERWithBERT.ipynb
@@ -13,16 +13,18 @@
"from nemo.utils.lr_policies import WarmupAnnealing\n",
"\n",
"import nemo.collections.nlp as nemo_nlp\n",
"from nemo.collections.nlp import NemoBertTokenizer, SentencePieceTokenizer\n",
"from nemo.collections.nlp.utils.callbacks.token_classification import \\\n",
" eval_iter_callback, eval_epochs_done_callback"
"from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer\n",
"from nemo.collections.nlp.callbacks.token_classification_callback import \\\n",
" eval_iter_callback, eval_epochs_done_callback\n",
"from nemo.collections.nlp.nm.losses import TokenClassificationLoss\n",
"from nemo.collections.nlp.nm.trainables import TokenClassifier"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can download data from [here](https://github.com/kyzhouhzau/BERT-NER/tree/master/data) and use [this](https://github.com/NVIDIA/NeMo/blob/master/scripts/convert_iob_format_to_token_classification_format.py) script to preprocess it."
"You can download data from [here](https://github.com/kyzhouhzau/BERT-NER/tree/master/data) and use [this](https://github.com/NVIDIA/NeMo/blob/master/nemo/collections/nlp/data/scripts/convert_iob_format_to_token_classification_format.py) script to preprocess it."
]
},
{
@@ -78,7 +80,7 @@
"# If you're using a standard BERT model, you should do it like this. To see the full\n",
"# list of BERT model names, check out nemo_nlp.huggingface.BERT.list_pretrained_models()\n",
"tokenizer = NemoBertTokenizer(pretrained_model=\"bert-base-cased\")\n",
"bert_model = nemo_nlp.huggingface.BERT(\n",
"bert_model = nemo_nlp.nm.trainables.huggingface.BERT(\n",
" pretrained_model_name=\"bert-base-cased\")"
]
},
@@ -89,7 +91,7 @@
"outputs": [],
"source": [
"# Describe training DAG\n",
"train_data_layer = nemo_nlp.BertTokenClassificationDataLayer(\n",
"train_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationDataLayer(\n",
" tokenizer=tokenizer,\n",
" text_file=os.path.join(DATA_DIR, 'text_train.txt'),\n",
" label_file=os.path.join(DATA_DIR, 'labels_train.txt'),\n",
@@ -99,13 +101,12 @@
"label_ids = train_data_layer.dataset.label_ids\n",
"num_classes = len(label_ids)\n",
"\n",
"ner_classifier = nemo_nlp.TokenClassifier(hidden_size=bert_model.hidden_size,\n",
"hidden_size = bert_model.hidden_size\n",
"ner_classifier = TokenClassifier(hidden_size=hidden_size,\n",
" num_classes=num_classes,\n",
" dropout=CLASSIFICATION_DROPOUT)\n",
"\n",
"ner_loss = nemo_nlp.TokenClassificationLoss(d_model=hidden_size,\n",
" num_classes=len(label_ids),\n",
" dropout=CLASSIFICATION_DROPOUT)\n",
"ner_loss = TokenClassificationLoss(num_classes=len(label_ids))\n",
"\n",
"input_ids, input_type_ids, input_mask, loss_mask, _, labels = train_data_layer()\n",
"\n",
@@ -124,7 +125,7 @@
"outputs": [],
"source": [
"# Describe evaluation DAG\n",
"eval_data_layer = nemo_nlp.BertTokenClassificationDataLayer(\n",
"eval_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationDataLayer(\n",
" tokenizer=tokenizer,\n",
" text_file=os.path.join(DATA_DIR, 'text_dev.txt'),\n",
" label_file=os.path.join(DATA_DIR, 'labels_dev.txt'),\n",
@@ -203,9 +204,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.7.4 64-bit",
"display_name": "Python 3",
"language": "python",
"name": "python37464bitc56e562f54084a24b5afed5459c99218"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
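Beyond import paths, the substantive API change in this notebook shows up in the loss cell: `TokenClassificationLoss` no longer accepts `d_model` or `dropout` (those belong to the classifier head) and needs only the size of the label space. A sketch of the new split, with hypothetical sizes:

```python
from nemo.collections.nlp.nm.losses import TokenClassificationLoss
from nemo.collections.nlp.nm.trainables import TokenClassifier

NUM_CLASSES = 9    # hypothetical tag-set size; the notebook derives it from its data layer
HIDDEN_SIZE = 768  # hypothetical; the notebook reads bert_model.hidden_size

# Model-facing arguments live on the classifier head...
ner_classifier = TokenClassifier(hidden_size=HIDDEN_SIZE,
                                 num_classes=NUM_CLASSES,
                                 dropout=0.1)
# ...while the loss needs only the number of classes.
ner_loss = TokenClassificationLoss(num_classes=NUM_CLASSES)
```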
38 changes: 21 additions & 17 deletions examples/nlp/PunctuationWithBERT.ipynb
@@ -11,12 +11,15 @@
"import os\n",
"\n",
"import nemo\n",
"from nemo import logging\n",
"from nemo.utils.lr_policies import WarmupAnnealing\n",
"\n",
"import nemo.collections.nlp as nemo_nlp\n",
"from nemo.collections.nlp import NemoBertTokenizer, TokenClassifier, TokenClassificationLoss\n",
"from nemo.collections.nlp.data.datasets import utils\n",
"from nemo.collections.nlp.utils.callbacks.punctuation_capitalization import eval_iter_callback, eval_epochs_done_callback\n",
"from nemo.collections.nlp.data import NemoBertTokenizer\n",
"from nemo.collections.nlp.nm.trainables import TokenClassifier\n",
"from nemo.collections.nlp.nm.losses import TokenClassificationLoss, LossAggregatorNM\n",
"from nemo.collections.nlp.callbacks.punctuation_capitalization_callback import eval_iter_callback, eval_epochs_done_callback\n",
"from nemo.collections.nlp.utils.common_nlp_utils import calc_class_weights\n",
"\n",
"DATA_DIR = \"PATH_TO_WHERE_THE_DATA_IS\"\n",
"WORK_DIR = \"PATH_TO_WHERE_TO_STORE_CHECKPOINTS_AND_LOGS\"\n",
@@ -47,7 +50,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook we're going to use a subset of English examples from the [Tatoeba collection of sentences](https://tatoeba.org/eng), set NUM_SAMPLES=-1 and consider including other datasets to improve the performance of the model. Use [NeMo/scripts/get_tatoeba_data.py](https://github.com/NVIDIA/NeMo/blob/master/scripts/get_tatoeba_data.py) to download and preprocess the Tatoeba data."
"In this notebook we're going to use a subset of English examples from the [Tatoeba collection of sentences](https://tatoeba.org/eng), set NUM_SAMPLES=-1 and consider including other datasets to improve the performance of the model. Use [NeMo/nemo/collections/nlp/data/scripts/get_tatoeba_data.py](https://github.com/NVIDIA/NeMo/blob/master/nemo/collections/nlp/data/scripts/get_tatoeba_data.py) to download and preprocess the Tatoeba data."
]
},
{
@@ -57,7 +60,8 @@
"outputs": [],
"source": [
"# This should take about a minute since the data is already downloaded in the previous step\n",
"! python ../../scripts/get_tatoeba_data.py --data_dir $DATA_DIR --num_sample $NUM_SAMPLES"
"\n",
"! python ../../nemo/collections/nlp/data/scripts/get_tatoeba.py --data_dir $DATA_DIR --num_sample $NUM_SAMPLES"
]
},
{
@@ -116,7 +120,7 @@
"# list of BERT model names, check out nemo_nlp.huggingface.BERT.list_pretrained_models()\n",
"\n",
"tokenizer = NemoBertTokenizer(pretrained_model=PRETRAINED_BERT_MODEL)\n",
"bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=PRETRAINED_BERT_MODEL)"
"bert_model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=PRETRAINED_BERT_MODEL)"
]
},
{
Expand All @@ -132,7 +136,7 @@
"metadata": {},
"outputs": [],
"source": [
"train_data_layer = nemo_nlp.BertPunctuationCapitalizationDataLayer(\n",
"train_data_layer = nemo_nlp.nm.data_layers.PunctuationCapitalizationDataLayer(\n",
" tokenizer=tokenizer,\n",
" text_file=os.path.join(DATA_DIR, 'text_train.txt'),\n",
" label_file=os.path.join(DATA_DIR, 'labels_train.txt'),\n",
@@ -144,14 +148,14 @@
"\n",
"\n",
"# Define classifier for Punctuation and Capitalization tasks\n",
"punct_classifier = nemo_nlp.TokenClassifier(\n",
"punct_classifier = TokenClassifier(\n",
" hidden_size=bert_model.hidden_size,\n",
" num_classes=len(punct_label_ids),\n",
" dropout=CLASSIFICATION_DROPOUT,\n",
" num_layers=PUNCT_NUM_FC_LAYERS,\n",
" name='Punctuation')\n",
"\n",
"capit_classifier = nemo_nlp.TokenClassifier(\n",
"capit_classifier = TokenClassifier(\n",
" hidden_size=bert_model.hidden_size,\n",
" num_classes=len(capit_label_ids),\n",
" dropout=CLASSIFICATION_DROPOUT,\n",
@@ -160,14 +164,14 @@
"\n",
"# If you don't want to use weighted loss for Punctuation task, use class_weights=None\n",
"punct_label_freqs = train_data_layer.dataset.punct_label_frequencies\n",
"class_weights = utils.calc_class_weights(punct_label_freqs)\n",
"class_weights = calc_class_weights(punct_label_freqs)\n",
"\n",
"# define loss\n",
"punct_loss = nemo_nlp.TokenClassificationLoss(\n",
"punct_loss = TokenClassificationLoss(\n",
" num_classes=len(punct_label_ids),\n",
" class_weights=class_weights)\n",
"capit_loss = nemo_nlp.TokenClassificationLoss(num_classes=len(capit_label_ids))\n",
"task_loss = nemo_nlp.LossAggregatorNM(num_inputs=2)"
"capit_loss = TokenClassificationLoss(num_classes=len(capit_label_ids))\n",
"task_loss = LossAggregatorNM(num_inputs=2)"
]
},
{
Expand Down Expand Up @@ -218,7 +222,7 @@
"# during creation of the train_data_layer to make sure that the mapping is correct in case some of the labels from\n",
"# the train set are missing in the dev set.\n",
"\n",
"eval_data_layer = nemo_nlp.BertPunctuationCapitalizationDataLayer(\n",
"eval_data_layer = nemo_nlp.nm.data_layers.PunctuationCapitalizationDataLayer(\n",
" tokenizer=tokenizer,\n",
" text_file=os.path.join(DATA_DIR, 'text_dev.txt'),\n",
" label_file=os.path.join(DATA_DIR, 'labels_dev.txt'),\n",
@@ -361,7 +365,7 @@
"metadata": {},
"outputs": [],
"source": [
"infer_data_layer = nemo_nlp.BertTokenClassificationInferDataLayer(\n",
"infer_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationInferDataLayer(\n",
" queries=queries,\n",
" tokenizer=tokenizer,\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
@@ -399,7 +403,7 @@
"capit_preds = np.argmax(capit_logits, axis=2)\n",
"\n",
"for i, query in enumerate(queries):\n",
" nf.logger.info(f'Query: {query}')\n",
" logging(f'Query: {query}')\n",
"\n",
" punct_pred = punct_preds[i][subtokens_mask[i] > 0.5]\n",
" capit_pred = capit_preds[i][subtokens_mask[i] > 0.5]\n",
@@ -419,7 +423,7 @@
" if punct_label != 'O':\n",
" output += punct_label\n",
" output += ' '\n",
" nf.logger.info(f'Combined: {output.strip()}\\n')"
" logging(f'Combined: {output.strip()}\\n')"
]
},
{
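The pattern in this notebook, one `TokenClassifier` per task with a class-weighted loss for punctuation, an unweighted one for capitalization, and a `LossAggregatorNM` combining the two, condenses to the sketch below. The frequency values are made up; the notebook reads them from `train_data_layer.dataset.punct_label_frequencies`:

```python
from nemo.collections.nlp.nm.losses import LossAggregatorNM, TokenClassificationLoss
from nemo.collections.nlp.utils.common_nlp_utils import calc_class_weights

punct_label_freqs = [9000, 500, 300, 200]  # stand-in counts; real values come from the data layer
class_weights = calc_class_weights(punct_label_freqs)  # use class_weights=None to disable weighting

punct_loss = TokenClassificationLoss(num_classes=len(punct_label_freqs),
                                     class_weights=class_weights)
capit_loss = TokenClassificationLoss(num_classes=2)  # lower case vs. upper case
task_loss = LossAggregatorNM(num_inputs=2)           # sums punct_loss and capit_loss
```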
(Diff truncated by the viewer: the remaining changed files are not shown.)

