
Commit

add all changed nlp files
Signed-off-by: VahidooX <[email protected]>
VahidooX committed Jan 31, 2020
1 parent ff7774b commit fdc421b
Showing 117 changed files with 5,185 additions and 5,394 deletions.
18 changes: 9 additions & 9 deletions examples/nlp/BERTPretrainingTutorial.ipynb
@@ -58,8 +58,8 @@
"from nemo.utils.lr_policies import CosineAnnealing\n",
"\n",
"import nemo.collections.nlp as nemo_nlp\n",
"from nemo.collections.nlp import NemoBertTokenizer, SentencePieceTokenizer\n",
"from nemo.collections.nlp.utils.callbacks.bert_pretraining import eval_iter_callback, \\\n",
"from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer\n",
"from nemo.collections.nlp.callbacks.lm_bert_callback import eval_iter_callback, \\\n",
" eval_epochs_done_callback\n",
"\n",
"BATCHES_PER_STEP = 1\n",
@@ -126,7 +126,7 @@
"metadata": {},
"outputs": [],
"source": [
"bert_model = nemo_nlp.huggingface.BERT(\n",
"bert_model = nemo_nlp.nm.trainables.huggingface.BERT(\n",
" vocab_size=tokenizer.vocab_size,\n",
" num_hidden_layers=NUM_LAYERS,\n",
" hidden_size=D_MODEL,\n",
@@ -144,21 +144,21 @@
"outputs": [],
"source": [
"# Masked Language Modeling Loss\n",
"mlm_classifier = nemo_nlp.BertTokenClassifier(D_MODEL,\n",
"mlm_classifier = nemo_nlp.nm.trainables.BertTokenClassifier(D_MODEL,\n",
" num_classes=tokenizer.vocab_size,\n",
" activation=HIDDEN_ACT,\n",
" log_softmax=True)\n",
"mlm_loss = nemo_nlp.MaskedLanguageModelingLossNM()\n",
"mlm_loss = nemo_nlp.nm.losses.MaskedLanguageModelingLossNM()\n",
"\n",
"# Next Sentence Prediciton Loss\n",
"nsp_classifier = nemo_nlp.SequenceClassifier(D_MODEL,\n",
"nsp_classifier = nemo_nlp.nm.trainables.SequenceClassifier(D_MODEL,\n",
" num_classes=2,\n",
" num_layers=2,\n",
" activation='tanh',\n",
" log_softmax=False)\n",
"nsp_loss = nemo.backends.pytorch.common.CrossEntropyLoss()\n",
"\n",
"bert_loss = nemo_nlp.LossAggregatorNM(num_inputs=2)"
"bert_loss = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2)"
]
},
{
@@ -168,15 +168,15 @@
"outputs": [],
"source": [
"import os\n",
"train_data_layer = nemo_nlp.BertPretrainingDataLayer(\n",
"train_data_layer = nemo_nlp.nm.data_layers.BertPretrainingDataLayer(\n",
" tokenizer=tokenizer,\n",
" dataset=os.path.join(\"data/lm/wikitext-2\", \"train.txt\"),\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
" mask_probability=MASK_PROBABILITY,\n",
" batch_size=BATCH_SIZE,\n",
" factory=neural_factory)\n",
"\n",
"eval_data_layer = nemo_nlp.BertPretrainingDataLayer(\n",
"eval_data_layer = nemo_nlp.nm.data_layers.BertPretrainingDataLayer(\n",
" tokenizer=tokenizer,\n",
" dataset=os.path.join(\"data/lm/wikitext-2\", \"valid.txt\"),\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
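The hunks above move BERTPretrainingTutorial.ipynb from the flat `nemo_nlp.*` namespace to the reorganized sub-packages: trainable modules under `nemo_nlp.nm.trainables`, losses under `nemo_nlp.nm.losses`, data layers under `nemo_nlp.nm.data_layers`, and tokenizers under `nemo.collections.nlp.data`. A minimal sketch of the post-commit wiring, using only names that appear in this diff; the constants are illustrative placeholders, not values from the repository:

```python
# Sketch of the reorganized NeMo NLP namespace used by the pretraining notebook
# after this commit. Module paths come from the hunks above; the constants are
# placeholder values for illustration only.
import os

import nemo.collections.nlp as nemo_nlp
from nemo.collections.nlp.data import NemoBertTokenizer

MAX_SEQ_LENGTH = 128     # placeholder
MASK_PROBABILITY = 0.15  # placeholder
BATCH_SIZE = 64          # placeholder

tokenizer = NemoBertTokenizer(pretrained_model="bert-base-uncased")

# Trainable neural modules now live under nemo_nlp.nm.trainables
bert_model = nemo_nlp.nm.trainables.huggingface.BERT(
    pretrained_model_name="bert-base-uncased")

# Loss modules now live under nemo_nlp.nm.losses
mlm_loss = nemo_nlp.nm.losses.MaskedLanguageModelingLossNM()
bert_loss = nemo_nlp.nm.losses.LossAggregatorNM(num_inputs=2)

# Data layers now live under nemo_nlp.nm.data_layers
train_data_layer = nemo_nlp.nm.data_layers.BertPretrainingDataLayer(
    tokenizer=tokenizer,
    dataset=os.path.join("data/lm/wikitext-2", "train.txt"),
    max_seq_length=MAX_SEQ_LENGTH,
    mask_probability=MASK_PROBABILITY,
    batch_size=BATCH_SIZE)
```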
20 changes: 11 additions & 9 deletions examples/nlp/NERWithBERT.ipynb
@@ -13,16 +13,18 @@
"from nemo.utils.lr_policies import WarmupAnnealing\n",
"\n",
"import nemo.collections.nlp as nemo_nlp\n",
"from nemo.collections.nlp import NemoBertTokenizer, SentencePieceTokenizer\n",
"from nemo.collections.nlp.utils.callbacks.token_classification import \\\n",
" eval_iter_callback, eval_epochs_done_callback"
"from nemo.collections.nlp.data import NemoBertTokenizer, SentencePieceTokenizer\n",
"from nemo.collections.nlp.callbacks.token_classification_callback import \\\n",
" eval_iter_callback, eval_epochs_done_callback\n",
"from nemo.collections.nlp.nm.losses import TokenClassificationLoss\n",
"from nemo.collections.nlp.nm.trainables import TokenClassifier"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can download data from [here](https://github.com/kyzhouhzau/BERT-NER/tree/master/data) and use [this](https://github.com/NVIDIA/NeMo/blob/master/scripts/convert_iob_format_to_token_classification_format.py) script to preprocess it."
"You can download data from [here](https://github.com/kyzhouhzau/BERT-NER/tree/master/data) and use [this](https://github.com/NVIDIA/NeMo/blob/master/nemo/collections/nlp/data/scripts/convert_iob_format_to_token_classification_format.py) script to preprocess it."
]
},
{
@@ -78,7 +80,7 @@
"# If you're using a standard BERT model, you should do it like this. To see the full\n",
"# list of BERT model names, check out nemo_nlp.huggingface.BERT.list_pretrained_models()\n",
"tokenizer = NemoBertTokenizer(pretrained_model=\"bert-base-cased\")\n",
"bert_model = nemo_nlp.huggingface.BERT(\n",
"bert_model = nemo_nlp.nm.trainables.huggingface.BERT(\n",
" pretrained_model_name=\"bert-base-cased\")"
]
},
@@ -89,7 +91,7 @@
"outputs": [],
"source": [
"# Describe training DAG\n",
"train_data_layer = nemo_nlp.BertTokenClassificationDataLayer(\n",
"train_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationDataLayer(\n",
" tokenizer=tokenizer,\n",
" text_file=os.path.join(DATA_DIR, 'text_train.txt'),\n",
" label_file=os.path.join(DATA_DIR, 'labels_train.txt'),\n",
@@ -100,11 +102,11 @@
"num_classes = len(label_ids)\n",
"\n",
"hidden_size = bert_model.local_parameters[\"hidden_size\"]\n",
"ner_classifier = nemo_nlp.TokenClassifier(hidden_size=hidden_size,\n",
"ner_classifier = TokenClassifier(hidden_size=hidden_size,\n",
" num_classes=num_classes,\n",
" dropout=CLASSIFICATION_DROPOUT)\n",
"\n",
"ner_loss = nemo_nlp.TokenClassificationLoss(d_model=hidden_size,\n",
"ner_loss = TokenClassificationLoss(d_model=hidden_size,\n",
" num_classes=len(label_ids),\n",
" dropout=CLASSIFICATION_DROPOUT)\n",
"\n",
@@ -125,7 +127,7 @@
"outputs": [],
"source": [
"# Describe evaluation DAG\n",
"eval_data_layer = nemo_nlp.BertTokenClassificationDataLayer(\n",
"eval_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationDataLayer(\n",
" tokenizer=tokenizer,\n",
" text_file=os.path.join(DATA_DIR, 'text_dev.txt'),\n",
" label_file=os.path.join(DATA_DIR, 'labels_dev.txt'),\n",
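NERWithBERT.ipynb follows the same namespace move, but its new import block above also imports the classifier and loss classes directly rather than through `nemo_nlp.*`. A short sketch under those imports; the hidden size, label map, and dropout value are assumed placeholders for quantities the notebook derives at runtime:

```python
# Direct class imports introduced by this commit (paths from the hunks above);
# the values below stand in for quantities the notebook derives from its data
# layer and BERT model.
from nemo.collections.nlp.nm.losses import TokenClassificationLoss
from nemo.collections.nlp.nm.trainables import TokenClassifier

hidden_size = 768                 # assumed: bert-base-cased hidden size
label_ids = {"O": 0, "B-PER": 1}  # assumed label map
CLASSIFICATION_DROPOUT = 0.1      # assumed value

ner_classifier = TokenClassifier(hidden_size=hidden_size,
                                 num_classes=len(label_ids),
                                 dropout=CLASSIFICATION_DROPOUT)
ner_loss = TokenClassificationLoss(d_model=hidden_size,
                                   num_classes=len(label_ids),
                                   dropout=CLASSIFICATION_DROPOUT)
```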
37 changes: 20 additions & 17 deletions examples/nlp/PunctuationWithBERT.ipynb
@@ -14,9 +14,11 @@
"from nemo.utils.lr_policies import WarmupAnnealing\n",
"\n",
"import nemo.collections.nlp as nemo_nlp\n",
"from nemo.collections.nlp import NemoBertTokenizer, TokenClassifier, TokenClassificationLoss\n",
"from nemo.collections.nlp.data.datasets import utils\n",
"from nemo.collections.nlp.utils.callbacks.punctuation_capitalization import eval_iter_callback, eval_epochs_done_callback\n",
"from nemo.collections.nlp.data import NemoBertTokenizer\n",
"from nemo.collections.nlp.nm.trainables import TokenClassifier\n",
"from nemo.collections.nlp.nm.losses import TokenClassificationLoss, LossAggregatorNM\n",
"from nemo.collections.nlp.callbacks.punctuation_capitalization_callback import eval_iter_callback, eval_epochs_done_callback\n",
"from nemo.collections.nlp.utils.common_nlp_utils import calc_class_weights\n",
"\n",
"DATA_DIR = \"PATH_TO_WHERE_THE_DATA_IS\"\n",
"WORK_DIR = \"PATH_TO_WHERE_TO_STORE_CHECKPOINTS_AND_LOGS\"\n",
@@ -47,7 +49,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"In this notebook we're going to use a subset of English examples from the [Tatoeba collection of sentences](https://tatoeba.org/eng), set NUM_SAMPLES=-1 and consider including other datasets to improve the performance of the model. Use [NeMo/scripts/get_tatoeba_data.py](https://github.com/NVIDIA/NeMo/blob/master/scripts/get_tatoeba_data.py) to download and preprocess the Tatoeba data."
"In this notebook we're going to use a subset of English examples from the [Tatoeba collection of sentences](https://tatoeba.org/eng), set NUM_SAMPLES=-1 and consider including other datasets to improve the performance of the model. Use [NeMo/nemo/collections/nlp/data/scripts/get_tatoeba_data.py](https://github.com/NVIDIA/NeMo/blob/master/nemo/collections/nlp/data/scripts/get_tatoeba_data.py) to download and preprocess the Tatoeba data."
]
},
{
@@ -57,7 +59,8 @@
"outputs": [],
"source": [
"# This should take about a minute since the data is already downloaded in the previous step\n",
"! python ../../scripts/get_tatoeba_data.py --data_dir $DATA_DIR --num_sample $NUM_SAMPLES"
"\n",
"! python ../../nemo/collections/nlp/data/scripts/get_tatoeba.py --data_dir $DATA_DIR --num_sample $NUM_SAMPLES"
]
},
{
@@ -116,7 +119,7 @@
"# list of BERT model names, check out nemo_nlp.huggingface.BERT.list_pretrained_models()\n",
"\n",
"tokenizer = NemoBertTokenizer(pretrained_model=PRETRAINED_BERT_MODEL)\n",
"bert_model = nemo_nlp.huggingface.BERT(pretrained_model_name=PRETRAINED_BERT_MODEL)"
"bert_model = nemo_nlp.nm.trainables.huggingface.BERT(pretrained_model_name=PRETRAINED_BERT_MODEL)"
]
},
{
@@ -132,7 +135,7 @@
"metadata": {},
"outputs": [],
"source": [
"train_data_layer = nemo_nlp.BertPunctuationCapitalizationDataLayer(\n",
"train_data_layer = nemo_nlp.nm.data_layers.PunctuationCapitalizationDataLayer(\n",
" tokenizer=tokenizer,\n",
" text_file=os.path.join(DATA_DIR, 'text_train.txt'),\n",
" label_file=os.path.join(DATA_DIR, 'labels_train.txt'),\n",
@@ -146,14 +149,14 @@
"\n",
"\n",
"# Define classifier for Punctuation and Capitalization tasks\n",
"punct_classifier = nemo_nlp.TokenClassifier(\n",
"punct_classifier = TokenClassifier(\n",
" hidden_size=hidden_size,\n",
" num_classes=len(punct_label_ids),\n",
" dropout=CLASSIFICATION_DROPOUT,\n",
" num_layers=PUNCT_NUM_FC_LAYERS,\n",
" name='Punctuation')\n",
"\n",
"capit_classifier = nemo_nlp.TokenClassifier(\n",
"capit_classifier = TokenClassifier(\n",
" hidden_size=hidden_size,\n",
" num_classes=len(capit_label_ids),\n",
" dropout=CLASSIFICATION_DROPOUT,\n",
@@ -162,14 +165,14 @@
"\n",
"# If you don't want to use weighted loss for Punctuation task, use class_weights=None\n",
"punct_label_freqs = train_data_layer.dataset.punct_label_frequencies\n",
"class_weights = utils.calc_class_weights(punct_label_freqs)\n",
"class_weights = calc_class_weights(punct_label_freqs)\n",
"\n",
"# define loss\n",
"punct_loss = nemo_nlp.TokenClassificationLoss(\n",
"punct_loss = TokenClassificationLoss(\n",
" num_classes=len(punct_label_ids),\n",
" class_weights=class_weights)\n",
"capit_loss = nemo_nlp.TokenClassificationLoss(num_classes=len(capit_label_ids))\n",
"task_loss = nemo_nlp.LossAggregatorNM(num_inputs=2)"
"capit_loss = TokenClassificationLoss(num_classes=len(capit_label_ids))\n",
"task_loss = LossAggregatorNM(num_inputs=2)"
]
},
{
@@ -220,7 +223,7 @@
"# during creation of the train_data_layer to make sure that the mapping is correct in case some of the labels from\n",
"# the train set are missing in the dev set.\n",
"\n",
"eval_data_layer = nemo_nlp.BertPunctuationCapitalizationDataLayer(\n",
"eval_data_layer = nemo_nlp.nm.data_layers.PunctuationCapitalizationDataLayer(\n",
" tokenizer=tokenizer,\n",
" text_file=os.path.join(DATA_DIR, 'text_dev.txt'),\n",
" label_file=os.path.join(DATA_DIR, 'labels_dev.txt'),\n",
@@ -363,7 +366,7 @@
"metadata": {},
"outputs": [],
"source": [
"infer_data_layer = nemo_nlp.BertTokenClassificationInferDataLayer(\n",
"infer_data_layer = nemo_nlp.nm.data_layers.BertTokenClassificationInferDataLayer(\n",
" queries=queries,\n",
" tokenizer=tokenizer,\n",
" max_seq_length=MAX_SEQ_LENGTH,\n",
@@ -401,7 +404,7 @@
"capit_preds = np.argmax(capit_logits, axis=2)\n",
"\n",
"for i, query in enumerate(queries):\n",
" nf.logger.info(f'Query: {query}')\n",
" print(f'Query: {query}')\n",
"\n",
" punct_pred = punct_preds[i][subtokens_mask[i] > 0.5]\n",
" capit_pred = capit_preds[i][subtokens_mask[i] > 0.5]\n",
@@ -421,7 +424,7 @@
" if punct_label != 'O':\n",
" output += punct_label\n",
" output += ' '\n",
" nf.logger.info(f'Combined: {output.strip()}\\n')"
" print(f'Combined: {output.strip()}\\n')"
]
},
{
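Besides the namespace move, PunctuationWithBERT.ipynb renames the data layer (`BertPunctuationCapitalizationDataLayer` becomes `PunctuationCapitalizationDataLayer`), imports `calc_class_weights` from `nemo.collections.nlp.utils.common_nlp_utils` instead of the dataset utils module, and replaces `nf.logger.info` with plain `print` in the inference loop. A sketch of the renamed pieces; the file paths and the `max_seq_length` keyword are placeholders not shown in the visible hunks:

```python
# Renamed data layer and relocated helper, as introduced in the hunks above.
# Paths and sizes are placeholders; keyword names mirror the notebook where visible.
import os

import nemo.collections.nlp as nemo_nlp
from nemo.collections.nlp.data import NemoBertTokenizer
from nemo.collections.nlp.utils.common_nlp_utils import calc_class_weights

DATA_DIR = "PATH_TO_WHERE_THE_DATA_IS"  # placeholder, as in the notebook
tokenizer = NemoBertTokenizer(pretrained_model="bert-base-uncased")

# old name: nemo_nlp.BertPunctuationCapitalizationDataLayer
train_data_layer = nemo_nlp.nm.data_layers.PunctuationCapitalizationDataLayer(
    tokenizer=tokenizer,
    text_file=os.path.join(DATA_DIR, 'text_train.txt'),
    label_file=os.path.join(DATA_DIR, 'labels_train.txt'),
    max_seq_length=128)  # assumed keyword and value

# old call: utils.calc_class_weights(...)
punct_label_freqs = train_data_layer.dataset.punct_label_frequencies
class_weights = calc_class_weights(punct_label_freqs)
```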
28 changes: 18 additions & 10 deletions examples/nlp/asr_postprocessor.py
@@ -6,7 +6,11 @@

import nemo
import nemo.collections.nlp as nemo_nlp
-from nemo.collections.nlp.callbacks.translation import eval_epochs_done_callback_wer, eval_iter_callback
+import nemo.collections.nlp.nm.data_layers.machine_translation_datalayer
+from nemo.collections.nlp.callbacks.machine_translation_callback import (
+eval_epochs_done_callback_wer,
+eval_iter_callback,
+)
from nemo.collections.nlp.data.tokenizers.bert_tokenizer import NemoBertTokenizer
from nemo.core.callbacks import CheckpointCallback
from nemo.utils.lr_policies import SquareAnnealing
@@ -47,7 +51,7 @@
parser.add_argument("--beam_size", default=4, type=int)
parser.add_argument("--len_pen", default=0.0, type=float)
parser.add_argument(
"--restore_from", dest="restore_from", type=str, default="../../scripts/bert-base-uncased_decoder.pt",
"--restore_from", dest="restore_from", type=str, default="../../scripts/bert-base-uncased_decoder.pt"
)
args = parser.parse_args()

@@ -66,14 +70,16 @@
tokens_to_add = vocab_size - tokenizer.vocab_size

zeros_transform = nemo.backends.pytorch.common.ZerosLikeNM()
-encoder = nemo_nlp.BERT(pretrained_model_name=args.pretrained_model, local_rank=args.local_rank)
+encoder = nemo.collections.nlp.nm.trainables.common.huggingface.BERT(
+pretrained_model_name=args.pretrained_model, local_rank=args.local_rank
+)
device = encoder.bert.embeddings.word_embeddings.weight.get_device()
zeros = torch.zeros((tokens_to_add, args.d_model)).to(device=device)
encoder.bert.embeddings.word_embeddings.weight.data = torch.cat(
(encoder.bert.embeddings.word_embeddings.weight.data, zeros)
)

-decoder = nemo_nlp.TransformerDecoderNM(
+decoder = nemo_nlp.nm.trainables.TransformerDecoderNM(
d_model=args.d_model,
d_inner=args.d_inner,
num_layers=args.num_layers,
@@ -90,11 +96,13 @@

decoder.restore_from(args.restore_from, local_rank=args.local_rank)

-t_log_softmax = nemo_nlp.TokenClassifier(args.d_model, num_classes=vocab_size, num_layers=1, log_softmax=True)
+t_log_softmax = nemo_nlp.nm.trainables.TokenClassifier(
+args.d_model, num_classes=vocab_size, num_layers=1, log_softmax=True
+)

-loss_fn = nemo_nlp.PaddedSmoothedCrossEntropyLossNM(pad_id=tokenizer.pad_id(), smoothing=0.1)
+loss_fn = nemo_nlp.nm.losses.PaddedSmoothedCrossEntropyLossNM(pad_id=tokenizer.pad_id(), smoothing=0.1)

-beam_search = nemo_nlp.BeamSearchTranslatorNM(
+beam_search = nemo_nlp.nm.trainables.BeamSearchTranslatorNM(
decoder=decoder,
log_softmax=t_log_softmax,
max_seq_length=args.max_seq_length,
@@ -114,7 +122,7 @@
def create_pipeline(dataset, tokens_in_batch, clean=False, training=True):
dataset_src = os.path.join(args.data_dir, dataset + "." + args.src_lang)
dataset_tgt = os.path.join(args.data_dir, dataset + "." + args.tgt_lang)
-data_layer = nemo_nlp.TranslationDataLayer(
+data_layer = nemo_nlp.nm.data_layers.machine_translation_datalayer.TranslationDataLayer(
tokenizer_src=tokenizer,
tokenizer_tgt=tokenizer,
dataset_src=dataset_src,
@@ -126,7 +134,7 @@ def create_pipeline(dataset, tokens_in_batch, clean=False, training=True):
input_type_ids = zeros_transform(input_type_ids=src)
src_hiddens = encoder(input_ids=src, token_type_ids=input_type_ids, attention_mask=src_mask)
tgt_hiddens = decoder(
-input_ids_tgt=tgt, hidden_states_src=src_hiddens, input_mask_src=src_mask, input_mask_tgt=tgt_mask,
+input_ids_tgt=tgt, hidden_states_src=src_hiddens, input_mask_src=src_mask, input_mask_tgt=tgt_mask
)
log_softmax = t_log_softmax(hidden_states=tgt_hiddens)
loss = loss_fn(logits=log_softmax, target_ids=labels)
@@ -186,6 +194,6 @@ def print_loss(x):
callbacks=callbacks,
optimizer=args.optimizer,
lr_policy=lr_policy,
optimization_params={"num_epochs": 300, "lr": args.lr, "weight_decay": args.weight_decay,},
optimization_params={"num_epochs": 300, "lr": args.lr, "weight_decay": args.weight_decay},
batches_per_step=args.iter_per_step,
)
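asr_postprocessor.py gets the same treatment: the encoder, decoder, token classifier, beam search, loss, and translation data layer are now addressed through their full module paths, and a few trailing commas are dropped to match the repo's formatting. A condensed sketch of the new paths, restricted to calls whose arguments are visible in the hunks above; literal values stand in for the script's argparse results and are placeholders:

```python
# New fully-qualified paths used by the script after this commit (taken from
# the hunks above). Literal values replace args.* and dataset paths for
# illustration only.
import nemo.collections.nlp as nemo_nlp
from nemo.collections.nlp.data.tokenizers.bert_tokenizer import NemoBertTokenizer

tokenizer = NemoBertTokenizer(pretrained_model="bert-base-uncased")
d_model = 768                        # placeholder for args.d_model
vocab_size = tokenizer.vocab_size    # as used in the script

encoder = nemo_nlp.nm.trainables.common.huggingface.BERT(
    pretrained_model_name="bert-base-uncased")  # placeholder for args.pretrained_model

t_log_softmax = nemo_nlp.nm.trainables.TokenClassifier(
    d_model, num_classes=vocab_size, num_layers=1, log_softmax=True)

loss_fn = nemo_nlp.nm.losses.PaddedSmoothedCrossEntropyLossNM(
    pad_id=tokenizer.pad_id(), smoothing=0.1)

data_layer = nemo_nlp.nm.data_layers.machine_translation_datalayer.TranslationDataLayer(
    tokenizer_src=tokenizer,
    tokenizer_tgt=tokenizer,
    dataset_src="train.en",   # placeholder path
    dataset_tgt="train.de")   # placeholder path
```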
