diff --git a/examples/nlp/biobert_notebooks/biobert_ner.ipynb b/examples/nlp/biobert_notebooks/biobert_ner.ipynb new file mode 100644 index 000000000000..b0c81947a14a --- /dev/null +++ b/examples/nlp/biobert_notebooks/biobert_ner.ipynb @@ -0,0 +1,341 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. Run this cell to set up dependencies.\n", + "\"\"\"\n", + "# If you're using Google Colab and not running locally, run this cell.\n", + "# !pip install wget\n", + "# !pip install git+https://github.com/NVIDIA/apex.git\n", + "# !pip install nemo_toolkit[nlp]\n", + "# !pip install unidecode\n", + "import os\n", + "import nemo\n", + "import nemo.collections.nlp as nemo_nlp\n", + "import numpy as np\n", + "import time\n", + "import errno\n", + "\n", + "from nemo.backends.pytorch.common.losses import CrossEntropyLossNM\n", + "from nemo.collections.nlp.nm.data_layers import BertTokenClassificationDataLayer\n", + "from nemo.collections.nlp.nm.trainables import TokenClassifier\n", + "from nemo.collections.nlp.callbacks.token_classification_callback import eval_epochs_done_callback, eval_iter_callback\n", + "from nemo.utils.lr_policies import get_lr_policy\n", + "from nemo import logging" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction\n", + "BioBERT has the same network architecture as the original BERT, but instead of Wikipedia and BookCorpus it is pretrained on PubMed, a large biomedical text corpus, which achieves better performance in biomedical downstream tasks, such as question answering(QA), named entity recognition(NER) and relationship extraction(RE). This model was trained for 1M steps. For more information please refer to the original paper https://academic.oup.com/bioinformatics/article/36/4/1234/5566506. For details about BERT please refer to https://ngc.nvidia.com/catalog/models/nvidia:bertbaseuncasedfornemo.\n", + "\n", + "\n", + "In this notebook we're going to showcase how to train BioBERT on a biomedical named entity recognition (NER) dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Download model checkpoint\n", + "Download BioBert/BioMegatron checkpoints from NGC: https://ngc.nvidia.com/catalog/models and put the encoder weights \n", + "at `./checkpoints/biobert/BERT.pt` or `./checkpoints/biomegatron/BERT.pt` and the model configuration file at `./checkpoints/biobert/bert_config.json` or `./checkpoints/biomegatron/bert_config.json`." 
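Before setting up the data, it may help to see what the token-classification target for NER looks like. The sketch below is purely illustrative: the sentence and its tags are invented, not taken from the NCBI-disease corpus, but they show the B/I/O scheme that the label files use and that the classifier head predicts for every token (B marks the first token of a disease mention, I its continuation, O everything else).

```python
# Illustrative only: word-level IOB tags for a made-up biomedical sentence.
# B = beginning of a disease mention, I = inside a mention, O = outside.
words = ["Mutations", "in", "BRCA1", "are", "linked", "to", "hereditary",
         "breast", "cancer", "."]
tags  = ["O", "O", "O", "O", "O", "O", "B",
         "I", "I", "O"]

for word, tag in zip(words, tags):
    print(f"{word:12s} {tag}")
```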
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set which model to use.\n", + "model_type=\"biobert\" # \"biomegatron\"\n", + "base_checkpoint_path={'biobert': './checkpoints/biobert/', 'biomegatron': './checkpoints/biomegatron/'}\n", + "pretrained_model_name={'biobert': 'bert-base-cased', 'biomegatron': 'megatron-bert-uncased'}\n", + "do_lower_case={'biobert': False, 'biomegatron': True}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# the checkpoints are available from NGC: https://ngc.nvidia.com/catalog/models\n", + "CHECKPOINT_ENCODER = os.path.join(base_checkpoint_path[model_type], 'BERT.pt') # Model encoder checkpoint file\n", + "CHECKPOINT_CONFIG = os.path.join(base_checkpoint_path[model_type], 'bert_config.json') # Model configuration file\n", + " \n", + "if not os.path.exists(CHECKPOINT_ENCODER):\n", + "    raise OSError(errno.ENOENT, os.strerror(errno.ENOENT), CHECKPOINT_ENCODER)\n", + "\n", + "if not os.path.exists(CHECKPOINT_CONFIG):\n", + "    raise OSError(errno.ENOENT, os.strerror(errno.ENOENT), CHECKPOINT_CONFIG)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Download training data\n", + "In this example we download the NER dataset NCBI-disease to ./datasets/ncbi-disease using token_classification/get_medical_data.py." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_dir=\"./datasets\"\n", + "dataset=\"ncbi-disease\"\n", + "!mkdir -p $data_dir\n", + "!python ../token_classification/get_medical_data.py --data_dir=$data_dir --dataset=$dataset\n", + "!python ../token_classification/import_from_iob_format.py --data_file=$data_dir/$dataset/train.tsv\n", + "!python ../token_classification/import_from_iob_format.py --data_file=$data_dir/$dataset/test.tsv\n", + "!python ../token_classification/import_from_iob_format.py --data_file=$data_dir/$dataset/dev.tsv\n", + "!ls -l $data_dir/$dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the previous step, you should have a ./datasets/ncbi-disease folder that contains the following files:\n", + "- labels_train.txt\n", + "- labels_dev.txt\n", + "- labels_test.txt\n", + "- text_train.txt\n", + "- text_dev.txt\n", + "- text_test.txt\n", + "\n", + "The format of the data is described in the NeMo docs."
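As a quick sanity check (assuming the standard NeMo token-classification layout, with one whitespace-separated sentence per line in text_*.txt and the matching tags per line in labels_*.txt), you can verify that every word has exactly one label. This is only a sketch; it reuses `data_dir` and `dataset` from the cell above.

```python
# Sanity-check the converted NER data: text and label files must be line-aligned
# and every word must have exactly one IOB tag.
text_path = f"{data_dir}/{dataset}/text_train.txt"
label_path = f"{data_dir}/{dataset}/labels_train.txt"

with open(text_path) as text_f, open(label_path) as label_f:
    for line_no, (words, tags) in enumerate(zip(text_f, label_f)):
        words, tags = words.split(), tags.split()
        assert len(words) == len(tags), f"line {line_no}: {len(words)} words vs {len(tags)} tags"
        if line_no < 2:  # print a couple of examples
            print(list(zip(words, tags)))
print("text_train.txt and labels_train.txt are aligned")
```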
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create Neural Modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_checkpoint=CHECKPOINT_ENCODER # language model encoder file\n", + "model_config=CHECKPOINT_CONFIG # model configuration file\n", + "train_data_text_file=f\"{data_dir}/{dataset}/text_train.txt\"\n", + "train_data_label_file=f\"{data_dir}/{dataset}/labels_train.txt\"\n", + "eval_data_text_file=f\"{data_dir}/{dataset}/text_dev.txt\"\n", + "eval_data_label_file=f\"{data_dir}/{dataset}/labels_dev.txt\"\n", + "none_label=\"O\" \n", + "num_labels=3 # this should be the same number as number of labels in the training data\n", + "fc_dropout=0.1\n", + "max_seq_length=128\n", + "batch_size=32\n", + "\n", + "nf = nemo.core.NeuralModuleFactory(\n", + " backend=nemo.core.Backend.PyTorch,\n", + " placement=nemo.core.DeviceType.GPU\n", + ")\n", + "model = nemo_nlp.nm.trainables.get_pretrained_lm_model(\n", + " config=model_config, pretrained_model_name=pretrained_model_name[model_type], checkpoint=model_checkpoint\n", + " )\n", + "tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer(\n", + " tokenizer_name='nemobert',\n", + " pretrained_model_name=pretrained_model_name[model_type],\n", + " do_lower_case=do_lower_case[model_type]\n", + ")\n", + "hidden_size = model.hidden_size\n", + "classifier = TokenClassifier(hidden_size=hidden_size, num_classes=num_labels, dropout=fc_dropout, num_layers=1)\n", + "task_loss = CrossEntropyLossNM(logits_ndim=3)\n", + "train_data_layer = BertTokenClassificationDataLayer(\n", + " tokenizer=tokenizer,\n", + " text_file=train_data_text_file,\n", + " label_file=train_data_label_file,\n", + " pad_label=none_label,\n", + " label_ids=None,\n", + " max_seq_length=max_seq_length,\n", + " batch_size=batch_size,\n", + " shuffle=True,\n", + " use_cache=True\n", + ")\n", + "eval_data_layer = BertTokenClassificationDataLayer(\n", + " tokenizer=tokenizer,\n", + " text_file=eval_data_text_file,\n", + " label_file=eval_data_label_file,\n", + " pad_label=none_label,\n", + " label_ids=train_data_layer.dataset.label_ids,\n", + " max_seq_length=max_seq_length,\n", + " batch_size=batch_size,\n", + " shuffle=False,\n", + " use_cache=False,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Creating Neural graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_data = train_data_layer()\n", + "train_hidden_states = model(input_ids=train_data.input_ids, token_type_ids=train_data.input_type_ids, attention_mask=train_data.input_mask)\n", + "train_logits = classifier(hidden_states=train_hidden_states)\n", + "loss = task_loss(logits=train_logits, labels=train_data.labels, loss_mask=train_data.loss_mask)\n", + "# If you're training on multiple GPUs, this should be\n", + "# len(train_data_layer) // (batch_size * batches_per_step * num_gpus)\n", + "train_steps_per_epoch = len(train_data_layer) // batch_size\n", + "logging.info(f\"doing {train_steps_per_epoch} steps per epoch\")\n", + "\n", + "eval_data = eval_data_layer()\n", + "eval_hidden_states = model(input_ids=eval_data.input_ids, token_type_ids=eval_data.input_type_ids, attention_mask=eval_data.input_mask)\n", + "eval_logits = classifier(hidden_states=eval_hidden_states)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create Callbacks\n" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "train_callback = nemo.core.SimpleLossLoggerCallback(\n", + " tensors=[loss],\n", + " print_func=lambda x: logging.info(\"Loss: {:.3f}\".format(x[0].item())),\n", + " get_tb_values=lambda x: [[\"loss\", x[0]]],\n", + " step_freq=100,\n", + " tb_writer=nf.tb_writer,\n", + ")\n", + "\n", + "# Callback to evaluate the model\n", + "eval_callback = nemo.core.EvaluatorCallback(\n", + " eval_tensors=[eval_logits, eval_data.labels, eval_data.subtokens_mask],\n", + " user_iter_callback=lambda x, y: eval_iter_callback(x, y),\n", + " user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, train_data_layer.dataset.label_ids, f'{nf.work_dir}/graphs'),\n", + " tb_writer=nf.tb_writer,\n", + " eval_step=100\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# Training\n", + "Training could take several minutes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "num_epochs=10\n", + "lr_warmup_proportion=0.1\n", + "lr=4e-5\n", + "weight_decay=0.01\n", + "lr_policy_fn = get_lr_policy(\"WarmupAnnealing\", total_steps=num_epochs * train_steps_per_epoch, warmup_ratio=lr_warmup_proportion\n", + ")\n", + "nf.train(\n", + " tensors_to_optimize=[loss],\n", + " callbacks=[train_callback, eval_callback],\n", + " lr_policy=lr_policy_fn,\n", + " optimizer=\"adam_w\",\n", + " optimization_params={\"num_epochs\": num_epochs, \"lr\": lr, \"weight_decay\": weight_decay},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The result should look something like\n", + "```\n", + "[NeMo I 2020-05-22 17:13:48 token_classification_callback:82] Accuracy: 0.9882348032875798\n", + "[NeMo I 2020-05-22 17:13:48 token_classification_callback:86] F1 weighted: 98.82\n", + "[NeMo I 2020-05-22 17:13:48 token_classification_callback:86] F1 macro: 93.74\n", + "[NeMo I 2020-05-22 17:13:48 token_classification_callback:86] F1 micro: 98.82\n", + "[NeMo I 2020-05-22 17:13:49 token_classification_callback:89] precision recall f1-score support\n", + " \n", + " O (label id: 0) 0.9938 0.9957 0.9947 22092\n", + " B (label id: 1) 0.8843 0.9034 0.8938 787\n", + " I (label id: 2) 0.9505 0.8982 0.9236 1090\n", + " \n", + " accuracy 0.9882 23969\n", + " macro avg 0.9429 0.9324 0.9374 23969\n", + " weighted avg 0.9882 0.9882 0.9882 23969\n", + "```" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "metadata": { + "collapsed": false + }, + "source": [] + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/nlp/biobert_notebooks/biobert_qa.ipynb b/examples/nlp/biobert_notebooks/biobert_qa.ipynb new file mode 100644 index 000000000000..1a5e53426495 --- /dev/null +++ b/examples/nlp/biobert_notebooks/biobert_qa.ipynb @@ -0,0 +1,541 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "\n", + "Instructions for setting up Colab 
are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. Run this cell to set up dependencies.\n", + "\"\"\"\n", + "# If you're using Google Colab and not running locally, run this cell.\n", + "# !pip install wget\n", + "# !pip install git+https://github.com/NVIDIA/apex.git\n", + "# !pip install nemo_toolkit[nlp]\n", + "# !pip install unidecode\n", + "import os\n", + "import nemo\n", + "import nemo.collections.nlp as nemo_nlp\n", + "import numpy as np\n", + "import time\n", + "import errno\n", + "import json\n", + "\n", + "from nemo.backends.pytorch.common.losses import CrossEntropyLossNM\n", + "from nemo.collections.nlp.nm.data_layers import BertQuestionAnsweringDataLayer\n", + "from nemo.collections.nlp.nm.trainables import TokenClassifier\n", + "from nemo.collections.nlp.callbacks.qa_squad_callback import eval_epochs_done_callback, eval_iter_callback\n", + "from nemo.utils.lr_policies import get_lr_policy\n", + "from nemo import logging" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction\n", + "BioBERT has the same network architecture as the original BERT, but instead of Wikipedia and BookCorpus it is pretrained on PubMed, a large biomedical text corpus, which achieves better performance in biomedical downstream tasks, such as question answering(QA), named entity recognition(NER) and relationship extraction(RE). This model was trained for 1M steps. For more information please refer to the original paper https://academic.oup.com/bioinformatics/article/36/4/1234/5566506. For details about BERT please refer to https://ngc.nvidia.com/catalog/models/nvidia:bertbaseuncasedfornemo.\n", + "\n", + "BioMegatron is an in house model, using Megatron https://github.com/NVIDIA/Megatron-LM pretrained on PubMed. The accuracy is better than using BioBERT on downstream tasks\n", + "\n", + "\n", + "In this notebook we're going to showcase how to train BioBERT/BioMegatron on a biomedical question answering (QA) dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Download model checkpoint\n", + "Download BioBert/BioMegatron checkpoints finetuned on SQuADv1.1 from NGC: https://ngc.nvidia.com/catalog/models. Alternatively, you can also download BioBert/BioMegatron checkpoints and do the finetuning on SQuADv1.1 locally. This will take some time. For this, follow instructions at https://ngc.nvidia.com/catalog/models/nvidia:bertbaseuncasedsquadv1. \n", + " Then, put the encoder weights at `./checkpoints/biobert/qa_squad/BERT.pt` or `./checkpoints/biomegatron/qa_squad/BERT.pt`, the model head weights at `./checkpoints/biobert/qa_squad/TokenClassifier.pt` or `./checkpoints/biomegatron/qa_squad/TokenClassifier.pt` and the model configuration file at `./checkpoints/biobert/qa_squad/bert_config.json` or `./checkpoints/biomegatron/qa_squad/bert_config.json`." 
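The head restored from TokenClassifier.pt produces two logits per token, interpreted as start and end scores of the answer span. As a rough picture of what the decoding step later in this notebook does (the real `get_predictions()` additionally handles sub-word merging, n-best lists and null answers), here is a toy example; the tokens and logit values are invented for illustration.

```python
import numpy as np

# Toy illustration of SQuAD-style span decoding: pick the (start, end) pair with
# the highest combined score, subject to start <= end and a maximum span length.
tokens = ["em", "##MA", "##W", ":", "computing", "minimal", "absent", "words"]
start_logits = np.array([6.1, 0.2, -1.0, -2.0, -1.5, -1.0, -1.2, -0.8])
end_logits   = np.array([0.5, 1.0,  5.9, -1.8, -1.0, -0.5, -0.9, -0.7])
max_answer_length = 30

best_start, best_end = max(
    ((s, e) for s in range(len(tokens)) for e in range(s, min(s + max_answer_length, len(tokens)))),
    key=lambda span: start_logits[span[0]] + end_logits[span[1]],
)
print("predicted span tokens:", tokens[best_start:best_end + 1])  # ['em', '##MA', '##W']
```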
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set which model to use.\n", + "model_type=\"biobert\" # \"biomegatron\"\n", + "base_checkpoint_path={'biobert': './checkpoints/biobert/qa_squad', 'biomegatron': './checkpoints/biomegatron/qa_squad'}\n", + "pretrained_model_name={'biobert': 'bert-base-cased', 'biomegatron': 'megatron-bert-uncased'}\n", + "do_lower_case={'biobert': False, 'biomegatron': True}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# the checkpoints are available from NGC: https://ngc.nvidia.com/catalog/models\n", + "CHECKPOINT_ENCODER = os.path.join(base_checkpoint_path[model_type], 'BERT.pt')\n", + "CHECKPOINT_HEAD = os.path.join(base_checkpoint_path[model_type], 'TokenClassifier.pt')\n", + "CHECKPOINT_CONFIG = os.path.join(base_checkpoint_path[model_type], 'bert_config.json')\n", + " \n", + "if not os.path.exists(CHECKPOINT_ENCODER):\n", + "    raise OSError(errno.ENOENT, os.strerror(errno.ENOENT), CHECKPOINT_ENCODER)\n", + "\n", + "if not os.path.exists(CHECKPOINT_HEAD):\n", + "    raise OSError(errno.ENOENT, os.strerror(errno.ENOENT), CHECKPOINT_HEAD)\n", + " \n", + "if not os.path.exists(CHECKPOINT_CONFIG):\n", + "    raise OSError(errno.ENOENT, os.strerror(errno.ENOENT), CHECKPOINT_CONFIG)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Download training data\n", + "In this example we download the QA dataset BioASQ 6b to ./datasets/BioASQ. Before using the files in this repository, you must first register on the BioASQ website and download the [BioASQ Task B](http://participants-area.bioasq.org/Tasks/A/getData/) data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_dir=\"./datasets\"\n", + "dataset=\"BioASQ\"\n", + "if not os.path.exists(f\"{data_dir}/{dataset}\"):\n", + "    !python ../question_answering/get_bioasq.py --data_dir=$data_dir\n", + "!ls -l $data_dir/$dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the previous step, you should have a ./datasets/BioASQ folder that contains the following files:\n", + "\n", + "- 6B1_golden.json\n", + "- 6B2_golden.json\n", + "- 6B3_golden.json\n", + "- 6B4_golden.json\n", + "- 6B5_golden.json\n", + "- BioASQ-6b/train/Full-Abstract/BioASQ-train-factoid-6b-full-annotated.json\n", + "- BioASQ-6b/test/Full-Abstract/BioASQ-test-factoid-6b-1.json\n", + "- BioASQ-6b/test/Full-Abstract/BioASQ-test-factoid-6b-2.json\n", + "- BioASQ-6b/test/Full-Abstract/BioASQ-test-factoid-6b-3.json\n", + "- BioASQ-6b/test/Full-Abstract/BioASQ-test-factoid-6b-4.json\n", + "- BioASQ-6b/test/Full-Abstract/BioASQ-test-factoid-6b-5.json\n", + "\n", + "The format of the data is described in the NeMo docs."
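The converted BioASQ files follow the SQuAD JSON layout (a test entry is printed further below). Assuming the training file uses the same structure, a quick way to inspect it is the sketch below; the path matches the `train_file` used in the next section.

```python
import json

# Peek at the SQuAD-style structure of the converted BioASQ training data.
bioasq_train = f"{data_dir}/{dataset}/BioASQ-6b/train/Full-Abstract/BioASQ-train-factoid-6b-full-annotated.json"
with open(bioasq_train) as f:
    squad_data = json.load(f)

first_paragraph = squad_data["data"][0]["paragraphs"][0]
print("number of articles:", len(squad_data["data"]))
print("context snippet:   ", first_paragraph["context"][:80], "...")
print("first question:    ", first_paragraph["qas"][0]["question"])
```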
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create Neural Modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_checkpoint=CHECKPOINT_ENCODER # language model encoder file\n", + "head_checkpoint=CHECKPOINT_HEAD # language model encoder file\n", + "model_config=CHECKPOINT_CONFIG # model configuration file\n", + "work_dir=\"output_bioasq\"\n", + "train_file=f\"{data_dir}/{dataset}/BioASQ-6b/train/Full-Abstract/BioASQ-train-factoid-6b-full-annotated.json\"\n", + "doc_stride=128\n", + "max_query_length=64\n", + "max_seq_length=384\n", + "batch_size=12\n", + "version_2_with_negative=False\n", + "\n", + "nf = nemo.core.NeuralModuleFactory(\n", + " backend=nemo.core.Backend.PyTorch,\n", + " placement=nemo.core.DeviceType.GPU,\n", + " log_dir=work_dir\n", + ")\n", + "model = nemo_nlp.nm.trainables.get_pretrained_lm_model(\n", + " config=model_config, pretrained_model_name=pretrained_model_name[model_type], checkpoint=model_checkpoint\n", + " )\n", + "tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer(\n", + " tokenizer_name='nemobert',\n", + " pretrained_model_name=pretrained_model_name[model_type],\n", + " do_lower_case=do_lower_case[model_type]\n", + ")\n", + "hidden_size = model.hidden_size\n", + "qa_head = TokenClassifier(\n", + " hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False, name=\"TokenClassifier\"\n", + ")\n", + "qa_head.restore_from(head_checkpoint)\n", + "task_loss = nemo_nlp.nm.losses.SpanningLoss()\n", + "# create training data layer, preprocessing takes a while. If you want to cache preprocessed data for future reuse use --use_cache=True\n", + "train_data_layer = BertQuestionAnsweringDataLayer(\n", + " mode=\"train\",\n", + " tokenizer=tokenizer,\n", + " version_2_with_negative=version_2_with_negative,\n", + " data_file=train_file,\n", + " max_query_length=max_query_length,\n", + " max_seq_length=max_seq_length,\n", + " doc_stride=doc_stride,\n", + " batch_size=batch_size,\n", + " shuffle=True,\n", + " use_cache=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Creating Neural graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_data = train_data_layer()\n", + "hidden_states = model(input_ids=train_data.input_ids, token_type_ids=train_data.input_type_ids, attention_mask=train_data.input_mask)\n", + "qa_output = qa_head(hidden_states=hidden_states)\n", + "loss = task_loss(logits=qa_output, start_positions=train_data.start_positions, end_positions=train_data.end_positions)\n", + "# If you're training on multiple GPUs, this should be\n", + "# len(train_data_layer) // (batch_size * batches_per_step * num_gpus)\n", + "train_steps_per_epoch = len(train_data_layer) // batch_size\n", + "logging.info(f\"doing {train_steps_per_epoch} steps per epoch\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create Callbacks\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_callback = nemo.core.SimpleLossLoggerCallback(\n", + " tensors=[loss.loss],\n", + " print_func=lambda x: logging.info(\"Loss: {:.3f}\".format(x[0].item())),\n", + " get_tb_values=lambda x: [[\"loss\", x[0]]],\n", + " step_freq=100,\n", + " tb_writer=nf.tb_writer,\n", + ")\n", + "ckpt_callback = nemo.core.CheckpointCallback(\n", + " folder=nf.checkpoint_dir, epoch_freq=1, 
step_freq=-1\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training\n", + "this may take more than an hour." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_epochs=5\n", + "lr=5e-6\n", + "lr_warmup_proportion=0\n", + "weight_decay=0\n", + "lr_policy_fn = get_lr_policy(\"WarmupAnnealing\", total_steps=num_epochs * train_steps_per_epoch, warmup_ratio=lr_warmup_proportion\n", + ")\n", + "nf.reset_trainer()\n", + "nf.train(\n", + " tensors_to_optimize=[loss.loss],\n", + " callbacks=[train_callback, ckpt_callback],\n", + " lr_policy=lr_policy_fn,\n", + " optimizer=\"adam_w\",\n", + " optimization_params={\"num_epochs\": num_epochs, \"lr\": lr, \"weight_decay\": weight_decay},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Inference\n", + "Do inference on test data e.g. 6b-1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_dataset=\"6b\"\n", + "test_dataset_idx=\"1\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "test_file=f\"{data_dir}/{dataset}/BioASQ-6b/test/Full-Abstract/BioASQ-test-factoid-{test_dataset}-{test_dataset_idx}.json\"\n", + "logging.info(f\"using test file {test_file}\")\n", + "test_data_layer = BertQuestionAnsweringDataLayer(\n", + " mode=\"test\",\n", + " tokenizer=tokenizer,\n", + " version_2_with_negative=version_2_with_negative,\n", + " data_file=test_file,\n", + " max_query_length=max_query_length,\n", + " max_seq_length=max_seq_length,\n", + " doc_stride=doc_stride,\n", + " batch_size=1,\n", + " shuffle=False,\n", + " use_cache=False\n", + ")\n", + "\n", + "# Creating Neural test graph\n", + "test_data = test_data_layer()\n", + "test_hidden_states = model(input_ids=test_data.input_ids, token_type_ids=test_data.input_type_ids, attention_mask=test_data.input_mask)\n", + "test_qa_output = qa_head(hidden_states=test_hidden_states)\n", + "test_tensors=[test_data.unique_ids, test_qa_output]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "n_best_size=20\n", + "null_score_diff_threshold=0\n", + "max_answer_length=30\n", + "logging.info(f\"work dir {work_dir}, checkpoint dir {nf.checkpoint_dir}\")\n", + "output_prediction_file=f\"{work_dir}/predictions.json\"\n", + "output_nbest_file=f\"{work_dir}/nbest.json\"\n", + "evaluated_tensors = nf.infer(\n", + " tensors=test_tensors, cache=False, offload_to_cpu=False, checkpoint_dir=nf.checkpoint_dir\n", + ")\n", + "unique_ids = []\n", + "for t in evaluated_tensors[0]:\n", + " unique_ids.extend(t.tolist())\n", + "logits = []\n", + "for t in evaluated_tensors[1]:\n", + " logits.extend(t.tolist())\n", + "start_logits, end_logits = np.split(np.asarray(logits), 2, axis=-1)\n", + "(all_predictions, all_nbest, scores_diff) = test_data_layer.dataset.get_predictions(\n", + " unique_ids=unique_ids,\n", + " start_logits=start_logits,\n", + " end_logits=end_logits,\n", + " n_best_size=n_best_size,\n", + " max_answer_length=max_answer_length,\n", + " version_2_with_negative=version_2_with_negative,\n", + " null_score_diff_threshold=null_score_diff_threshold,\n", + " do_lower_case=do_lower_case[model_type],\n", + ")\n", + "with open(output_nbest_file, \"w\") as writer:\n", + " writer.write(json.dumps(all_nbest, indent=4) + \"\\n\")\n", + "with open(output_prediction_file, 
\"w\") as writer:\n", + " writer.write(json.dumps(all_predictions, indent=4) + \"\\n\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\r\n", + " \"version\": \"BioASQ6b\", \r\n", + " \"data\": [\r\n", + " {\r\n", + " \"title\": \"BioASQ6b\", \r\n", + " \"paragraphs\": [\r\n", + " {\r\n", + " \"context\": \"emMAW: computing minimal absent words in external memory. Motivation: The biological significance of minimal absent words has been investigated in genomes of organisms from all domains of life. For instance, three minimal absent words of the human genome were found in Ebola virus genomes. There exists an O(n) -time and O(n) -space algorithm for computing all minimal absent words of a sequence of length n on a fixed-sized alphabet based on suffix arrays. A standard implementation of this algorithm, when applied to a large sequence of length n , requires more than 20 n \\u00a0bytes of RAM. Such memory requirements are a significant hurdle to the computation of minimal absent words in large datasets. Results: We present emMAW, the first external-memory algorithm for computing minimal absent words. A free open-source implementation of our algorithm is made available. This allows for computation of minimal absent words on far bigger data sets than was previously possible. Our implementation requires less than 3 h on a standard workstation to process the full human genome when as little as 1 GB of RAM is made available. We stress that our implementation, despite making use of external memory, is fast; indeed, even on relatively smaller datasets when enough RAM is available to hold all necessary data structures, it is less than two times slower than state-of-the-art internal-memory implementations. Availability and implementation: https://github.com/solonas13/maw (free software under the terms of the GNU GPL). Contact: alice.heliou@lix.polytechnique.fr or solon.pissis@kcl.ac.uk. Supplementary information: Supplementary data are available at Bioinformatics online.\", \r\n", + " \"qas\": [\r\n", + " {\r\n", + " \"question\": \"Which algorithm is available for computing minimal absent words using external memory?\", \r\n", + " \"id\": \"5a6a3335b750ff4455000025_000\"\r\n", + " }\r\n", + " ]\r\n", + " }, \r\n" + ] + } + ], + "source": [ + "# a test question example would be \n", + "!head -n 15 $test_file" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\r\n", + " \"5a6a3335b750ff4455000025_000\": [\r\n", + " {\r\n", + " \"text\": \"emMAW\",\r\n", + " \"probability\": 0.9950947775946923,\r\n", + " \"start_logit\": [\r\n", + " 5.9411702156066895\r\n", + " ],\r\n", + " \"end_logit\": [\r\n", + " 5.900936126708984\r\n", + " ]\r\n", + " },\r\n", + " {\r\n", + " \"text\": \"emMAW,\",\r\n", + " \"probability\": 0.001918466002056132,\r\n", + " \"start_logit\": [\r\n", + " 5.9411702156066895\r\n", + " ],\r\n", + " \"end_logit\": [\r\n", + " -0.3503759503364563\r\n" + ] + } + ], + "source": [ + "# the corresponding first 2 best answers of the n-best list prediction with probabilities. 
The most likely answer is \"emMAW\" with a probability of 0.995\n", + "!head -n 20 $output_nbest_file" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " \"exact_answer\": [\r\n", + " [\r\n", + " \"emMAW\"\r\n", + " ]\r\n", + " ], \r\n", + " \"concepts\": [], \r\n", + " \"type\": \"factoid\", \r\n", + " \"id\": \"5a6a3335b750ff4455000025\", \r\n" + ] + } + ], + "source": [ + "# The golden label can be found in the following file under \"exact_answer\". In this case, it is equal to the prediction, which is \"emMAW\".\n", + "prefix=test_dataset.upper()\n", + "suffix=f\"{test_dataset_idx}_golden.json\"\n", + "!grep -B 7 \"5a6a3335b750ff4455000025\" $data_dir/$dataset/$prefix$suffix" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Evaluate inference output with BioASQ metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if not os.path.exists('bioasq-biobert'):\n", + "    print(\"clone https://github.com/dmis-lab/bioasq-biobert.git\")\n", + "    !git clone https://github.com/dmis-lab/bioasq-biobert.git && cd bioasq-biobert && git fetch origin pull/12/head:fix_indentation && git checkout fix_indentation && cd ..\n", + "if not os.path.exists('Evaluation-Measures'):\n", + "    print(\"clone https://github.com/BioASQ/Evaluation-Measures.git\")\n", + "    !git clone https://github.com/BioASQ/Evaluation-Measures.git && cd Evaluation-Measures && git checkout cd93f3b8eb290c965d18ef466ee28a0bcf451e5d && cd .." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transformed_nbest_dir=f\"{work_dir}/transformed_nbest\"\n", + "!mkdir -p $transformed_nbest_dir\n", + "!python bioasq-biobert/biocodes/transform_n2b_factoid.py --nbest_path=$output_nbest_file --output_path=$transformed_nbest_dir" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prefix=test_dataset.upper()\n", + "suffix=f\"{test_dataset_idx}_golden.json\"\n", + "! java -Xmx10G -cp Evaluation-Measures/flat/BioASQEvaluation/dist/BioASQEvaluation.jar evaluation.EvaluatorTask1b -phaseB -e 5 $data_dir/$dataset/$prefix$suffix $transformed_nbest_dir/BioASQform_BioASQ-answer.json" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The result will look something like this for BioBERT:\n", + "\n", + "```0.0 0.45161290322580644 0.6774193548387096 0.5403225806451613 0.0 0.0 0.0 0.0 0.0 0.0```\n", + "\n", + "where the second, third, and fourth numbers are the strict accuracy (SAcc), lenient accuracy (LAcc), and mean reciprocal rank (MRR) for factoid\n", + "questions, respectively."
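For intuition, the three factoid numbers can be recomputed with a few lines of Python. This is only a toy recap of the definitions, not a replacement for the official BioASQEvaluation jar: strict accuracy credits only the top-ranked answer, lenient accuracy credits any of the top five answers, and MRR averages the reciprocal rank of the first correct answer.

```python
# Toy recap of the BioASQ factoid metrics (illustration only; the official
# evaluator also handles synonym lists and answer normalization).
def factoid_metrics(predictions, golden, k=5):
    strict = lenient = mrr = 0.0
    for qid, ranked in predictions.items():
        gold = {answer.lower() for answer in golden[qid]}
        top_k = [answer.lower() for answer in ranked[:k]]
        hits = [rank for rank, answer in enumerate(top_k) if answer in gold]
        if hits:
            lenient += 1
            mrr += 1.0 / (hits[0] + 1)
            if hits[0] == 0:
                strict += 1
    n = len(predictions)
    return strict / n, lenient / n, mrr / n

# Hypothetical ranked answers and gold labels for two questions.
predictions = {"q1": ["emMAW", "maw"], "q2": ["wrong answer", "BRCA1"]}
golden = {"q1": ["emMAW"], "q2": ["BRCA1"]}
print(factoid_metrics(predictions, golden))  # (0.5, 1.0, 0.75)
```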
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "metadata": { + "collapsed": false + }, + "source": [] + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/nlp/biobert_notebooks/biobert_re.ipynb b/examples/nlp/biobert_notebooks/biobert_re.ipynb new file mode 100644 index 000000000000..b5099dc13c5c --- /dev/null +++ b/examples/nlp/biobert_notebooks/biobert_re.ipynb @@ -0,0 +1,337 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", + "\n", + "Instructions for setting up Colab are as follows:\n", + "1. Open a new Python 3 notebook.\n", + "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", + "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", + "4. Run this cell to set up dependencies.\n", + "\"\"\"\n", + "# If you're using Google Colab and not running locally, run this cell.\n", + "# !pip install wget\n", + "# !pip install git+https://github.com/NVIDIA/apex.git\n", + "# !pip install nemo_toolkit[nlp]\n", + "# !pip install unidecode\n", + "import os\n", + "import nemo\n", + "import nemo.collections.nlp as nemo_nlp\n", + "import numpy as np\n", + "import time\n", + "import errno\n", + "\n", + "from nemo.backends.pytorch.common.losses import CrossEntropyLossNM\n", + "from nemo.collections.nlp.data.datasets import TextClassificationDataDesc\n", + "from nemo.collections.nlp.nm.data_layers import BertTextClassificationDataLayer\n", + "from nemo.collections.nlp.nm.trainables import SequenceClassifier\n", + "from nemo.collections.nlp.callbacks.text_classification_callback import eval_epochs_done_callback, eval_iter_callback\n", + "from nemo.utils.lr_policies import get_lr_policy\n", + "from nemo import logging" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction\n", + "BioBERT has the same network architecture as the original BERT, but instead of Wikipedia and BookCorpus it is pretrained on PubMed, a large biomedical text corpus, which achieves better performance in biomedical downstream tasks, such as question answering(QA), named entity recognition(NER) and relationship extraction(RE). This model was trained for 1M steps. For more information please refer to the original paper https://academic.oup.com/bioinformatics/article/36/4/1234/5566506. For details about BERT please refer to https://ngc.nvidia.com/catalog/models/nvidia:bertbaseuncasedfornemo.\n", + "\n", + "\n", + "In this notebook we're going to showcase how to train BioBERT on a biomedical relation extraction (RE) dataset." 
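Relation extraction on ChemProt is cast as sentence-level classification: each example is a sentence in which the entity mentions of interest are marked, and the label is the relation class between them. The example below is invented; the placeholder convention and label name follow the commonly used BioBERT ChemProt preprocessing and are an assumption here, not something defined in this notebook.

```python
# Illustration only: a ChemProt-style relation extraction example. The sentence
# is made up; @CHEMICAL$ and @GENE$ stand in for the masked entity mentions and
# the label names one of the relation classes.
example_sentence = "@CHEMICAL$ selectively inhibits the kinase activity of @GENE$ in hepatocytes."
example_label = "CPR:4"  # assumed inhibition / down-regulation class

print(example_sentence, "->", example_label)
```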
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Download model checkpoint\n", + "Download BioBert/BioMegatron checkpoints from NGC: https://ngc.nvidia.com/catalog/models and put the encoder weights \n", + "at `./checkpoints/biobert/BERT.pt` or `./checkpoints/biomegatron/BERT.pt` and the model configuration file at `./checkpoints/biobert/bert_config.json` or `./checkpoints/biomegatron/bert_config.json`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set which model to use.\n", + "model_type=\"biobert\" # \"biomegatron\"\n", + "base_checkpoint_path={'biobert': './checkpoints/biobert/', 'biomegatron': './checkpoints/biomegatron/'}\n", + "pretrained_model_name={'biobert': 'bert-base-cased', 'biomegatron': 'megatron-bert-uncased'}\n", + "do_lower_case={'biobert': False, 'biomegatron': True}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# the checkpoints are available from NGC: https://ngc.nvidia.com/catalog/models\n", + "CHECKPOINT_ENCODER = os.path.join(base_checkpoint_path[model_type], 'BERT.pt') # Model encoder checkpoint file\n", + "CHECKPOINT_CONFIG = os.path.join(base_checkpoint_path[model_type], 'bert_config.json') # Model configuration file\n", + " \n", + "if not os.path.exists(CHECKPOINT_ENCODER):\n", + " raise OSError(errno.ENOENT, os.strerror(errno.ENOENT), CHECKPOINT_ENCODER)\n", + "\n", + "if not os.path.exists(CHECKPOINT_CONFIG):\n", + " raise OSError(errno.ENOENT, os.strerror(errno.ENOENT), CHECKPOINT_CONFIG)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Download training data\n", + "In this example we download the RE dataset chemprot to ./datasets/chemprot and process it with text_classification/data/import_datasets.py" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#download https://github.com/arwhirang/recursive_chemprot/blob/master/Demo/tree_LSTM/data/chemprot-data_treeLSTM.zip and extract it into ./datasets/chemprot\n", + "data_dir=\"./datasets\"\n", + "dataset=\"chemprot\"\n", + "if not os.path.exists(f\"{data_dir}/{dataset}\"):\n", + " !mkdir -p $data_dir/$dataset\n", + " !wget \"https://github.com/arwhirang/recursive_chemprot/blob/master/Demo/tree_LSTM/data/chemprot-data_treeLSTM.zip?raw=true\" -O data.zip\n", + " !unzip data.zip -d $data_dir/$dataset\n", + " !rm data.zip\n", + "\n", + "!python ../text_classification/data/import_datasets.py --source_data_dir=$data_dir/$dataset --target_data_dir=$data_dir/$dataset --dataset_name=$dataset\n", + "!ls -l $data_dir/$dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the previous step, you should have a ./datasets/chemprot folder that contains the following files:\n", + "- train.tsv\n", + "- test.tsv\n", + "- dev.tsv\n", + "- label_mapping.tsv\n", + "\n", + "The format of the data described in NeMo docs." 
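To get a feel for the processed data, you can print the label mapping and the first few training rows. This sketch assumes the import script wrote tab-separated text/label rows; it only prints raw lines, so it is safe even if a header row is present, and it reuses `data_dir` and `dataset` from the cell above.

```python
# Quick look at the processed ChemProt files.
with open(f"{data_dir}/{dataset}/label_mapping.tsv") as mapping_file:
    print(mapping_file.read())

with open(f"{data_dir}/{dataset}/train.tsv") as train_file:
    for line_no, line in enumerate(train_file):
        print(repr(line.rstrip("\n")))
        if line_no == 2:
            break
```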
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create Neural Modules" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_checkpoint=CHECKPOINT_ENCODER # language model encoder file\n", + "model_config=CHECKPOINT_CONFIG # model configuration file\n", + "train_data_text_file=f\"{data_dir}/{dataset}/train.tsv\"\n", + "eval_data_text_file=f\"{data_dir}/{dataset}/dev.tsv\"\n", + "fc_dropout=0.1\n", + "max_seq_length=128\n", + "batch_size=32\n", + "num_output_layers=1\n", + "\n", + "nf = nemo.core.NeuralModuleFactory(\n", + " backend=nemo.core.Backend.PyTorch,\n", + " placement=nemo.core.DeviceType.GPU\n", + ")\n", + "model = nemo_nlp.nm.trainables.get_pretrained_lm_model(\n", + " config=model_config, pretrained_model_name=pretrained_model_name[model_type], checkpoint=model_checkpoint\n", + " )\n", + "tokenizer = nemo.collections.nlp.data.tokenizers.get_tokenizer(\n", + " tokenizer_name='nemobert',\n", + " pretrained_model_name=pretrained_model_name[model_type],\n", + " do_lower_case=do_lower_case[model_type]\n", + ")\n", + "hidden_size = model.hidden_size\n", + "data_desc = TextClassificationDataDesc(data_dir=f\"{data_dir}/{dataset}\", modes=['train', 'dev'])\n", + "classifier = nemo_nlp.nm.trainables.SequenceClassifier( \n", + " hidden_size=hidden_size,\n", + " num_classes=data_desc.num_labels,\n", + " dropout=fc_dropout,\n", + " num_layers=num_output_layers,\n", + " log_softmax=False,\n", + ")\n", + "task_loss = CrossEntropyLossNM(weight=None)\n", + "train_data_layer = BertTextClassificationDataLayer(\n", + " tokenizer=tokenizer,\n", + " input_file=train_data_text_file,\n", + " max_seq_length=max_seq_length,\n", + " batch_size=batch_size,\n", + " shuffle=True,\n", + " use_cache=True\n", + ")\n", + "eval_data_layer = BertTextClassificationDataLayer(\n", + " tokenizer=tokenizer,\n", + " input_file=eval_data_text_file,\n", + " max_seq_length=max_seq_length,\n", + " batch_size=batch_size,\n", + " shuffle=False,\n", + " use_cache=False\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Creating Neural graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_data = train_data_layer()\n", + "train_hidden_states = model(input_ids=train_data.input_ids, token_type_ids=train_data.input_type_ids, attention_mask=train_data.input_mask)\n", + "train_logits = classifier(hidden_states=train_hidden_states)\n", + "loss = task_loss(logits=train_logits, labels=train_data.labels)\n", + "# If you're training on multiple GPUs, this should be\n", + "# len(train_data_layer) // (batch_size * batches_per_step * num_gpus)\n", + "train_steps_per_epoch = len(train_data_layer) // batch_size\n", + "logging.info(f\"doing {train_steps_per_epoch} steps per epoch\")\n", + "\n", + "eval_data = eval_data_layer()\n", + "eval_hidden_states = model(input_ids=eval_data.input_ids, token_type_ids=eval_data.input_type_ids, attention_mask=eval_data.input_mask)\n", + "eval_logits = classifier(hidden_states=eval_hidden_states)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Create Callbacks\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_callback = nemo.core.SimpleLossLoggerCallback(\n", + " tensors=[loss],\n", + " print_func=lambda x: logging.info(\"Loss: {:.3f}\".format(x[0].item())),\n", + " get_tb_values=lambda x: [[\"loss\", 
x[0]]],\n", + " step_freq=100,\n", + " tb_writer=nf.tb_writer,\n", + ")\n", + "\n", + "# Callback to evaluate the model\n", + "eval_callback = nemo.core.EvaluatorCallback(\n", + " eval_tensors=[eval_logits, eval_data.labels],\n", + " user_iter_callback=lambda x, y: eval_iter_callback(x, y, eval_data_layer),\n", + " user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, f'{nf.work_dir}/graphs'),\n", + " tb_writer=nf.tb_writer,\n", + " eval_step=500,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Training\n", + "Training could take several minutes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "num_epochs=3\n", + "lr_warmup_proportion=0.1\n", + "lr=3e-5\n", + "weight_decay=0.01\n", + "lr_policy_fn = get_lr_policy(\"WarmupAnnealing\", total_steps=num_epochs * train_steps_per_epoch, warmup_ratio=lr_warmup_proportion\n", + ")\n", + "nf.train(\n", + " tensors_to_optimize=[loss],\n", + " callbacks=[train_callback, eval_callback],\n", + " lr_policy=lr_policy_fn,\n", + " optimizer=\"adam_w\",\n", + " optimization_params={\"num_epochs\": num_epochs, \"lr\": lr, \"weight_decay\": weight_decay},\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The result should look something like this:\n", + "```\n", + "precision recall f1-score support\n", + " \n", + " 0 0.7328 0.8348 0.7805 115\n", + " 1 0.9402 0.9291 0.9346 7950\n", + " 2 0.8311 0.9146 0.8708 199\n", + " 3 0.6400 0.6302 0.6351 457\n", + " 4 0.8002 0.8317 0.8156 1093\n", + " 5 0.7228 0.7518 0.7370 548\n", + " \n", + " accuracy 0.8949 10362\n", + " macro avg 0.7778 0.8153 0.7956 10362\n", + " weighted avg 0.8963 0.8949 0.8954 10362\n", + "```" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "metadata": { + "collapsed": false + }, + "source": [] + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/nlp/question_answering/get_bioasq.py b/examples/nlp/question_answering/get_bioasq.py new file mode 100755 index 000000000000..1a07f8f025aa --- /dev/null +++ b/examples/nlp/question_answering/get_bioasq.py @@ -0,0 +1,104 @@ +#!/bin/bash +# ============================================================================= +# Copyright 2020 NVIDIA. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +# Disclaimer: +# All the data in this repository is no longer updated since 2019.Jan.24th and it may not reflect current data available. 
+# +#### BioASQ +# Before using the files in this repository, you must first register BioASQ website and download the [BioASQ Task B](http://participants-area.bioasq.org/Tasks/A/getData/) data. +# See "An overview of the BIOASQ large-scale biomedical semantic indexing and question answering competition (Tsatsaronis et al. 2015)" for datasets details. +# +# Copyright 2019 dmis-lab/biobert. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import argparse +import logging +import os +import subprocess +from distutils.dir_util import copy_tree + +from nemo import logging + +URL = { + 'bioasq': 'https://drive.google.com/uc?id=19ft5q44W4SuptJgTwR84xZjsHg1jvjSZ', + 'bioasq_6b': 'https://drive.google.com/uc?id=1-KzAQzaE-Zd4jOlZG_7k7D4odqPI3dL1', +} + + +def download(download_url: str, parent_dir: str): + tmp_zip = '/tmp/data.zip' + tmp_unzip = '/tmp/data' + if not os.path.exists(tmp_unzip): + os.makedirs(tmp_unzip) + else: + subprocess.run(['rm', '-rf', tmp_unzip]) + subprocess.run(['gdown', '-O', tmp_zip, download_url]) + subprocess.run(['unzip', tmp_zip, '-d', tmp_unzip]) + copy_tree(tmp_unzip, parent_dir) + subprocess.run(['rm', '-rf', tmp_zip]) + subprocess.run(['rm', '-rf', tmp_unzip]) + + +def __maybe_download_file(parent_dir: str): + """ + from https://github.com/dmis-lab/biobert download https://drive.google.com/uc?id=19ft5q44W4SuptJgTwR84xZjsHg1jvjSZ + from https://github.com/dmis-lab/bioasq-biobert#datasets https://drive.google.com/uc?id=1-KzAQzaE-Zd4jOlZG_7k7D4odqPI3dL1 + + If exists, skips download + Args: + parent_dir: local filepath + """ + target_dir = os.path.join(parent_dir, 'BioASQ') + if os.path.exists(target_dir): + logging.info(f'{target_dir} found. Skipping download') + else: + download_url = URL['bioasq'] + logging.info(f'Downloading {download_url} from https://github.com/dmis-lab/biobert to {target_dir}') + download(download_url, parent_dir) + parent_dir = target_dir + target_dir = os.path.join(parent_dir, 'BioASQ-6b') + if os.path.exists(target_dir): + logging.info(f'{target_dir} found. 
Skipping download') + else: + download_url = URL['bioasq_6b'] + logging.info( + f'Downloading {download_url} from https://github.com/dmis-lab/bioasq-biobert#datasets to {target_dir}' + ) + download(download_url, parent_dir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Prepare dataset') + parser.add_argument("--data_dir", required=True, type=str, help="directory to download dataset to") + args = parser.parse_args() + + if not os.path.exists(args.data_dir): + os.makedirs(args.data_dir) + + logging.info(f'Downloading dataset') + __maybe_download_file(args.data_dir) diff --git a/examples/nlp/question_answering/question_answering_squad.py b/examples/nlp/question_answering/question_answering_squad.py index 74f0f6d09208..c23f3aa27532 100755 --- a/examples/nlp/question_answering/question_answering_squad.py +++ b/examples/nlp/question_answering/question_answering_squad.py @@ -357,7 +357,7 @@ def create_pipeline( hidden_size = model.hidden_size qa_head = nemo_nlp.nm.trainables.TokenClassifier( - hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False + hidden_size=hidden_size, num_classes=2, num_layers=1, log_softmax=False, name="TokenClassifier" ) squad_loss = nemo_nlp.nm.losses.SpanningLoss()
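For reference, the BioASQ QA notebook above invokes this helper as `!python ../question_answering/get_bioasq.py --data_dir=$data_dir`. An equivalent standalone call is sketched below; it assumes the command is run from `examples/nlp/biobert_notebooks/` and that `gdown` and `unzip` are installed, since `download()` shells out to both.

```python
# Standalone invocation of get_bioasq.py, mirroring the QA notebook's data cell.
import subprocess

subprocess.run(
    ["python", "../question_answering/get_bioasq.py", "--data_dir=./datasets"],
    check=True,
)
# After this, ./datasets/BioASQ/ and ./datasets/BioASQ/BioASQ-6b/ contain the
# golden JSON files and the SQuAD-formatted train/test sets listed earlier.
```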