docs update (NVIDIA#450)
* docs update

Signed-off-by: Evelina Bakhturina <[email protected]>

* indent fix

Signed-off-by: Evelina Bakhturina <[email protected]>
Signed-off-by: VahidooX <[email protected]>
ekmb authored and VahidooX committed Mar 10, 2020
1 parent 7470170 commit 9e315de
Showing 4 changed files with 13 additions and 20 deletions.
2 changes: 1 addition & 1 deletion docs/sources/source/nlp/bert_pretraining.rst
@@ -84,7 +84,7 @@ To train on a Chinese dataset, you should use `NemoBertTokenizer`.
     # If you're using a custom vocabulary, create your tokenizer like this
     tokenizer = nemo_nlp.data.SentencePieceTokenizer(model_path="tokenizer.model")
-    special_tokens = nemo_nlp.utils.MODEL_SPECIAL_TOKENS['bert']
+    special_tokens = nemo_nlp.data.get_bert_special_tokens('bert')
     tokenizer.add_special_tokens(special_tokens)
     # Otherwise, create your tokenizer like this
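For context, the custom-vocabulary snippet in `bert_pretraining.rst` reads as follows after this change. A minimal sketch, assuming the `nemo_nlp` alias these docs use for `nemo.collections.nlp`:

.. code-block:: python

    # If you're using a custom vocabulary: tokenizer.model comes from the
    # SentencePiece training step described earlier in the doc
    tokenizer = nemo_nlp.data.SentencePieceTokenizer(model_path="tokenizer.model")

    # special tokens now come from nemo_nlp.data rather than nemo_nlp.utils
    special_tokens = nemo_nlp.data.get_bert_special_tokens('bert')
    tokenizer.add_special_tokens(special_tokens)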
23 changes: 8 additions & 15 deletions docs/sources/source/nlp/question_answering.rst
@@ -69,21 +69,6 @@ First, we instantiate Neural Module Factory which defines 1) backend (PyTorch),
         files_to_copy=[__file__],
         add_time_to_log_dir=True)
 
-We define the tokenizer which transforms text into BERT tokens, using `NemoBertTokenizer`.
-This will tokenize text following the mapping of the original BERT model.
-
-.. code-block:: python
-
-    hidden_size = model.hidden_size
-    tokenizer = nemo_nlp.data.NemoBertTokenizer(bert_derivate='bert', pretrained_model="bert-base-uncased")
-    # to use RoBERTa tokenizer, run e.g.
-    special_tokens_roberta = nemo_nlp.utils.MODEL_SPECIAL_TOKENS['roberta']
-    tokenizer = nemo_nlp.data.NemoBertTokenizer(bert_derivate='roberta', pretrained_model="roberta-base", special_tokens=special_tokens_roberta)
-    # to use Albert tokenizer, run e.g.
-    special_tokens_albert = nemo_nlp.utils.MODEL_SPECIAL_TOKENS['albert']
-    tokenizer = nemo_nlp.data.NemoBertTokenizer(bert_derivate='albert', pretrained_model="albert-base-v1", special_tokens=special_tokens_albert)
-
 Next, we define all Neural Modules participating in our question answering classification pipeline.
 
 * Process data: the `BertQuestionAnsweringDataLayer` is supposed to do the preprocessing of raw data into the format data supported by `SquadDataset`.
@@ -129,6 +114,14 @@ Next, we define all Neural Modules participating in our question answering classification pipeline.
     args.pretrained_model_name = "albert-base-v1"
     model = nemo_nlp.nm.trainables.huggingface.Albert(args.pretrained_model_name)
 
+* Define the tokenizer which transforms text into BERT tokens, using `NemoBertTokenizer`. This will tokenize text following the mapping of the original BERT model.
+
+.. code-block:: python
+
+    hidden_size = model.hidden_size
+    tokenizer = nemo_nlp.data.NemoBertTokenizer(pretrained_model=args.pretrained_model_name)
+
 * Create the classifier head for our task.
 
 .. code-block:: python
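The net effect of this diff: the tokenizer section moves below the model setup, and the per-architecture `bert_derivate` and `special_tokens` arguments drop away, so one call keyed on the pretrained model name covers BERT, RoBERTa, and ALBERT alike. A minimal sketch of the new usage (same `nemo_nlp` alias assumption; `model` is the pretrained module instantiated above):

.. code-block:: python

    # the pretrained model name alone now selects vocabulary and special tokens
    pretrained_model_name = "roberta-base"  # also: "bert-base-uncased", "albert-base-v1"
    tokenizer = nemo_nlp.data.NemoBertTokenizer(pretrained_model=pretrained_model_name)
    hidden_size = model.hidden_size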
6 changes: 3 additions & 3 deletions examples/nlp/language_modeling/BERTPretrainingTutorial.ipynb
@@ -118,7 +118,7 @@
"source": [
"# tokenizer.model file was created during Step 1\n",
"tokenizer = nemo_nlp.data.SentencePieceTokenizer(model_path=\"tokenizer.model\")\n",
"special_tokens = nemo_nlp.utils.MODEL_SPECIAL_TOKENS['bert']\n",
"special_tokens = nemo_nlp.data.get_bert_special_tokens('bert')\n",
"tokenizer.add_special_tokens(special_tokens)"
]
},
@@ -308,7 +308,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.7.4"
},
"pycharm": {
"stem_cell": {
@@ -322,4 +322,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 2
-}
+}
2 changes: 1 addition & 1 deletion examples/nlp/language_modeling/bert_pretraining.py
@@ -253,7 +253,7 @@
     args.max_seq_length = config['max_position_embeddings']
 
 if 'data_text' in sys.argv:
-    special_tokens = nemo_nlp.data.tokenizers.MODEL_SPECIAL_TOKENS['bert']
+    special_tokens = nemo_nlp.data.get_bert_special_tokens('bert')
 
     data_desc = BERTPretrainingDataDesc(
         args.dataset_name,
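For user code that still imports from the old locations, the migration applied across all four files is uniform. A sketch, with the old paths taken verbatim from the removed lines above:

.. code-block:: python

    # before (both removed spellings appear in this commit):
    #     special_tokens = nemo_nlp.utils.MODEL_SPECIAL_TOKENS['bert']
    #     special_tokens = nemo_nlp.data.tokenizers.MODEL_SPECIAL_TOKENS['bert']

    # after:
    special_tokens = nemo_nlp.data.get_bert_special_tokens('bert')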
