From 0b4ba3f08e0d0f957db809f38f9904cd7dc7a900 Mon Sep 17 00:00:00 2001
From: Zhilin Wang
Date: Thu, 19 May 2022 15:53:34 -0700
Subject: [PATCH 1/5] fix bugs for dialogue tutorial

Signed-off-by: Zhilin Wang
---
 tutorials/nlp/Dialogue.ipynb | 52 ++++++++++++++++++------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/tutorials/nlp/Dialogue.ipynb b/tutorials/nlp/Dialogue.ipynb
index aca5e11ce0a1..aaaf8eb09ce2 100644
--- a/tutorials/nlp/Dialogue.ipynb
+++ b/tutorials/nlp/Dialogue.ipynb
@@ -28,7 +28,7 @@
    "source": [
     "import os \n",
     "!apt-get update && apt-get install -y libsndfile1 ffmpeg\n",
-    "!git clone https://github.com/NVIDIA/NeMo --branch main\n",
+    "!git clone https://github.com/NVIDIA/NeMo --branch r1.9.0\n",
     "os.chdir('NeMo')\n",
     "!./reinstall.sh\n",
     "os.chdir('..')\n"
@@ -87,7 +87,7 @@
    "\n",
    "An example is:\n",
    "\n",
-    "* utterance: what alarms have i set for tomorrow intent: \n",
+    "* utterance: what alarms have i set for tomorrow \n",
    "* intent: alarm_query\n",
    "* slots: date(tomorrow)\n",
    "\n",
@@ -287,15 +287,20 @@
   },
   {
    "cell_type": "markdown",
-   "source": [
-    "## 1.4 (Optional) To train/ test a GPT2 model on the assistant dataset, run the cell below "
-   ],
    "metadata": {
     "id": "-44x5PqyrOeQ"
-   }
+   },
+   "source": [
+    "## 1.4 (Optional) To train/ test a GPT2 model on the assistant dataset, run the cell below "
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "QyqQbpR4rNHT"
+   },
+   "outputs": [],
    "source": [
     "# model.dataset.data_dir: folder to load data from\n",
     "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n",
@@ -312,15 +317,13 @@
     "  model.dataset.target_template=with_slots \\ \n",
     "  model.dataset.eval_mode=generation \\ \n",
     "  exp_manager.create_wandb_logger=False)"
-   ],
-   "metadata": {
-    "id": "QyqQbpR4rNHT"
-   },
-   "execution_count": null,
-   "outputs": []
+   ]
   },
   {
    "cell_type": "markdown",
+   "metadata": {
+    "id": "FbQ-6TVM1yQg"
+   },
    "source": [
     "**After 1 epoch:**\n",
     "\n",
@@ -417,10 +420,7 @@
     "    test_loss_epoch       0.019178826361894608\n",
     "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n",
     "```"
-   ],
-   "metadata": {
-    "id": "FbQ-6TVM1yQg"
-   }
+   ]
   },
   {
    "cell_type": "markdown",
@@ -437,7 +437,7 @@
    "\n",
    "An example is:\n",
    "\n",
-    "* utterance: I will be eating there at 11:30 am so make it for then.\n",
+    "* utterance: I will be eating there at 11:30 am so make the reservation for then.\n",
    "* intent: ReserveRestaurant\n",
    "* slots: {\"time\": \"11:30 am\"}\n",
    "\n",
@@ -498,14 +498,14 @@
   },
   {
    "cell_type": "code",
-   "source": [
-    "!ls sgd_gpt2_predictions"
-   ],
+   "execution_count": null,
    "metadata": {
     "id": "kGDlV5HvI2PQ"
    },
-   "execution_count": null,
-   "outputs": []
+   "outputs": [],
+   "source": [
+    "!ls sgd_gpt2_predictions"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -515,7 +515,7 @@
    "source": [
     "**After 1 epoch:**\n",
     "\n",
-    "More epoches would needed to reach convergence.\n",
+    "More epochs would be needed to reach convergence.\n",
     "\n",
     "\n",
     "```\n",
@@ -590,7 +590,7 @@
    "An example is \n",
    "\n",
    "\n",
-    "* question: what county is nine mile in\n",
+    "* question: What county is Nine Mile in?\n",
    "* extracted_answer: Onondaga\n",
    "* fluent_answer: Nine Mile is in Onondaga county.\n"
   ]
@@ -667,7 +667,7 @@
    "source": [
     "**After 1 epoch:**\n",
     "\n",
-    "Train more epoches for optimal performance\n",
+    "Train for more epochs for optimal performance.\n",
     "\n",
     "```\n",
     "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n",
@@ -713,4 +713,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 0
-}
\ No newline at end of file
+}

From 7495f126479dc8993f031809a5976b8866718ec9 Mon Sep 17 00:00:00 2001
From: Zhilin Wang
Date: Thu, 19 May 2022 21:01:10 -0700
Subject: [PATCH 2/5] update path for convert_datasets.py due to a conflicting
 PR

Signed-off-by: Zhilin Wang
---
 tutorials/nlp/Dialogue.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/nlp/Dialogue.ipynb b/tutorials/nlp/Dialogue.ipynb
index aaaf8eb09ce2..853fb0345b4f 100644
--- a/tutorials/nlp/Dialogue.ipynb
+++ b/tutorials/nlp/Dialogue.ipynb
@@ -107,7 +107,7 @@
    "!wget https://github.com/xliuhw/NLU-Evaluation-Data/archive/master.zip\n",
    "!unzip master.zip\n",
    "# convert the dataset to the NeMo format\n",
-    "!python NeMo/examples/nlp/intent_slot_classification/data/import_datasets.py --dataset_name=assistant --source_data_dir=./NLU-Evaluation-Data-master --target_data_dir=./assistant\n"
+    "!python NeMo/scripts/dataset_processing/nlp/intent_and_slot/convert_datasets.py --dataset_name=assistant --source_data_dir=./NLU-Evaluation-Data-master --target_data_dir=./assistant"
   ]
  },
""" + our_infold = infold + "/dataset" + + if os.path.exists(our_infold + "/trainset") and os.path.exists(our_infold + "/testset"): + logging.info("Input folders exists") + return + + logging.info(f"Copying files to input folder: {our_infold}") + os.makedirs(infold, exist_ok=True) + + old_infold = ( + infold + '/CrossValidation/autoGeneFromRealAnno/autoGene_2018_03_22-13_01_25_169/CrossValidation/KFold_1' + ) + if not os.path.exists(our_infold + "/trainset"): + shutil.copytree(old_infold + '/trainset', our_infold + '/trainset') + + if not os.path.exists(our_infold + "/testset"): + shutil.copytree(old_infold + '/testset/csv', our_infold + '/testset') + + +def get_intents(infold): + """ Get list of intents from file names. """ + intents = [f[:-4] for f in os.listdir(infold)] + intents.sort() + print(f'Found {len(intents)} intents') + return intents + + +def get_intent_queries(infold, intent_names, mode): + """ Get list of queries with their corresponding intent number. """ + intent_queries = ['sentence\tlabel\n'] + + for index, intent in enumerate(intent_names): + queries = open(f'{infold}/{mode}set/{intent}.csv', 'r', encoding='utf-8').readlines() + for query in queries[1:]: + phrases = query.split(";") + intent_query = phrases[4][1:-1] + "\t" + str(index) + intent_queries.append(intent_query) + + return intent_queries + + +def get_slots(infold, modes): + """ + Find a lost of unique slot types in training and testing data. + We use a single slot type name both for starting and continuation tokes (not using B-, I- notation). + """ + slots = set() + + for mode in modes: + path = f'{infold}/{mode}set' + for filename in os.listdir(path): + lines = open(f'{path}/{filename}', 'r', encoding='utf-8').readlines() + for line in lines[1:]: + query = line.split(";")[3] + slot_phrases = re.findall('\[.*?\]', query) + for slot_phrase in slot_phrases: + slot = slot_phrase.split(" : ")[0][1:] + slots.add(slot) + + slots = sorted(slots) + slots.append("O") + print(f'Found {len(slots)} slot types') + return slots + + +def get_slot_queries(infold, slot_dict, mode, intent_names): + """ Convert each word in a query to corresponding slot number. """ + slot_queries = [] + outside_slot = len(slot_dict) - 1 + + # keep the same order of files/queries as for intents + for intent in intent_names: + lines = open(f'{infold}/{mode}set/{intent}.csv', 'r', encoding='utf-8').readlines() + for line in lines[1:]: + slot_query = "" + query = line.split(";")[3] + words = query.split(" ") + current_slot = outside_slot + for word in words: + if word[0] == "[": + current_slot = slot_dict[word[1:]] + elif word[0] == ":": + continue + else: + slot_query += str(current_slot) + " " + if word[-1] == ']': + current_slot = outside_slot + + slot_queries.append(slot_query.strip()) + + return slot_queries + + +def process_assistant(infold, outfold, modes=['train', 'test']): + """ + https://github.com/xliuhw/NLU-Evaluation-Data - this dataset includes + about 25 thousand examples with 66 various multi-domain intents and 57 entity types. 
+
+
+def process_assistant(infold, outfold, modes=['train', 'test']):
+    """
+    https://github.com/xliuhw/NLU-Evaluation-Data - this dataset includes
+    about 25 thousand examples with 66 multi-domain intents and 57 entity types.
+    """
+    if if_exist(outfold, [f'{mode}_slots.tsv' for mode in modes]):
+        logging.info(DATABASE_EXISTS_TMP.format('assistant', outfold))
+        return outfold
+
+    logging.info(f'Processing assistant commands dataset and storing it at {outfold}')
+    os.makedirs(outfold, exist_ok=True)
+
+    # copy train/test files to a convenient directory to work with
+    copy_input_files(infold)
+    infold += "/dataset"
+
+    # get the list of intents from the train folder (the test folder is supposed to be the same)
+    intent_names = get_intents(infold + "/trainset")
+    write_files(intent_names, f'{outfold}/dict.intents.csv')
+
+    # get all train and test queries with their intent
+    for mode in modes:
+        intent_queries = get_intent_queries(infold, intent_names, mode)
+        write_files(intent_queries, f'{outfold}/{mode}.tsv')
+
+    # get the list of all unique slots in the training and testing files
+    slot_types = get_slots(infold, modes)
+    write_files(slot_types, f'{outfold}/dict.slots.csv')
+
+    # create files of slot queries
+    slot_dict = {k: v for v, k in enumerate(slot_types)}
+    for mode in modes:
+        slot_queries = get_slot_queries(infold, slot_dict, mode, intent_names)
+        write_files(slot_queries, f'{outfold}/{mode}_slots.tsv')
\ No newline at end of file
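The restored `get_slots`/`get_slot_queries` helpers turn inline `[slot : value]` annotations into per-word slot-ID sequences. A small self-contained illustration of that parsing rule, using a hypothetical annotated query and slot dictionary (IDs follow the `dict.slots.csv` order, with "O" for outside as the last entry):

```python
# Illustration only: mimics the parsing rule in get_slot_queries on one
# hypothetical annotated query; the slot_dict below is an assumption.
slot_dict = {'date': 0, 'O': 1}
query = "what alarms have i set for [date : tomorrow]"
outside = slot_dict['O']

ids, current = [], outside
for word in query.split(" "):
    if word[0] == "[":            # "[date" -> switch to the named slot
        current = slot_dict[word[1:]]
    elif word[0] == ":":          # separator token inside the annotation
        continue
    else:
        ids.append(str(current))  # label the actual word
        if word[-1] == "]":       # "tomorrow]" closes the annotation
            current = outside

print(" ".join(ids))  # -> "1 1 1 1 1 1 0" (six outside words, then date)
```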
diff --git a/examples/nlp/intent_slot_classification/data/import_datasets.py b/examples/nlp/intent_slot_classification/data/import_datasets.py
new file mode 100644
index 000000000000..afcdcaa773c2
--- /dev/null
+++ b/examples/nlp/intent_slot_classification/data/import_datasets.py
@@ -0,0 +1,265 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import shutil
+from os.path import exists
+
+from assistant_utils import process_assistant
+
+from nemo.collections.nlp.data.data_utils.data_preprocessing import (
+    DATABASE_EXISTS_TMP,
+    MODE_EXISTS_TMP,
+    create_dataset,
+    get_dataset,
+    get_vocab,
+    if_exist,
+)
+from nemo.utils import logging
+
+
+def ids2text(ids, vocab):
+    return ' '.join([vocab[int(id_)] for id_ in ids])
+
+
+def process_atis(infold, outfold, modes=['train', 'test'], do_lower_case=False):
+    """ MSFT's dataset, processed by Kaggle:
+    https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk
+    """
+    vocab = get_vocab(f'{infold}/atis.dict.vocab.csv')
+
+    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
+        logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold))
+        return outfold
+    logging.info(f'Processing ATIS dataset and storing at {outfold}.')
+
+    os.makedirs(outfold, exist_ok=True)
+
+    outfiles = {}
+    for mode in modes:
+        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w', encoding='utf-8')
+        outfiles[mode].write('sentence\tlabel\n')
+        outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w', encoding='utf-8')
+
+        queries = open(f'{infold}/atis.{mode}.query.csv', 'r', encoding='utf-8').readlines()
+        intents = open(f'{infold}/atis.{mode}.intent.csv', 'r', encoding='utf-8').readlines()
+        slots = open(f'{infold}/atis.{mode}.slots.csv', 'r', encoding='utf-8').readlines()
+
+        for i, query in enumerate(queries):
+            sentence = ids2text(query.strip().split()[1:-1], vocab)
+            if do_lower_case:
+                sentence = sentence.lower()
+            outfiles[mode].write(f'{sentence}\t{intents[i].strip()}\n')
+            slot = ' '.join(slots[i].strip().split()[1:-1])
+            outfiles[mode + '_slots'].write(slot + '\n')
+
+    shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv')
+    shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv')
+    for mode in modes:
+        outfiles[mode].close()
+
+
+def process_snips(infold, outfold, do_lower_case, modes=['train', 'test'], dev_split=0.1):
+    if not os.path.exists(infold):
+        link = 'https://github.com/snipsco/spoken-language-understanding-research-datasets'
+        raise ValueError(f'Data not found at {infold}. ' f'You may request to download the SNIPS dataset from {link}.')
+
+    exist = True
+    for dataset in ['light', 'speak', 'all']:
+        if if_exist(f'{outfold}/{dataset}', [f'{mode}.tsv' for mode in modes]):
+            logging.info(DATABASE_EXISTS_TMP.format('SNIPS-' + dataset, outfold))
+        else:
+            exist = False
+    if exist:
+        return outfold
+
+    logging.info(f'Processing SNIPS dataset and storing at folders "speak", "light" and "all" under {outfold}.')
+    logging.info(
+        f'Processing and importing "smart-speaker-en-close-field" -> "speak" and "smart-lights-en-close-field" -> "light".'
+ ) + + os.makedirs(outfold, exist_ok=True) + + speak_dir = 'smart-speaker-en-close-field' + light_dir = 'smart-lights-en-close-field' + + light_files = [f'{infold}/{light_dir}/dataset.json'] + speak_files = [f'{infold}/{speak_dir}/training_dataset.json'] + speak_files.append(f'{infold}/{speak_dir}/test_dataset.json') + + light_train, light_dev, light_slots, light_intents = get_dataset(light_files, dev_split) + speak_train, speak_dev, speak_slots, speak_intents = get_dataset(speak_files) + + create_dataset(light_train, light_dev, light_slots, light_intents, do_lower_case, f'{outfold}/light') + create_dataset(speak_train, speak_dev, speak_slots, speak_intents, do_lower_case, f'{outfold}/speak') + create_dataset( + light_train + speak_train, + light_dev + speak_dev, + light_slots | speak_slots, + light_intents | speak_intents, + do_lower_case, + f'{outfold}/all', + ) + + +def process_jarvis_datasets( + infold, outfold, modes=['train', 'test', 'dev'], do_lower_case=False, ignore_prev_intent=False +): + """ process and convert Jarvis datasets into NeMo's BIO format + """ + dataset_name = "jarvis" + if if_exist(outfold, ['dict.intents.csv', 'dict.slots.csv']): + logging.info(DATABASE_EXISTS_TMP.format(dataset_name, outfold)) + return outfold + + logging.info(f'Processing {dataset_name} dataset and storing at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + intents_list = {} + slots_list = {} + slots_list_all = {} + + outfiles['dict_intents'] = open(f'{outfold}/dict.intents.csv', 'w', encoding='utf-8') + outfiles['dict_slots'] = open(f'{outfold}/dict.slots.csv', 'w', encoding='utf-8') + + outfiles['dict_slots'].write('O\n') + slots_list["O"] = 0 + slots_list_all["O"] = 0 + + for mode in modes: + if if_exist(outfold, [f'{mode}.tsv']): + logging.info(MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode)) + continue + + if not if_exist(infold, [f'{mode}.tsv']): + logging.info(f'{mode} mode of {dataset_name}' f' is skipped as it was not found.') + continue + + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w', encoding='utf-8') + outfiles[mode].write('sentence\tlabel\n') + outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w', encoding='utf-8') + + queries = open(f'{infold}/{mode}.tsv', 'r', encoding='utf-8').readlines() + + for i, query in enumerate(queries): + line_splits = query.strip().split("\t") + if len(line_splits) == 3: + intent_str, slot_tags_str, sentence = line_splits + else: + intent_str, sentence = line_splits + slot_tags_str = "" + + if intent_str not in intents_list: + intents_list[intent_str] = len(intents_list) + outfiles['dict_intents'].write(f'{intent_str}\n') + + if ignore_prev_intent: + start_token = 2 + else: + start_token = 1 + + if do_lower_case: + sentence = sentence.lower() + sentence_cld = " ".join(sentence.strip().split()[start_token:-1]) + outfiles[mode].write(f'{sentence_cld}\t' f'{str(intents_list[intent_str])}\n') + + slot_tags_list = [] + if slot_tags_str.strip(): + slot_tags = slot_tags_str.strip().split(",") + for st in slot_tags: + if not st.strip(): + continue + [start_i, end_i, slot_name] = st.strip().split(":") + slot_tags_list.append([int(start_i), int(end_i), slot_name]) + if slot_name not in slots_list: + slots_list[slot_name] = len(slots_list) + slots_list_all[f'B-{slot_name}'] = len(slots_list_all) + slots_list_all[f'I-{slot_name}'] = len(slots_list_all) + outfiles['dict_slots'].write(f'B-{slot_name}\n') + outfiles['dict_slots'].write(f'I-{slot_name}\n') + + slot_tags_list.sort(key=lambda x: x[0]) + 
+
+            slots = []
+            processed_index = 0
+            for tag_start, tag_end, tag_str in slot_tags_list:
+                if tag_start > processed_index:
+                    words_list = sentence[processed_index:tag_start].strip().split()
+                    slots.extend([str(slots_list_all['O'])] * len(words_list))
+                words_list = sentence[tag_start:tag_end].strip().split()
+                slots.append(str(slots_list_all[f'B-{tag_str}']))
+                slots.extend([str(slots_list_all[f'I-{tag_str}'])] * (len(words_list) - 1))
+                processed_index = tag_end
+
+            if processed_index < len(sentence):
+                words_list = sentence[processed_index:].strip().split()
+                slots.extend([str(slots_list_all['O'])] * len(words_list))
+
+            slots = slots[1:-1]
+            slot = ' '.join(slots)
+            outfiles[mode + '_slots'].write(slot + '\n')
+
+        outfiles[mode + '_slots'].close()
+        outfiles[mode].close()
+
+    outfiles['dict_slots'].close()
+    outfiles['dict_intents'].close()
+
+    return outfold
+
+
+if __name__ == "__main__":
+    # Parse the command-line arguments.
+    parser = argparse.ArgumentParser(description="Process and convert datasets into NeMo\'s format.")
+    parser.add_argument(
+        "--dataset_name", required=True, type=str, choices=['atis', 'snips', 'jarvis', 'assistant'],
+    )
+    parser.add_argument(
+        "--source_data_dir", required=True, type=str, help='path to the folder containing the dataset files'
+    )
+    parser.add_argument("--target_data_dir", required=True, type=str, help='path to save the processed dataset')
+    parser.add_argument("--do_lower_case", action='store_true')
+    parser.add_argument(
+        "--ignore_prev_intent",
+        action='store_true',
+        help='ignores previous intent while importing datasets in jarvis\'s format',
+    )
+
+    args = parser.parse_args()
+
+    dataset_name = args.dataset_name
+    source_dir = args.source_data_dir
+    target_dir = args.target_data_dir
+
+    if not exists(source_dir):
+        raise FileNotFoundError(f"{source_dir} does not exist.")
+
+    if dataset_name == 'atis':
+        process_atis(infold=source_dir, outfold=target_dir, do_lower_case=args.do_lower_case)
+    elif dataset_name == 'snips':
+        process_snips(infold=source_dir, outfold=target_dir, do_lower_case=args.do_lower_case)
+    elif dataset_name == 'jarvis':
+        process_jarvis_datasets(
+            infold=source_dir,
+            outfold=target_dir,
+            modes=["train", "test", "dev"],
+            do_lower_case=args.do_lower_case,
+            ignore_prev_intent=args.ignore_prev_intent,
+        )
+    elif dataset_name == 'assistant':
+        process_assistant(infold=source_dir, outfold=target_dir)
+    else:
+        raise ValueError(f'Dataset {dataset_name} is not supported.')
\ No newline at end of file
diff --git a/tutorials/nlp/Dialogue.ipynb b/tutorials/nlp/Dialogue.ipynb
index 853fb0345b4f..aaaf8eb09ce2 100644
--- a/tutorials/nlp/Dialogue.ipynb
+++ b/tutorials/nlp/Dialogue.ipynb
@@ -107,7 +107,7 @@
    "!wget https://github.com/xliuhw/NLU-Evaluation-Data/archive/master.zip\n",
    "!unzip master.zip\n",
    "# convert the dataset to the NeMo format\n",
-    "!python NeMo/scripts/dataset_processing/nlp/intent_and_slot/convert_datasets.py --dataset_name=assistant --source_data_dir=./NLU-Evaluation-Data-master --target_data_dir=./assistant"
+    "!python NeMo/examples/nlp/intent_slot_classification/data/import_datasets.py --dataset_name=assistant --source_data_dir=./NLU-Evaluation-Data-master --target_data_dir=./assistant\n"
   ]
  },
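The Jarvis branch restored above expands character-span slot annotations (`start:end:slot_name`) into per-word B-/I-/O label IDs before writing the `_slots.tsv` files. A worked sketch of that expansion, using a hypothetical sentence, span, and label dictionary:

```python
# Illustration only: hypothetical sentence and character span in the
# "start:end:slot_name" style consumed by process_jarvis_datasets.
sentence = "wake me at seven am"
labels = {'O': 0, 'B-time': 1, 'I-time': 2}
spans = [(11, 19, 'time')]  # characters 11..19 cover "seven am"

out, done = [], 0
for start, end, name in spans:
    if start > done:  # words before the span are outside any slot
        out += [labels['O']] * len(sentence[done:start].split())
    span_words = sentence[start:end].split()
    out += [labels[f'B-{name}']] + [labels[f'I-{name}']] * (len(span_words) - 1)
    done = end
if done < len(sentence):  # trailing words after the last span
    out += [labels['O']] * len(sentence[done:].split())

print(out)  # -> [0, 0, 0, 1, 2]: "wake me at" = O O O, "seven am" = B-time I-time
```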
From 4a7bfa271dfae78931826c6fdaa18e220e3fdc58 Mon Sep 17 00:00:00 2001
From: Zhilin Wang
Date: Fri, 20 May 2022 15:39:09 -0700
Subject: [PATCH 4/5] style fix

Signed-off-by: Zhilin Wang
---
 examples/nlp/intent_slot_classification/data/assistant_utils.py | 2 +-
 examples/nlp/intent_slot_classification/data/import_datasets.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/nlp/intent_slot_classification/data/assistant_utils.py b/examples/nlp/intent_slot_classification/data/assistant_utils.py
index e45b3c7c96a2..4463d65705b8 100644
--- a/examples/nlp/intent_slot_classification/data/assistant_utils.py
+++ b/examples/nlp/intent_slot_classification/data/assistant_utils.py
@@ -148,4 +148,4 @@ def process_assistant(infold, outfold, modes=['train', 'test']):
     slot_dict = {k: v for v, k in enumerate(slot_types)}
     for mode in modes:
         slot_queries = get_slot_queries(infold, slot_dict, mode, intent_names)
-        write_files(slot_queries, f'{outfold}/{mode}_slots.tsv')
\ No newline at end of file
+        write_files(slot_queries, f'{outfold}/{mode}_slots.tsv')

diff --git a/examples/nlp/intent_slot_classification/data/import_datasets.py b/examples/nlp/intent_slot_classification/data/import_datasets.py
index afcdcaa773c2..bbbd54e97e05 100644
--- a/examples/nlp/intent_slot_classification/data/import_datasets.py
+++ b/examples/nlp/intent_slot_classification/data/import_datasets.py
@@ -262,4 +262,4 @@ def process_jarvis_datasets(
     elif dataset_name == 'assistant':
         process_assistant(infold=source_dir, outfold=target_dir)
     else:
-        raise ValueError(f'Dataset {dataset_name} is not supported.')
\ No newline at end of file
+        raise ValueError(f'Dataset {dataset_name} is not supported.')

From a42b7f0f18b978468058bb18f48cef6b58c78859 Mon Sep 17 00:00:00 2001
From: Zhilin Wang
Date: Fri, 20 May 2022 21:30:28 -0700
Subject: [PATCH 5/5] update tutorial

Signed-off-by: Zhilin Wang
---
 tutorials/nlp/Dialogue.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/nlp/Dialogue.ipynb b/tutorials/nlp/Dialogue.ipynb
index 853fb0345b4f..3c7288d93d05 100644
--- a/tutorials/nlp/Dialogue.ipynb
+++ b/tutorials/nlp/Dialogue.ipynb
@@ -107,7 +107,7 @@
    "!wget https://github.com/xliuhw/NLU-Evaluation-Data/archive/master.zip\n",
    "!unzip master.zip\n",
    "# convert the dataset to the NeMo format\n",
-    "!python NeMo/scripts/dataset_processing/nlp/intent_and_slot/convert_datasets.py --dataset_name=assistant --source_data_dir=./NLU-Evaluation-Data-master --target_data_dir=./assistant"
+    "!python NeMo/examples/nlp/intent_slot_classification/data/import_datasets.py --dataset_name=assistant --source_data_dir=./NLU-Evaluation-Data-master --target_data_dir=./assistant"
   ]
  },
 {
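With `import_datasets.py` restored, the tutorial's conversion step can also be driven directly from Python rather than through the CLI — a minimal sketch, assuming NeMo is installed, the converter's folder is added to `sys.path`, and the dataset archive was unzipped as in the tutorial:

```python
# Minimal sketch: call the restored converter directly instead of the
# tutorial's CLI invocation. The sys.path entry and the two directory
# paths are assumptions taken from the tutorial's clone/wget/unzip steps.
import sys
sys.path.append('NeMo/examples/nlp/intent_slot_classification/data')

from assistant_utils import process_assistant

process_assistant(
    infold='./NLU-Evaluation-Data-master',  # unzipped source dataset
    outfold='./assistant',                  # NeMo-format output folder
)
```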