From 0b4ba3f08e0d0f957db809f38f9904cd7dc7a900 Mon Sep 17 00:00:00 2001
From: Zhilin Wang
Date: Thu, 19 May 2022 15:53:34 -0700
Subject: [PATCH 1/5] fix bugs for dialogue tutorial

Signed-off-by: Zhilin Wang
---
 tutorials/nlp/Dialogue.ipynb | 52 ++++++++++++++++++------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/tutorials/nlp/Dialogue.ipynb b/tutorials/nlp/Dialogue.ipynb
index aca5e11ce0a1..aaaf8eb09ce2 100644
--- a/tutorials/nlp/Dialogue.ipynb
+++ b/tutorials/nlp/Dialogue.ipynb
@@ -28,7 +28,7 @@
    "source": [
     "import os \n",
     "!apt-get update && apt-get install -y libsndfile1 ffmpeg\n",
-    "!git clone https://github.com/NVIDIA/NeMo --branch main\n",
+    "!git clone https://github.com/NVIDIA/NeMo --branch r1.9.0\n",
     "os.chdir('NeMo')\n",
     "!./reinstall.sh\n",
     "os.chdir('..')\n"
@@ -87,7 +87,7 @@
    "\n",
    "An example is:\n",
    "\n",
-    "* utterance: what alarms have i set for tomorrow intent: \n",
+    "* utterance: what alarms have i set for tomorrow \n",
    "* intent: alarm_query\n",
    "* slots: date(tomorrow)\n",
    "\n",
@@ -287,15 +287,20 @@
   },
   {
    "cell_type": "markdown",
-   "source": [
-    "## 1.4 (Optional) To train/ test a GPT2 model on the assistant dataset, run the cell below "
-   ],
    "metadata": {
     "id": "-44x5PqyrOeQ"
-   }
+   },
+   "source": [
+    "## 1.4 (Optional) To train/ test a GPT2 model on the assistant dataset, run the cell below "
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "QyqQbpR4rNHT"
+   },
+   "outputs": [],
    "source": [
     "# model.dataset.data_dir: folder to load data from\n",
     "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n",
@@ -312,15 +317,13 @@
     "  model.dataset.target_template=with_slots \\ \n",
     "  model.dataset.eval_mode=generation \\ \n",
     "  exp_manager.create_wandb_logger=False)"
-   ],
-   "metadata": {
-    "id": "QyqQbpR4rNHT"
-   },
-   "execution_count": null,
-   "outputs": []
+   ]
   },
   {
    "cell_type": "markdown",
+   "metadata": {
+    "id": "FbQ-6TVM1yQg"
+   },
    "source": [
     "**After 1 epoch:**\n",
     "\n",
@@ -417,10 +420,7 @@
     "    test_loss_epoch       0.019178826361894608\n",
     "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n",
     "```"
-   ],
-   "metadata": {
-    "id": "FbQ-6TVM1yQg"
-   }
+   ]
   },
   {
    "cell_type": "markdown",
@@ -437,7 +437,7 @@
    "\n",
    "An example is:\n",
    "\n",
-    "* utterance: I will be eating there at 11:30 am so make it for then.\n",
+    "* utterance: I will be eating there at 11:30 am so make the reservation for then.\n",
    "* intent: ReserveRestaurant\n",
    "* slots: {\"time\": \"11:30 am\"}\n",
    "\n",
@@ -498,14 +498,14 @@
   },
   {
    "cell_type": "code",
-   "source": [
-    "!ls sgd_gpt2_predictions"
-   ],
+   "execution_count": null,
    "metadata": {
     "id": "kGDlV5HvI2PQ"
    },
-   "execution_count": null,
-   "outputs": []
+   "outputs": [],
+   "source": [
+    "!ls sgd_gpt2_predictions"
+   ]
   },
   {
    "cell_type": "markdown",
@@ -515,7 +515,7 @@
    "source": [
     "**After 1 epoch:**\n",
     "\n",
-    "More epoches would needed to reach convergence.\n",
+    "More epochs would be needed to reach convergence.\n",
     "\n",
     "\n",
     "```\n",
@@ -590,7 +590,7 @@
    "An example is \n",
    "\n",
    "\n",
-    "* question: what county is nine mile in\n",
+    "* question: What county is Nine Mile in?\n",
    "* extracted_answer: Onondaga\n",
    "* fluent_answer: Nine Mile is in Onondaga county.\n"
   ]
@@ -667,7 +667,7 @@
    "source": [
     "**After 1 epoch:**\n",
     "\n",
-    "Train more epoches for optimal performance\n",
+    "Train for more epochs for optimal performance.\n",
     "\n",
     "```\n",
     "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n",
@@ -713,4 +713,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 0
-}
\ No newline at end of file
+}

From 7495f126479dc8993f031809a5976b8866718ec9 Mon Sep 17 00:00:00 2001
From: Zhilin Wang
Date: Thu, 19 May 2022 21:01:10 -0700
Subject: [PATCH 2/5] update path for convert_datasets.py due to a conflicting
 PR

Signed-off-by: Zhilin Wang
---
 tutorials/nlp/Dialogue.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/nlp/Dialogue.ipynb b/tutorials/nlp/Dialogue.ipynb
index aaaf8eb09ce2..853fb0345b4f 100644
--- a/tutorials/nlp/Dialogue.ipynb
+++ b/tutorials/nlp/Dialogue.ipynb
@@ -107,7 +107,7 @@
    "!wget https://github.com/xliuhw/NLU-Evaluation-Data/archive/master.zip\n",
    "!unzip master.zip\n",
    "# convert the dataset to the NeMo format\n",
-    "!python NeMo/examples/nlp/intent_slot_classification/data/import_datasets.py --dataset_name=assistant --source_data_dir=./NLU-Evaluation-Data-master --target_data_dir=./assistant\n"
+    "!python NeMo/scripts/dataset_processing/nlp/intent_and_slot/convert_datasets.py --dataset_name=assistant --source_data_dir=./NLU-Evaluation-Data-master --target_data_dir=./assistant"
   ]
  },
""" + our_infold = infold + "/dataset" + + if os.path.exists(our_infold + "/trainset") and os.path.exists(our_infold + "/testset"): + logging.info("Input folders exists") + return + + logging.info(f"Copying files to input folder: {our_infold}") + os.makedirs(infold, exist_ok=True) + + old_infold = ( + infold + '/CrossValidation/autoGeneFromRealAnno/autoGene_2018_03_22-13_01_25_169/CrossValidation/KFold_1' + ) + if not os.path.exists(our_infold + "/trainset"): + shutil.copytree(old_infold + '/trainset', our_infold + '/trainset') + + if not os.path.exists(our_infold + "/testset"): + shutil.copytree(old_infold + '/testset/csv', our_infold + '/testset') + + +def get_intents(infold): + """ Get list of intents from file names. """ + intents = [f[:-4] for f in os.listdir(infold)] + intents.sort() + print(f'Found {len(intents)} intents') + return intents + + +def get_intent_queries(infold, intent_names, mode): + """ Get list of queries with their corresponding intent number. """ + intent_queries = ['sentence\tlabel\n'] + + for index, intent in enumerate(intent_names): + queries = open(f'{infold}/{mode}set/{intent}.csv', 'r', encoding='utf-8').readlines() + for query in queries[1:]: + phrases = query.split(";") + intent_query = phrases[4][1:-1] + "\t" + str(index) + intent_queries.append(intent_query) + + return intent_queries + + +def get_slots(infold, modes): + """ + Find a lost of unique slot types in training and testing data. + We use a single slot type name both for starting and continuation tokes (not using B-, I- notation). + """ + slots = set() + + for mode in modes: + path = f'{infold}/{mode}set' + for filename in os.listdir(path): + lines = open(f'{path}/{filename}', 'r', encoding='utf-8').readlines() + for line in lines[1:]: + query = line.split(";")[3] + slot_phrases = re.findall('\[.*?\]', query) + for slot_phrase in slot_phrases: + slot = slot_phrase.split(" : ")[0][1:] + slots.add(slot) + + slots = sorted(slots) + slots.append("O") + print(f'Found {len(slots)} slot types') + return slots + + +def get_slot_queries(infold, slot_dict, mode, intent_names): + """ Convert each word in a query to corresponding slot number. """ + slot_queries = [] + outside_slot = len(slot_dict) - 1 + + # keep the same order of files/queries as for intents + for intent in intent_names: + lines = open(f'{infold}/{mode}set/{intent}.csv', 'r', encoding='utf-8').readlines() + for line in lines[1:]: + slot_query = "" + query = line.split(";")[3] + words = query.split(" ") + current_slot = outside_slot + for word in words: + if word[0] == "[": + current_slot = slot_dict[word[1:]] + elif word[0] == ":": + continue + else: + slot_query += str(current_slot) + " " + if word[-1] == ']': + current_slot = outside_slot + + slot_queries.append(slot_query.strip()) + + return slot_queries + + +def process_assistant(infold, outfold, modes=['train', 'test']): + """ + https://github.com/xliuhw/NLU-Evaluation-Data - this dataset includes + about 25 thousand examples with 66 various multi-domain intents and 57 entity types. 
+
+
+def process_assistant(infold, outfold, modes=['train', 'test']):
+    """
+    https://github.com/xliuhw/NLU-Evaluation-Data - this dataset includes
+    about 25 thousand examples with 66 multi-domain intents and 57 entity types.
+    """
+    if if_exist(outfold, [f'{mode}_slots.tsv' for mode in modes]):
+        logging.info(DATABASE_EXISTS_TMP.format('assistant', outfold))
+        return outfold
+
+    logging.info(f'Processing assistant commands dataset and storing it at {outfold}')
+    os.makedirs(outfold, exist_ok=True)
+
+    # copy train/test files to a convenient directory to work with
+    copy_input_files(infold)
+    infold += "/dataset"
+
+    # get the list of intents from the train folder (the test folder is supposed to be the same)
+    intent_names = get_intents(infold + "/trainset")
+    write_files(intent_names, f'{outfold}/dict.intents.csv')
+
+    # get all train and test queries with their intent
+    for mode in modes:
+        intent_queries = get_intent_queries(infold, intent_names, mode)
+        write_files(intent_queries, f'{outfold}/{mode}.tsv')
+
+    # get the list of all unique slots in the training and testing files
+    slot_types = get_slots(infold, modes)
+    write_files(slot_types, f'{outfold}/dict.slots.csv')
+
+    # create files of slot queries
+    slot_dict = {k: v for v, k in enumerate(slot_types)}
+    for mode in modes:
+        slot_queries = get_slot_queries(infold, slot_dict, mode, intent_names)
+        write_files(slot_queries, f'{outfold}/{mode}_slots.tsv')
\ No newline at end of file
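The restored `get_slots`/`get_slot_queries` helpers turn inline `[slot : value]` annotations into per-word slot-ID sequences. A small self-contained illustration of that parsing rule, using a hypothetical annotated query and slot dictionary (IDs follow the `dict.slots.csv` order, with "O" for outside as the last entry):

```python
# Illustration only: mimics the parsing rule in get_slot_queries on one
# hypothetical annotated query; the slot_dict below is an assumption.
slot_dict = {'date': 0, 'O': 1}
query = "what alarms have i set for [date : tomorrow]"
outside = slot_dict['O']

ids, current = [], outside
for word in query.split(" "):
    if word[0] == "[":            # "[date" -> switch to the named slot
        current = slot_dict[word[1:]]
    elif word[0] == ":":          # separator token inside the annotation
        continue
    else:
        ids.append(str(current))  # label the actual word
        if word[-1] == "]":       # "tomorrow]" closes the annotation
            current = outside

print(" ".join(ids))  # -> "1 1 1 1 1 1 0" (six outside words, then date)
```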
diff --git a/examples/nlp/intent_slot_classification/data/import_datasets.py b/examples/nlp/intent_slot_classification/data/import_datasets.py
new file mode 100644
index 000000000000..afcdcaa773c2
--- /dev/null
+++ b/examples/nlp/intent_slot_classification/data/import_datasets.py
@@ -0,0 +1,265 @@
+# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import shutil
+from os.path import exists
+
+from assistant_utils import process_assistant
+
+from nemo.collections.nlp.data.data_utils.data_preprocessing import (
+    DATABASE_EXISTS_TMP,
+    MODE_EXISTS_TMP,
+    create_dataset,
+    get_dataset,
+    get_vocab,
+    if_exist,
+)
+from nemo.utils import logging
+
+
+def ids2text(ids, vocab):
+    return ' '.join([vocab[int(id_)] for id_ in ids])
+
+
+def process_atis(infold, outfold, modes=['train', 'test'], do_lower_case=False):
+    """ MSFT's dataset, processed by Kaggle:
+    https://www.kaggle.com/siddhadev/atis-dataset-from-ms-cntk
+    """
+    vocab = get_vocab(f'{infold}/atis.dict.vocab.csv')
+
+    if if_exist(outfold, [f'{mode}.tsv' for mode in modes]):
+        logging.info(DATABASE_EXISTS_TMP.format('ATIS', outfold))
+        return outfold
+    logging.info(f'Processing ATIS dataset and storing at {outfold}.')
+
+    os.makedirs(outfold, exist_ok=True)
+
+    outfiles = {}
+    for mode in modes:
+        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w', encoding='utf-8')
+        outfiles[mode].write('sentence\tlabel\n')
+        outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w', encoding='utf-8')
+
+        queries = open(f'{infold}/atis.{mode}.query.csv', 'r', encoding='utf-8').readlines()
+        intents = open(f'{infold}/atis.{mode}.intent.csv', 'r', encoding='utf-8').readlines()
+        slots = open(f'{infold}/atis.{mode}.slots.csv', 'r', encoding='utf-8').readlines()
+
+        for i, query in enumerate(queries):
+            sentence = ids2text(query.strip().split()[1:-1], vocab)
+            if do_lower_case:
+                sentence = sentence.lower()
+            outfiles[mode].write(f'{sentence}\t{intents[i].strip()}\n')
+            slot = ' '.join(slots[i].strip().split()[1:-1])
+            outfiles[mode + '_slots'].write(slot + '\n')
+
+    shutil.copyfile(f'{infold}/atis.dict.intent.csv', f'{outfold}/dict.intents.csv')
+    shutil.copyfile(f'{infold}/atis.dict.slots.csv', f'{outfold}/dict.slots.csv')
+    for mode in modes:
+        outfiles[mode].close()
+
+
+def process_snips(infold, outfold, do_lower_case, modes=['train', 'test'], dev_split=0.1):
+    if not os.path.exists(infold):
+        link = 'https://github.com/snipsco/spoken-language-understanding-research-datasets'
+        raise ValueError(f'Data not found at {infold}. ' f'You may request to download the SNIPS dataset from {link}.')
+
+    exist = True
+    for dataset in ['light', 'speak', 'all']:
+        if if_exist(f'{outfold}/{dataset}', [f'{mode}.tsv' for mode in modes]):
+            logging.info(DATABASE_EXISTS_TMP.format('SNIPS-' + dataset, outfold))
+        else:
+            exist = False
+    if exist:
+        return outfold
+
+    logging.info(f'Processing SNIPS dataset and storing at folders "speak", "light" and "all" under {outfold}.')
+    logging.info(
+        f'Processing and importing "smart-speaker-en-close-field" -> "speak" and "smart-lights-en-close-field" -> "light".'
+ ) + + os.makedirs(outfold, exist_ok=True) + + speak_dir = 'smart-speaker-en-close-field' + light_dir = 'smart-lights-en-close-field' + + light_files = [f'{infold}/{light_dir}/dataset.json'] + speak_files = [f'{infold}/{speak_dir}/training_dataset.json'] + speak_files.append(f'{infold}/{speak_dir}/test_dataset.json') + + light_train, light_dev, light_slots, light_intents = get_dataset(light_files, dev_split) + speak_train, speak_dev, speak_slots, speak_intents = get_dataset(speak_files) + + create_dataset(light_train, light_dev, light_slots, light_intents, do_lower_case, f'{outfold}/light') + create_dataset(speak_train, speak_dev, speak_slots, speak_intents, do_lower_case, f'{outfold}/speak') + create_dataset( + light_train + speak_train, + light_dev + speak_dev, + light_slots | speak_slots, + light_intents | speak_intents, + do_lower_case, + f'{outfold}/all', + ) + + +def process_jarvis_datasets( + infold, outfold, modes=['train', 'test', 'dev'], do_lower_case=False, ignore_prev_intent=False +): + """ process and convert Jarvis datasets into NeMo's BIO format + """ + dataset_name = "jarvis" + if if_exist(outfold, ['dict.intents.csv', 'dict.slots.csv']): + logging.info(DATABASE_EXISTS_TMP.format(dataset_name, outfold)) + return outfold + + logging.info(f'Processing {dataset_name} dataset and storing at {outfold}') + + os.makedirs(outfold, exist_ok=True) + + outfiles = {} + intents_list = {} + slots_list = {} + slots_list_all = {} + + outfiles['dict_intents'] = open(f'{outfold}/dict.intents.csv', 'w', encoding='utf-8') + outfiles['dict_slots'] = open(f'{outfold}/dict.slots.csv', 'w', encoding='utf-8') + + outfiles['dict_slots'].write('O\n') + slots_list["O"] = 0 + slots_list_all["O"] = 0 + + for mode in modes: + if if_exist(outfold, [f'{mode}.tsv']): + logging.info(MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode)) + continue + + if not if_exist(infold, [f'{mode}.tsv']): + logging.info(f'{mode} mode of {dataset_name}' f' is skipped as it was not found.') + continue + + outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'), 'w', encoding='utf-8') + outfiles[mode].write('sentence\tlabel\n') + outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv', 'w', encoding='utf-8') + + queries = open(f'{infold}/{mode}.tsv', 'r', encoding='utf-8').readlines() + + for i, query in enumerate(queries): + line_splits = query.strip().split("\t") + if len(line_splits) == 3: + intent_str, slot_tags_str, sentence = line_splits + else: + intent_str, sentence = line_splits + slot_tags_str = "" + + if intent_str not in intents_list: + intents_list[intent_str] = len(intents_list) + outfiles['dict_intents'].write(f'{intent_str}\n') + + if ignore_prev_intent: + start_token = 2 + else: + start_token = 1 + + if do_lower_case: + sentence = sentence.lower() + sentence_cld = " ".join(sentence.strip().split()[start_token:-1]) + outfiles[mode].write(f'{sentence_cld}\t' f'{str(intents_list[intent_str])}\n') + + slot_tags_list = [] + if slot_tags_str.strip(): + slot_tags = slot_tags_str.strip().split(",") + for st in slot_tags: + if not st.strip(): + continue + [start_i, end_i, slot_name] = st.strip().split(":") + slot_tags_list.append([int(start_i), int(end_i), slot_name]) + if slot_name not in slots_list: + slots_list[slot_name] = len(slots_list) + slots_list_all[f'B-{slot_name}'] = len(slots_list_all) + slots_list_all[f'I-{slot_name}'] = len(slots_list_all) + outfiles['dict_slots'].write(f'B-{slot_name}\n') + outfiles['dict_slots'].write(f'I-{slot_name}\n') + + slot_tags_list.sort(key=lambda x: x[0]) + 
+
+            slots = []
+            processed_index = 0
+            for tag_start, tag_end, tag_str in slot_tags_list:
+                if tag_start > processed_index:
+                    words_list = sentence[processed_index:tag_start].strip().split()
+                    slots.extend([str(slots_list_all['O'])] * len(words_list))
+                words_list = sentence[tag_start:tag_end].strip().split()
+                slots.append(str(slots_list_all[f'B-{tag_str}']))
+                slots.extend([str(slots_list_all[f'I-{tag_str}'])] * (len(words_list) - 1))
+                processed_index = tag_end
+
+            if processed_index < len(sentence):
+                words_list = sentence[processed_index:].strip().split()
+                slots.extend([str(slots_list_all['O'])] * len(words_list))
+
+            slots = slots[1:-1]
+            slot = ' '.join(slots)
+            outfiles[mode + '_slots'].write(slot + '\n')
+
+        outfiles[mode + '_slots'].close()
+        outfiles[mode].close()
+
+    outfiles['dict_slots'].close()
+    outfiles['dict_intents'].close()
+
+    return outfold
+
+
+if __name__ == "__main__":
+    # Parse the command-line arguments.
+    parser = argparse.ArgumentParser(description="Process and convert datasets into NeMo\'s format.")
+    parser.add_argument(
+        "--dataset_name", required=True, type=str, choices=['atis', 'snips', 'jarvis', 'assistant'],
+    )
+    parser.add_argument(
+        "--source_data_dir", required=True, type=str, help='path to the folder containing the dataset files'
+    )
+    parser.add_argument("--target_data_dir", required=True, type=str, help='path to save the processed dataset')
+    parser.add_argument("--do_lower_case", action='store_true')
+    parser.add_argument(
+        "--ignore_prev_intent",
+        action='store_true',
+        help='ignores previous intent while importing datasets in jarvis\'s format',
+    )
+
+    args = parser.parse_args()
+
+    dataset_name = args.dataset_name
+    source_dir = args.source_data_dir
+    target_dir = args.target_data_dir
+
+    if not exists(source_dir):
+        raise FileNotFoundError(f"{source_dir} does not exist.")
+
+    if dataset_name == 'atis':
+        process_atis(infold=source_dir, outfold=target_dir, do_lower_case=args.do_lower_case)
+    elif dataset_name == 'snips':
+        process_snips(infold=source_dir, outfold=target_dir, do_lower_case=args.do_lower_case)
+    elif dataset_name == 'jarvis':
+        process_jarvis_datasets(
+            infold=source_dir,
+            outfold=target_dir,
+            modes=["train", "test", "dev"],
+            do_lower_case=args.do_lower_case,
+            ignore_prev_intent=args.ignore_prev_intent,
+        )
+    elif dataset_name == 'assistant':
+        process_assistant(infold=source_dir, outfold=target_dir)
+    else:
+        raise ValueError(f'Dataset {dataset_name} is not supported.')
\ No newline at end of file
diff --git a/tutorials/nlp/Dialogue.ipynb b/tutorials/nlp/Dialogue.ipynb
index 853fb0345b4f..aaaf8eb09ce2 100644
--- a/tutorials/nlp/Dialogue.ipynb
+++ b/tutorials/nlp/Dialogue.ipynb
@@ -107,7 +107,7 @@
    "!wget https://github.com/xliuhw/NLU-Evaluation-Data/archive/master.zip\n",
    "!unzip master.zip\n",
    "# convert the dataset to the NeMo format\n",
-    "!python NeMo/scripts/dataset_processing/nlp/intent_and_slot/convert_datasets.py --dataset_name=assistant --source_data_dir=./NLU-Evaluation-Data-master --target_data_dir=./assistant"
+    "!python NeMo/examples/nlp/intent_slot_classification/data/import_datasets.py --dataset_name=assistant --source_data_dir=./NLU-Evaluation-Data-master --target_data_dir=./assistant\n"
   ]
  },
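The Jarvis branch restored above expands character-span slot annotations (`start:end:slot_name`) into per-word B-/I-/O label IDs before writing the `_slots.tsv` files. A worked sketch of that expansion, using a hypothetical sentence, span, and label dictionary:

```python
# Illustration only: hypothetical sentence and character span in the
# "start:end:slot_name" style consumed by process_jarvis_datasets.
sentence = "wake me at seven am"
labels = {'O': 0, 'B-time': 1, 'I-time': 2}
spans = [(11, 19, 'time')]  # characters 11..19 cover "seven am"

out, done = [], 0
for start, end, name in spans:
    if start > done:  # words before the span are outside any slot
        out += [labels['O']] * len(sentence[done:start].split())
    span_words = sentence[start:end].split()
    out += [labels[f'B-{name}']] + [labels[f'I-{name}']] * (len(span_words) - 1)
    done = end
if done < len(sentence):  # trailing words after the last span
    out += [labels['O']] * len(sentence[done:].split())

print(out)  # -> [0, 0, 0, 1, 2]: "wake me at" = O O O, "seven am" = B-time I-time
```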
From 4a7bfa271dfae78931826c6fdaa18e220e3fdc58 Mon Sep 17 00:00:00 2001
From: Zhilin Wang
Date: Fri, 20 May 2022 15:39:09 -0700
Subject: [PATCH 4/5] style fix

Signed-off-by: Zhilin Wang
---
 examples/nlp/intent_slot_classification/data/assistant_utils.py | 2 +-
 examples/nlp/intent_slot_classification/data/import_datasets.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/nlp/intent_slot_classification/data/assistant_utils.py b/examples/nlp/intent_slot_classification/data/assistant_utils.py
index e45b3c7c96a2..4463d65705b8 100644
--- a/examples/nlp/intent_slot_classification/data/assistant_utils.py
+++ b/examples/nlp/intent_slot_classification/data/assistant_utils.py
@@ -148,4 +148,4 @@ def process_assistant(infold, outfold, modes=['train', 'test']):
     slot_dict = {k: v for v, k in enumerate(slot_types)}
     for mode in modes:
         slot_queries = get_slot_queries(infold, slot_dict, mode, intent_names)
-        write_files(slot_queries, f'{outfold}/{mode}_slots.tsv')
\ No newline at end of file
+        write_files(slot_queries, f'{outfold}/{mode}_slots.tsv')

diff --git a/examples/nlp/intent_slot_classification/data/import_datasets.py b/examples/nlp/intent_slot_classification/data/import_datasets.py
index afcdcaa773c2..bbbd54e97e05 100644
--- a/examples/nlp/intent_slot_classification/data/import_datasets.py
+++ b/examples/nlp/intent_slot_classification/data/import_datasets.py
@@ -262,4 +262,4 @@ def process_jarvis_datasets(
     elif dataset_name == 'assistant':
         process_assistant(infold=source_dir, outfold=target_dir)
     else:
-        raise ValueError(f'Dataset {dataset_name} is not supported.')
\ No newline at end of file
+        raise ValueError(f'Dataset {dataset_name} is not supported.')

From a42b7f0f18b978468058bb18f48cef6b58c78859 Mon Sep 17 00:00:00 2001
From: Zhilin Wang
Date: Fri, 20 May 2022 21:30:28 -0700
Subject: [PATCH 5/5] update tutorial

Signed-off-by: Zhilin Wang
---
 tutorials/nlp/Dialogue.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/nlp/Dialogue.ipynb b/tutorials/nlp/Dialogue.ipynb
index 853fb0345b4f..3c7288d93d05 100644
--- a/tutorials/nlp/Dialogue.ipynb
+++ b/tutorials/nlp/Dialogue.ipynb
@@ -107,7 +107,7 @@
    "!wget https://github.com/xliuhw/NLU-Evaluation-Data/archive/master.zip\n",
    "!unzip master.zip\n",
    "# convert the dataset to the NeMo format\n",
-    "!python NeMo/scripts/dataset_processing/nlp/intent_and_slot/convert_datasets.py --dataset_name=assistant --source_data_dir=./NLU-Evaluation-Data-master --target_data_dir=./assistant"
+    "!python NeMo/examples/nlp/intent_slot_classification/data/import_datasets.py --dataset_name=assistant --source_data_dir=./NLU-Evaluation-Data-master --target_data_dir=./assistant"
   ]
  },
 {
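With `import_datasets.py` restored, the tutorial's conversion step can also be driven directly from Python rather than through the CLI — a minimal sketch, assuming NeMo is installed, the converter's folder is added to `sys.path`, and the dataset archive was unzipped as in the tutorial:

```python
# Minimal sketch: call the restored converter directly instead of the
# tutorial's CLI invocation. The sys.path entry and the two directory
# paths are assumptions taken from the tutorial's clone/wget/unzip steps.
import sys
sys.path.append('NeMo/examples/nlp/intent_slot_classification/data')

from assistant_utils import process_assistant

process_assistant(
    infold='./NLU-Evaluation-Data-master',  # unzipped source dataset
    outfold='./assistant',                  # NeMo-format output folder
)
```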