-
Notifications
You must be signed in to change notification settings - Fork 2.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* fix bugs for dialogue tutorial Signed-off-by: Zhilin Wang <[email protected]> * update path for convert_datasets.py due to conflict PR Signed-off-by: Zhilin Wang <[email protected]> * restore previously deleted files Signed-off-by: Zhilin Wang <[email protected]> * style fix Signed-off-by: Zhilin Wang <[email protected]>
- Loading branch information
Showing
2 changed files
with
416 additions
and
0 deletions.
There are no files selected for viewing
151 changes: 151 additions & 0 deletions
151
examples/nlp/intent_slot_classification/data/assistant_utils.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
|
||
import os | ||
import re | ||
import shutil | ||
|
||
from nemo.collections.nlp.data.data_utils.data_preprocessing import DATABASE_EXISTS_TMP, if_exist, write_files | ||
from nemo.utils import logging | ||
|
||
|
||
def copy_input_files(infold):
    """
    Stage the train/test folds under ``<infold>/dataset`` for later conversion.

    The first cross-validation fold (``KFold_1``) shipped inside ``infold`` is
    copied to ``<infold>/dataset/trainset`` and ``<infold>/dataset/testset``.
    A destination folder that already exists is left untouched; when both
    exist the function only logs a message and returns.

    Args:
        infold: root folder of the downloaded dataset.
    """
    target_root = infold + "/dataset"
    train_target = target_root + "/trainset"
    test_target = target_root + "/testset"

    # Nothing to do when both folds were already staged by a previous run.
    if os.path.exists(train_target) and os.path.exists(test_target):
        logging.info("Input folders exists")
        return

    logging.info(f"Copying files to input folder: {target_root}")
    os.makedirs(infold, exist_ok=True)

    source_root = (
        infold + '/CrossValidation/autoGeneFromRealAnno/autoGene_2018_03_22-13_01_25_169/CrossValidation/KFold_1'
    )
    # (source, destination) pairs; the test fold lives one level deeper, in "csv".
    copy_plan = (
        (source_root + '/trainset', train_target),
        (source_root + '/testset/csv', test_target),
    )
    for source, destination in copy_plan:
        if not os.path.exists(destination):
            shutil.copytree(source, destination)
|
||
|
||
def get_intents(infold):
    """
    Get the sorted list of intents from the ``.csv`` file names in ``infold``.

    Each intent is expected to be stored as ``<intent>.csv``. Entries without
    the ``.csv`` suffix (hidden files, notes, ...) are ignored: stripping four
    characters from them would yield bogus intent names for which no
    ``<intent>.csv`` file exists.

    Args:
        infold: folder that contains one ``.csv`` file per intent.

    Returns:
        Alphabetically sorted list of intent names (file names without ``.csv``).
    """
    suffix = '.csv'
    intents = sorted(name[: -len(suffix)] for name in os.listdir(infold) if name.endswith(suffix))
    print(f'Found {len(intents)} intents')
    return intents
|
||
|
||
def get_intent_queries(infold, intent_names, mode):
    """
    Get list of queries with their corresponding intent number.

    For every intent, ``<infold>/<mode>set/<intent>.csv`` is read, its first
    line (header) is skipped, and each remaining line contributes one entry
    ``"<sentence>\\t<intent index>"``.

    Args:
        infold: folder that contains the ``<mode>set`` sub-folders.
        intent_names: intent names; the position in this list is the label.
        mode: data split prefix, e.g. ``'train'`` or ``'test'``.

    Returns:
        List of strings starting with the header ``'sentence\\tlabel\\n'``.

    Raises:
        IndexError: if a data line has fewer than five ``;``-separated fields.
    """
    intent_queries = ['sentence\tlabel\n']

    for index, intent in enumerate(intent_names):
        # Context manager closes the handle deterministically (no leaked descriptor).
        with open(f'{infold}/{mode}set/{intent}.csv', 'r', encoding='utf-8') as csv_file:
            queries = csv_file.readlines()

        for query in queries[1:]:
            phrases = query.split(";")
            # Fifth field without its first and last character
            # (presumably the surrounding quotes - TODO confirm against the data).
            sentence = phrases[4][1:-1]
            intent_queries.append(f"{sentence}\t{index}")

    return intent_queries
|
||
|
||
def get_slots(infold, modes):
    """
    Find a list of unique slot types in training and testing data.
    We use a single slot type name both for starting and continuation tokens (not using B-, I- notation).

    Every file in ``<infold>/<mode>set`` is scanned (first line skipped as a
    header). Slot annotations are the bracketed spans ``[slot : words]`` found
    in the fourth ``;``-separated field of each line.

    Args:
        infold: folder that contains the ``<mode>set`` sub-folders.
        modes: iterable of data split prefixes, e.g. ``['train', 'test']``.

    Returns:
        Alphabetically sorted slot names followed by the outside label ``"O"``
        as the last element.

    Raises:
        IndexError: if a data line has fewer than four ``;``-separated fields.
    """
    # Raw string: '\[' in a normal literal is an invalid escape sequence.
    slot_pattern = re.compile(r'\[.*?\]')
    slots = set()

    for mode in modes:
        path = f'{infold}/{mode}set'
        for filename in os.listdir(path):
            # Context manager closes the handle deterministically (no leaked descriptor).
            with open(f'{path}/{filename}', 'r', encoding='utf-8') as csv_file:
                lines = csv_file.readlines()

            for line in lines[1:]:
                query = line.split(";")[3]
                for slot_phrase in slot_pattern.findall(query):
                    # "[time : five am]" -> "[time" -> "time"
                    slots.add(slot_phrase.split(" : ")[0][1:])

    slots = sorted(slots)
    # The outside label must stay last.
    slots.append("O")
    print(f'Found {len(slots)} slot types')
    return slots
|
||
|
||
def get_slot_queries(infold, slot_dict, mode, intent_names):
    """
    Convert each word in a query to corresponding slot number.

    The annotated query is the fourth ``;``-separated field of each data line
    and looks like ``wake me at [time : five am]``. Plain words get the outside
    label; words inside a bracketed span get the label of that span's slot.
    The ``[slot`` and ``:`` marker tokens themselves produce no label.

    Args:
        infold: folder that contains the ``<mode>set`` sub-folders.
        slot_dict: mapping from slot name to label index. The outside label is
            taken to be ``len(slot_dict) - 1``, i.e. the last index.
        mode: data split prefix, e.g. ``'train'`` or ``'test'``.
        intent_names: intent names; files are read in this order so that the
            result lines up with the intent queries.

    Returns:
        List with one space-separated label string per query.

    Raises:
        IndexError: if a data line has fewer than four ``;``-separated fields.
        KeyError: if an annotated slot name is missing from ``slot_dict``.
    """
    slot_queries = []
    outside_slot = len(slot_dict) - 1

    # keep the same order of files/queries as for intents
    for intent in intent_names:
        # Context manager closes the handle deterministically (no leaked descriptor).
        with open(f'{infold}/{mode}set/{intent}.csv', 'r', encoding='utf-8') as csv_file:
            lines = csv_file.readlines()

        for line in lines[1:]:
            labels = []
            current_slot = outside_slot
            for word in line.split(";")[3].split(" "):
                if not word:
                    # Repeated/trailing spaces yield empty tokens; indexing them would fail.
                    continue
                if word[0] == "[":
                    # "[slot" opens an annotated span.
                    current_slot = slot_dict[word[1:]]
                elif word[0] == ":":
                    continue
                else:
                    labels.append(str(current_slot))
                    if word[-1] == ']':
                        # "word]" closes the span.
                        current_slot = outside_slot

            slot_queries.append(" ".join(labels))

    return slot_queries
|
||
|
||
def process_assistant(infold, outfold, modes=('train', 'test')):
    """
    https://github.com/xliuhw/NLU-Evaluation-Data - this dataset includes
    about 25 thousand examples with 66 various multi-domain intents and 57 entity types.

    Converts the dataset found in ``infold`` and writes into ``outfold``:
    ``dict.intents.csv``, ``dict.slots.csv`` and, for every mode,
    ``<mode>.tsv`` (queries with intent label) and ``<mode>_slots.tsv``
    (per-word slot labels).

    Args:
        infold: root folder of the downloaded dataset.
        outfold: folder that receives the converted files (created if missing).
        modes: data split prefixes to process. The default is an immutable
            tuple so it cannot be shared and mutated across calls.

    Returns:
        ``outfold`` when the converted files already exist and nothing is done;
        otherwise the function falls through and returns ``None``.
    """
    # Skip the whole conversion when the slot files of every mode are already there.
    if if_exist(outfold, [f'{mode}_slots.tsv' for mode in modes]):
        logging.info(DATABASE_EXISTS_TMP.format('robot', outfold))
        return outfold

    logging.info(f'Processing assistant commands dataset and store at {outfold}')
    os.makedirs(outfold, exist_ok=True)

    # copy train/test files to the convenient directory to work with
    copy_input_files(infold)
    infold += "/dataset"

    # get list of intents from train folder (test folder supposed to be the same)
    intent_names = get_intents(infold + "/trainset")
    write_files(intent_names, f'{outfold}/dict.intents.csv')

    # get all train and test queries with their intent
    for mode in modes:
        intent_queries = get_intent_queries(infold, intent_names, mode)
        write_files(intent_queries, f'{outfold}/{mode}.tsv')

    # get list of all unique slots in training and testing files
    slot_types = get_slots(infold, modes)
    write_files(slot_types, f'{outfold}/dict.slots.csv')

    # create files of slot queries
    slot_dict = {slot: index for index, slot in enumerate(slot_types)}
    for mode in modes:
        slot_queries = get_slot_queries(infold, slot_dict, mode, intent_names)
        write_files(slot_queries, f'{outfold}/{mode}_slots.tsv')
Oops, something went wrong.