Skip to content

Commit

Permalink
Dialogue tutorial fix (#4218)
Browse files Browse the repository at this point in the history
* fix bugs for dialogue tutorial

Signed-off-by: Zhilin Wang <[email protected]>

* update path for convert_datasets.py due to conflict PR

Signed-off-by: Zhilin Wang <[email protected]>

* restore previously deleted files

Signed-off-by: Zhilin Wang <[email protected]>

* style fix

Signed-off-by: Zhilin Wang <[email protected]>
  • Loading branch information
Zhilin123 authored May 21, 2022
1 parent 215d059 commit 3852549
Show file tree
Hide file tree
Showing 2 changed files with 416 additions and 0 deletions.
151 changes: 151 additions & 0 deletions examples/nlp/intent_slot_classification/data/assistant_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import re
import shutil

from nemo.collections.nlp.data.data_utils.data_preprocessing import DATABASE_EXISTS_TMP, if_exist, write_files
from nemo.utils import logging


def copy_input_files(infold):
""" Put training files in convenient place for conversion to our format. """
our_infold = infold + "/dataset"

if os.path.exists(our_infold + "/trainset") and os.path.exists(our_infold + "/testset"):
logging.info("Input folders exists")
return

logging.info(f"Copying files to input folder: {our_infold}")
os.makedirs(infold, exist_ok=True)

old_infold = (
infold + '/CrossValidation/autoGeneFromRealAnno/autoGene_2018_03_22-13_01_25_169/CrossValidation/KFold_1'
)
if not os.path.exists(our_infold + "/trainset"):
shutil.copytree(old_infold + '/trainset', our_infold + '/trainset')

if not os.path.exists(our_infold + "/testset"):
shutil.copytree(old_infold + '/testset/csv', our_infold + '/testset')


def get_intents(infold):
""" Get list of intents from file names. """
intents = [f[:-4] for f in os.listdir(infold)]
intents.sort()
print(f'Found {len(intents)} intents')
return intents


def get_intent_queries(infold, intent_names, mode):
""" Get list of queries with their corresponding intent number. """
intent_queries = ['sentence\tlabel\n']

for index, intent in enumerate(intent_names):
queries = open(f'{infold}/{mode}set/{intent}.csv', 'r', encoding='utf-8').readlines()
for query in queries[1:]:
phrases = query.split(";")
intent_query = phrases[4][1:-1] + "\t" + str(index)
intent_queries.append(intent_query)

return intent_queries


def get_slots(infold, modes):
"""
Find a lost of unique slot types in training and testing data.
We use a single slot type name both for starting and continuation tokes (not using B-, I- notation).
"""
slots = set()

for mode in modes:
path = f'{infold}/{mode}set'
for filename in os.listdir(path):
lines = open(f'{path}/{filename}', 'r', encoding='utf-8').readlines()
for line in lines[1:]:
query = line.split(";")[3]
slot_phrases = re.findall('\[.*?\]', query)
for slot_phrase in slot_phrases:
slot = slot_phrase.split(" : ")[0][1:]
slots.add(slot)

slots = sorted(slots)
slots.append("O")
print(f'Found {len(slots)} slot types')
return slots


def get_slot_queries(infold, slot_dict, mode, intent_names):
""" Convert each word in a query to corresponding slot number. """
slot_queries = []
outside_slot = len(slot_dict) - 1

# keep the same order of files/queries as for intents
for intent in intent_names:
lines = open(f'{infold}/{mode}set/{intent}.csv', 'r', encoding='utf-8').readlines()
for line in lines[1:]:
slot_query = ""
query = line.split(";")[3]
words = query.split(" ")
current_slot = outside_slot
for word in words:
if word[0] == "[":
current_slot = slot_dict[word[1:]]
elif word[0] == ":":
continue
else:
slot_query += str(current_slot) + " "
if word[-1] == ']':
current_slot = outside_slot

slot_queries.append(slot_query.strip())

return slot_queries


def process_assistant(infold, outfold, modes=['train', 'test']):
"""
https://github.com/xliuhw/NLU-Evaluation-Data - this dataset includes
about 25 thousand examples with 66 various multi-domain intents and 57 entity types.
"""
if if_exist(outfold, [f'{mode}_slots.tsv' for mode in modes]):
logging.info(DATABASE_EXISTS_TMP.format('robot', outfold))
return outfold

logging.info(f'Processing assistant commands dataset and store at {outfold}')
os.makedirs(outfold, exist_ok=True)

# copy train/test files to the convenient directory to work with
copy_input_files(infold)
infold += "/dataset"

# get list of intents from train folder (test folder supposed to be the same)
intent_names = get_intents(infold + "/trainset")
write_files(intent_names, f'{outfold}/dict.intents.csv')

# get all train and test queries with their intent
for mode in modes:
intent_queries = get_intent_queries(infold, intent_names, mode)
write_files(intent_queries, f'{outfold}/{mode}.tsv')

# get list of all unique slots in training and testing files
slot_types = get_slots(infold, modes)
write_files(slot_types, f'{outfold}/dict.slots.csv')

# create files of slot queries
slot_dict = {k: v for v, k in enumerate(slot_types)}
for mode in modes:
slot_queries = get_slot_queries(infold, slot_dict, mode, intent_names)
write_files(slot_queries, f'{outfold}/{mode}_slots.tsv')
Loading

0 comments on commit 3852549

Please sign in to comment.