update new aliasing code, add constants and test cases #124

Open · wants to merge 3 commits into main
5 changes: 5 additions & 0 deletions slu/config/alias-eval.yaml
@@ -0,0 +1,5 @@
# Sample Format
intent_aliased:
  - intent1
  - intent2
  - intent3
5 changes: 5 additions & 0 deletions slu/config/alias-train.yaml
@@ -0,0 +1,5 @@
# Sample Format
intent_aliased:
  - intent1
  - intent2
  - intent3
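
Both alias files share one format: a top-level mapping from an alias name to the list of intents it replaces. As a minimal sketch of what the loader does with this file (it mirrors YamlAliasConfig.get_alias_dict further down; assumes PyYAML and the sample file above on disk):

import yaml

# Illustrative only: invert {alias: [intents]} into {intent: alias}.
with open("slu/config/alias-train.yaml", "r", encoding="utf8") as handle:
    config_dict = yaml.safe_load(handle)

alias_map = {}
for alias, intents in (config_dict or {}).items():
    for intent in intents:
        alias_map.setdefault(intent, alias)

# With the sample file: {"intent1": "intent_aliased", "intent2": "intent_aliased", "intent3": "intent_aliased"}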
2 changes: 2 additions & 0 deletions slu/slu/constants/__init__.py
@@ -103,6 +103,8 @@
CONFIG_PATH = os.path.join("config", "config.yaml")
PROMPTS_CONFIG_PATH = os.path.join("config", "prompts.yaml")
MISSING_PROMPTS_PATH = os.path.join("config", "missing_prompts.yaml")
ALIAS_TRAIN_PATH = os.path.join("config", "alias-train.yaml")
ALIAS_EVAL_PATH = os.path.join("config", "alias-eval.yaml")
INTENT_LABEL_ENCODER = "labelencoder.pkl"
ENTITY_LABELS = "entity_label_list.pkl"
XLMR = "xlmroberta"
9 changes: 8 additions & 1 deletion slu/slu/dev/test.py
@@ -18,6 +18,7 @@
from slu.src.controller.prediction import get_predictions
from slu.utils import logger
from slu.utils.config import Config, YAMLLocalConfig
from slu.utils.preprocessing import make_label_column_uniform, make_data_column_uniform, make_reftime_column_uniform


def zoom_out_labels(labels: List[str]):
@@ -145,10 +146,16 @@ def test_classifier(args: argparse.Namespace):
    predict_api = get_predictions(const.TEST, config=config, debug=False)
    dataset = dataset or config.get_dataset(const.CLASSIFICATION, f"{const.TEST}.csv")
    test_df = pd.read_csv(dataset)
    test_df = make_label_column_uniform(test_df, const.ALIAS_EVAL_PATH)
    test_df = make_data_column_uniform(test_df)
    test_df = make_reftime_column_uniform(test_df)
    test_df = test_df[~test_df[const.TAG].isin(config.tasks.classification.skip)]
    test_df = test_df[test_df[const.ALTERNATIVES] != "[]"]
    test_df = test_df.replace({const.TAG: config.tasks.classification.alias})

    logger.info(
        f"Model will be tested for the following classes:\
        \n{test_df[const.TAG].value_counts(dropna=False)}"
    )
    logger.info("Running predictions")
    predictions = []
    logger.disable("slu")
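
To make the filtering in test_classifier concrete, here is a toy pandas sketch; the column names and the skip value are stand-ins for const.TAG, const.ALTERNATIVES, and config.tasks.classification.skip:

import pandas as pd

test_df = pd.DataFrame(
    {"tag": ["intent1", "_skip_", "intent2"], "alternatives": ['[["hi"]]', '[["ok"]]', "[]"]}
)
test_df = test_df[~test_df["tag"].isin(["_skip_"])]  # drop tags on the skip list
test_df = test_df[test_df["alternatives"] != "[]"]   # drop rows with no transcripts
# Only the "intent1" row survives.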
91 changes: 12 additions & 79 deletions slu/slu/dev/train.py
@@ -1,3 +1,4 @@
"""
Routine for Classifier and NER training.
"""
from slu.utils.preprocessing import make_label_column_uniform, make_data_column_uniform, make_reftime_column_uniform
@@ -17,75 +18,7 @@
from slu.src.controller.processors import SLUPipeline
from slu.utils import logger
from slu.utils.config import Config, YAMLLocalConfig


def make_label_column_uniform(data_frame: pd.DataFrame) -> None:
    if const.INTENTS in data_frame.columns:
        column = const.INTENTS
    elif const.LABELS in data_frame.columns:
        column = const.LABELS
    elif const.TAG in data_frame.columns:
        column = const.TAG
    else:
        raise ValueError(
            f"Expected one of {const.LABELS}, {const.TAG} to be present in the dataset."
        )
    data_frame.rename(columns={column: const.TAG}, inplace=True)


def reftime_patterns(reftime: str):
    time_fns = [
        datetime.fromisoformat,
        lambda date_string: datetime.strptime(
            date_string, "%Y-%m-%d %H:%M:%S.%f %z %Z"
        ),
        lambda date_string: datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ"),
        lambda date_string: datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S.%f%z"),
    ]
    for time_fn in time_fns:
        try:
            return time_fn(reftime)
        except ValueError:
            continue
    raise ValueError(f"Could not parse reftime {reftime}")


def make_reftime_column_uniform(data_frame: pd.DataFrame) -> None:
    if const.REFERENCE_TIME not in data_frame.columns:
        return

    for i, row in tqdm(
        data_frame.iterrows(), total=len(data_frame), desc="Fixing reference time"
    ):
        if row[const.REFERENCE_TIME] is not None and not pd.isna(
            row[const.REFERENCE_TIME]
        ):
            data_frame.loc[i, const.REFERENCE_TIME] = reftime_patterns(
                row[const.REFERENCE_TIME]
            ).isoformat()


def make_data_column_uniform(data_frame: pd.DataFrame) -> None:
    if const.ALTERNATIVES in data_frame.columns:
        column = const.ALTERNATIVES
    elif const.DATA in data_frame.columns:
        column = const.DATA
    else:
        raise ValueError(
            f"Expected one of {const.ALTERNATIVES}, {const.DATA} to be present in the dataset."
        )
    data_frame.rename(columns={column: const.ALTERNATIVES}, inplace=True)

    for i, row in tqdm(
        data_frame.iterrows(), total=len(data_frame), desc="Fixing data structure"
    ):
        if isinstance(row[const.ALTERNATIVES], str):
            data = json.loads(row[const.ALTERNATIVES])
            if const.ALTERNATIVES in data:
                data_frame.loc[i, const.ALTERNATIVES] = json.dumps(
                    data[const.ALTERNATIVES]
                )

def create_data_splits(args: argparse.Namespace) -> None:
"""
@@ -115,18 +48,14 @@ def create_data_splits(args: argparse.Namespace) -> None:

    data_frame = pd.read_csv(dataset_file)
    logger.debug(f"Data frame: {data_frame.shape}")
    data_frame = make_label_column_uniform(data_frame, const.ALIAS_TRAIN_PATH)
    skip_list = config.get_skip_list(const.CLASSIFICATION)
    # Replacing intents with their alias
    data_frame = data_frame.replace({const.TAG: config.tasks.classification.alias})
    skip_filter = data_frame[const.TAG].isin(skip_list)
    logger.info(
        f"Model will be trained for the following classes:\
        \n{data_frame[const.TAG].value_counts(dropna=False)}"
    )
    make_label_column_uniform(data_frame)
    make_data_column_uniform(data_frame)
    make_reftime_column_uniform(data_frame)

    skip_filter = data_frame[const.TAG].isin(skip_list)

    failed_transcripts = data_frame[const.ALTERNATIVES].isin(["[[]]", "[]"])
    non_empty_transcripts = data_frame[const.ALTERNATIVES].isna()
    invalid_samples = skip_filter | non_empty_transcripts | failed_transcripts
@@ -190,9 +119,13 @@ def train_intent_classifier(args: argparse.Namespace) -> None:
logger.info("Preparing dataset.")
dataset = dataset or config.get_dataset(const.CLASSIFICATION, f"{const.TRAIN}.csv")
data_frame = pd.read_csv(dataset)
make_label_column_uniform(data_frame)
make_data_column_uniform(data_frame)
make_reftime_column_uniform(data_frame)
data_frame = make_label_column_uniform(data_frame, const.ALIAS_TRAIN_PATH)
data_frame = make_data_column_uniform(data_frame)
data_frame = make_reftime_column_uniform(data_frame)
logger.info(
f"Model will be trained for the following classes:\
\n{data_frame[const.TAG].value_counts(dropna=False)}"
)

logger.info("Training started.")
workflow.train(data_frame)
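
Note the call-site change across train.py and test.py: the moved helpers now return the (possibly re-bound) DataFrame instead of only mutating it in place, so every caller reassigns, e.g. data_frame = make_data_column_uniform(data_frame).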
73 changes: 73 additions & 0 deletions slu/slu/utils/config.py
@@ -242,6 +242,79 @@ def generate(self) -> dict:

return self.config_dict

class YamlAliasConfig(ConfigDataProviderInterface):
    """
    An instance of this class can:
    - Load and validate an alias.yaml file.
    """

    def __init__(
        self, config_path: Optional[str] = None, debug: Optional[bool] = True
    ) -> None:
        self.config_path: str = config_path
        self.debug: bool = debug
        self.intents: set = set()

    def get_config_dict(self) -> dict:
        """
        Read an alias.yaml file in the common format.
        """
        with open(self.config_path, "r", encoding="utf8") as handle:
            config_dict = yaml.safe_load(handle)
            if not isinstance(config_dict, dict):
                logger.debug("No aliases found")
            return config_dict

    def get_alias_dict(self) -> dict:
        """
        Convert the common format (alias: [intents]) to a dictionary mapping each intent to its alias.
        """
        alias_dict: dict = {}
        for alias, intents in self.config_dict.items():
            for intent in intents:
                alias_dict.setdefault(intent, alias)
        return alias_dict

    def get_config_path(self) -> str:
        return self.config_path

    def validate(self) -> None:
        if not self.config_dict:
            return

        if not all(isinstance(key, str) for key in self.config_dict):
            raise TypeError(
                f"Invalid or malformed key, please make sure {self.config_path} is correctly defined"
            )

        for key, values in self.config_dict.items():
            if not isinstance(values, list) or not all(
                isinstance(value, str) and valid_string(value) for value in values
            ):
                raise TypeError(
                    f"Invalid or malformed value, please make sure {self.config_path} is correctly defined"
                )

            for value in values:
                if value in self.intents:
                    raise ValueError(
                        f"Duplicate intent found: {value}. Please check {self.config_path}"
                    )
                self.intents.add(value)

    def generate(self) -> dict:
        """
        Create, validate, and return a dictionary mapping each intent to its alias.
        :rtype: Dict[str, str]
        """
        self.config_dict: dict = self.get_config_dict()
        self.validate()
        return self.get_alias_dict() if self.config_dict else {}


def load_gen_config():
    project_config_map = YAMLLocalConfig().generate()
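
A usage sketch for the new provider, assuming the sample alias-train.yaml above is on disk at const.ALIAS_TRAIN_PATH:

from slu import constants as const
from slu.utils.config import YamlAliasConfig

alias_map = YamlAliasConfig(config_path=const.ALIAS_TRAIN_PATH).generate()
# {"intent1": "intent_aliased", "intent2": "intent_aliased", "intent3": "intent_aliased"}

Malformed files raise instead of returning a partial map: a non-string or empty entry raises TypeError, and an intent listed under two aliases raises ValueError (see the invalid-alias fixtures below).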
82 changes: 82 additions & 0 deletions slu/slu/utils/preprocessing.py
@@ -0,0 +1,82 @@
import json
from datetime import datetime

import pandas as pd
from tqdm import tqdm

from slu import constants as const
from slu.utils import logger
from slu.utils.config import YamlAliasConfig

def make_label_column_uniform(data_frame: pd.DataFrame, alias_yaml: str) -> pd.DataFrame:
    alias_map = YamlAliasConfig(config_path=alias_yaml).generate()

    if const.INTENTS in data_frame.columns:
        column = const.INTENTS
    elif const.LABELS in data_frame.columns:
        column = const.LABELS
    elif const.TAG in data_frame.columns:
        column = const.TAG
    else:
        raise ValueError(
            f"Expected one of {const.INTENTS}, {const.LABELS}, {const.TAG} to be present in the dataset."
        )
    data_frame.rename(columns={column: const.TAG}, inplace=True)
    data_frame = data_frame.replace({const.TAG: alias_map})
    return data_frame

def reftime_patterns(reftime: str):
    time_fns = [
        datetime.fromisoformat,
        lambda date_string: datetime.strptime(
            date_string, "%Y-%m-%d %H:%M:%S.%f %z %Z"
        ),
        lambda date_string: datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ"),
        lambda date_string: datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S.%f%z"),
    ]
    for time_fn in time_fns:
        try:
            return time_fn(reftime)
        except ValueError:
            continue
    raise ValueError(f"Could not parse reftime {reftime}")


def make_reftime_column_uniform(data_frame: pd.DataFrame) -> pd.DataFrame:
    if const.REFERENCE_TIME not in data_frame.columns:
        return data_frame

    for i, row in tqdm(
        data_frame.iterrows(), total=len(data_frame), desc="Fixing reference time"
    ):
        if row[const.REFERENCE_TIME] is not None and not pd.isna(
            row[const.REFERENCE_TIME]
        ):
            data_frame.loc[i, const.REFERENCE_TIME] = reftime_patterns(
                row[const.REFERENCE_TIME]
            ).isoformat()

    return data_frame

def make_data_column_uniform(data_frame: pd.DataFrame) -> pd.DataFrame:
    if const.ALTERNATIVES in data_frame.columns:
        column = const.ALTERNATIVES
    elif const.DATA in data_frame.columns:
        column = const.DATA
    else:
        raise ValueError(
            f"Expected one of {const.ALTERNATIVES}, {const.DATA} to be present in the dataset."
        )
    data_frame.rename(columns={column: const.ALTERNATIVES}, inplace=True)

    for i, row in tqdm(
        data_frame.iterrows(), total=len(data_frame), desc="Fixing data structure"
    ):
        if isinstance(row[const.ALTERNATIVES], str):
            data = json.loads(row[const.ALTERNATIVES])
            if const.ALTERNATIVES in data:
                data_frame.loc[i, const.ALTERNATIVES] = json.dumps(
                    data[const.ALTERNATIVES]
                )

    return data_frame
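
A quick illustrative run of the three helpers on a one-row frame. The nested-payload shape below is an assumption for demonstration; only the wrapping key (const.ALTERNATIVES) matters to make_data_column_uniform, and the sample alias-train.yaml above is assumed to be on disk:

import json
import pandas as pd

from slu import constants as const
from slu.utils.preprocessing import (
    make_data_column_uniform,
    make_label_column_uniform,
    make_reftime_column_uniform,
)

df = pd.DataFrame(
    {
        const.LABELS: ["intent1"],
        const.DATA: [json.dumps({const.ALTERNATIVES: [[{"transcript": "hello"}]]})],
        const.REFERENCE_TIME: ["2023-01-01T00:00:00Z"],
    }
)
df = make_label_column_uniform(df, const.ALIAS_TRAIN_PATH)  # rename to const.TAG, then map intents to aliases
df = make_data_column_uniform(df)     # rename to const.ALTERNATIVES, unwrap the nested payload
df = make_reftime_column_uniform(df)  # rewrite the reference time via datetime.isoformat()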
Empty file.
@@ -0,0 +1,9 @@
intent_aliased1:
  - intent1
  - intent2
  - intent3

intent_aliased2:
  - intent1
  - intent4
  - intent5
@@ -0,0 +1,6 @@
intent_aliased1:
  - intent1
  - intent2
  - intent3

intent_aliased2:
@@ -0,0 +1,4 @@
intent_aliased1:
  - intent1
  - intent2
  -
14 changes: 14 additions & 0 deletions slu/tests/test_intent_aliasing/data/test_cases.yaml
@@ -0,0 +1,14 @@
- type: alias_validity
  args:
    file: tests/test_contextual_slu/data/input/invalid-alias-1.yaml
    is_valid: False

- type: alias_validity
  args:
    file: tests/test_contextual_slu/data/input/invalid-alias-2.yaml
    is_valid: False

- type: alias_validity
  args:
    file: tests/test_contextual_slu/data/input/invalid-alias-3.yaml
    is_valid: False
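
The PR does not show the runner that consumes these cases; one plausible pytest sketch, treating a file as invalid when YamlAliasConfig.generate() raises (the test_cases.yaml path is taken from this diff, and test_alias_validity is a hypothetical name):

import pytest
import yaml

from slu.utils.config import YamlAliasConfig

with open("slu/tests/test_intent_aliasing/data/test_cases.yaml", "r", encoding="utf8") as handle:
    TEST_CASES = yaml.safe_load(handle)

@pytest.mark.parametrize("case", [c for c in TEST_CASES if c["type"] == "alias_validity"])
def test_alias_validity(case):
    args = case["args"]
    if args["is_valid"]:
        YamlAliasConfig(config_path=args["file"]).generate()
    else:
        with pytest.raises((TypeError, ValueError)):
            YamlAliasConfig(config_path=args["file"]).generate()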