Skip to content

Commit

Permalink
0.93.14
Browse files Browse the repository at this point in the history
  • Loading branch information
FBurkhardt committed Jan 29, 2025
1 parent 1d1f8eb commit 1c179ad
Show file tree
Hide file tree
Showing 6 changed files with 48 additions and 17 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
Changelog
=========

Version 0.93.14 (24-01-27)
--------------------------
* print Cohen's d results to store

Version 0.93.13 (24-01-27)
--------------------------
* plot: replaced "class_label" by target name for continuous distributions
Expand Down
2 changes: 1 addition & 1 deletion nkululeko/constants.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
VERSION="0.93.13"
VERSION="0.93.14"
SAMPLING_RATE = 16000
9 changes: 5 additions & 4 deletions nkululeko/data/dataset_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,20 @@
import os
import os.path

import audformat.utils
import pandas as pd

import nkululeko.glob_conf as glob_conf
import audformat.utils

from nkululeko.data.dataset import Dataset
import nkululeko.glob_conf as glob_conf
from nkululeko.reporting.report_item import ReportItem


class Dataset_CSV(Dataset):
"""Class to represent datasets stored as a csv file"""
"""Class to represent datasets stored as a csv file."""

def load(self):
"""Load the dataframe with files, speakers and task labels"""
"""Load the dataframe with files, speakers and task labels."""
self.util.debug(f"loading {self.name}")
self.got_target, self.got_speaker, self.got_gender = False, False, False
data_file = self.util.config_val_data(self.name, "", "")
Expand Down
15 changes: 12 additions & 3 deletions nkululeko/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@ def __init__(self):
self.format = self.util.config_val("PLOT", "format", "png")
self.target = self.util.config_val("DATA", "target", "emotion")
self.with_ccc = eval(self.util.config_val("PLOT", "ccc", "False"))
self.type_s = "samples"

def plot_distributions_speaker(self, df):
self.type_s = "speaker"
df_speakers = pd.DataFrame()
pd.options.mode.chained_assignment = None # default='warn'
for s in df.speaker.unique():
Expand Down Expand Up @@ -301,11 +303,18 @@ def plotcatcont(self, df, cat_col, cont_col, xlab, ylab):
plot_df = plot_df.rename(columns={cont_col: self.target})
cont_col = self.target
dist_type = self.util.config_val("EXPL", "dist_type", "kde")
cats, cat_str, es = su.get_effect_size(plot_df, cat_col, cont_col)
max_cat, cat_str, effect_results = su.get_effect_size(
plot_df, cat_col, cont_col
)
self.util.debug(effect_results)
self.util.print_results_to_store(
f"cohens-d_{self.type_s}", str(effect_results) + "\n"
)
es = effect_results[max_cat]
model_type = self.util.get_model_type()
if dist_type == "hist" and model_type != "tree":
ax = sns.histplot(plot_df, x=cont_col, hue=cat_col, kde=True)
caption = f"{ylab} {plot_df.shape[0]}. {cat_str} ({cats}):" f" {es}"
caption = f"{ylab} {plot_df.shape[0]}. {cat_str} ({max_cat}):" f" {es}"
ax.set_title(caption)
ax.set_xlabel(f"{cont_col}")
ax.set_ylabel(f"number of {ylab}")
Expand All @@ -319,7 +328,7 @@ def plotcatcont(self, df, cat_col, cont_col, xlab, ylab):
warn_singular=False,
)
ax.set(xlabel=f"{cont_col}")
caption = f"{ylab} {plot_df.shape[0]}. {cat_str} ({cats}):" f" {es}"
caption = f"{ylab} {plot_df.shape[0]}. {cat_str} ({max_cat}):" f" {es}"
ax.figure.suptitle(caption)
return ax, caption

Expand Down
20 changes: 11 additions & 9 deletions nkululeko/utils/stats.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import math
from itertools import combinations
import math

import numpy as np
import pandas as pd


def check_na(a):
Expand All @@ -14,9 +15,8 @@ def check_na(a):
return a


def cohen_d(d1, d2):
"""
Compute Cohen's d from two distributions of real valued arrays.
def cohen_d(d1: np.array, d2: np.array) -> float:
"""Compute Cohen's d from two distributions of real valued arrays.
Args:
d1: one array
Expand Down Expand Up @@ -50,7 +50,9 @@ def all_combinations(items_list):
return result


def get_effect_size(df, target, variable):
def get_effect_size(
df: pd.DataFrame, target: str, variable: str
) -> tuple[str, str, dict]:
"""Get the effect size as Cohen's D.
Effect size is computed from a real numbered variable on a categorical target.
Expand All @@ -68,21 +70,21 @@ def get_effect_size(df, target, variable):
for c in categories:
cats[c] = df[df[target] == c][variable].values
combos = all_combinations(categories)
results = {}
results = {categories[0]: 0}
if len(categories) == 1:
cat_s = cohens_D_to_string(0)
return categories[0], cat_s, 0
return categories[0], cat_s, results
else:
for combo in combos:
one = combo[0]
other = combo[1]
results[f"{one}-{other}"] = cohen_d(cats[one], cats[other])
max_cat = max(results, key=results.get)
cat_s = cohens_D_to_string(float(results[max_cat]))
return max_cat, cat_s, results[max_cat]
return max_cat, cat_s, results


def cohens_D_to_string(val):
def cohens_D_to_string(val: float) -> str:
if val < 0.2:
rval = "no effect"
elif val < 0.2:
Expand Down
15 changes: 15 additions & 0 deletions nkululeko/utils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,21 @@ def get_pred_name(self):
pred_name = self.get_model_description()
return f"{results_dir}/pred_{target}_{pred_name}.csv"

def print_results_to_store(self, name: str, contents: str) -> str:
    """Append contents to a result file in the results directory.

    The file is named ``{name}_{model description}.txt`` and is opened in
    append mode, so repeated calls with the same name accumulate output
    instead of overwriting it.

    Args:
        name (str): the (sub) name of the file
        contents (str): the text to append to the file

    Returns:
        str: The path to the file
    """
    results_dir = self.get_path("res_dir")
    pred_name = self.get_model_description()
    path = os.path.join(results_dir, f"{name}_{pred_name}.txt")
    # Append mode keeps whatever earlier calls already wrote to this file.
    with open(path, "a") as f:
        f.write(contents)
    # The signature and docstring promise the path; return it to the caller.
    return path

def is_categorical(self, pd_series):
"""Check if a dataframe column is categorical."""
return pd_series.dtype.name == "object" or isinstance(
Expand Down

0 comments on commit 1c179ad

Please sign in to comment.