0.93.14

felixbur · Jan 29, 2025 · 1c179ad · 1c179ad
1 parent 1d1f8eb
commit 1c179ad
Show file tree

Hide file tree

Showing 6 changed files with 48 additions and 17 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,10 @@
 Changelog
 =========
 
+Version 0.93.14 (24-01-27)
+--------------------------
+* print Cohen's d results to store
+
 Version 0.93.13 (24-01-27)
 --------------------------
 * plot: replaced "class_label" by target name for continuous distributions

diff --git a/nkululeko/constants.py b/nkululeko/constants.py
@@ -1,2 +1,2 @@
-VERSION="0.93.13"
+VERSION="0.93.14"
 SAMPLING_RATE = 16000
diff --git a/nkululeko/data/dataset_csv.py b/nkululeko/data/dataset_csv.py
@@ -3,19 +3,20 @@
 import os
 import os.path
 
-import audformat.utils
 import pandas as pd
 
-import nkululeko.glob_conf as glob_conf
+import audformat.utils
+
 from nkululeko.data.dataset import Dataset
+import nkululeko.glob_conf as glob_conf
 from nkululeko.reporting.report_item import ReportItem
 
 
 class Dataset_CSV(Dataset):
-    """Class to represent datasets stored as a csv file"""
+    """Class to represent datasets stored as a csv file."""
 
     def load(self):
-        """Load the dataframe with files, speakers and task labels"""
+        """Load the dataframe with files, speakers and task labels."""
         self.util.debug(f"loading {self.name}")
         self.got_target, self.got_speaker, self.got_gender = False, False, False
         data_file = self.util.config_val_data(self.name, "", "")

diff --git a/nkululeko/plots.py b/nkululeko/plots.py
@@ -24,8 +24,10 @@ def __init__(self):
         self.format = self.util.config_val("PLOT", "format", "png")
         self.target = self.util.config_val("DATA", "target", "emotion")
         self.with_ccc = eval(self.util.config_val("PLOT", "ccc", "False"))
+        self.type_s = "samples"
 
     def plot_distributions_speaker(self, df):
+        self.type_s = "speaker"
         df_speakers = pd.DataFrame()
         pd.options.mode.chained_assignment = None  # default='warn'
         for s in df.speaker.unique():
@@ -301,11 +303,18 @@ def plotcatcont(self, df, cat_col, cont_col, xlab, ylab):
             plot_df = plot_df.rename(columns={cont_col: self.target})
             cont_col = self.target
         dist_type = self.util.config_val("EXPL", "dist_type", "kde")
-        cats, cat_str, es = su.get_effect_size(plot_df, cat_col, cont_col)
+        max_cat, cat_str, effect_results = su.get_effect_size(
+            plot_df, cat_col, cont_col
+        )
+        self.util.debug(effect_results)
+        self.util.print_results_to_store(
+            f"cohens-d_{self.type_s}", str(effect_results) + "\n"
+        )
+        es = effect_results[max_cat]
         model_type = self.util.get_model_type()
         if dist_type == "hist" and model_type != "tree":
             ax = sns.histplot(plot_df, x=cont_col, hue=cat_col, kde=True)
-            caption = f"{ylab} {plot_df.shape[0]}. {cat_str} ({cats}):" f" {es}"
+            caption = f"{ylab} {plot_df.shape[0]}. {cat_str} ({max_cat}):" f" {es}"
             ax.set_title(caption)
             ax.set_xlabel(f"{cont_col}")
             ax.set_ylabel(f"number of {ylab}")
@@ -319,7 +328,7 @@ def plotcatcont(self, df, cat_col, cont_col, xlab, ylab):
                 warn_singular=False,
             )
             ax.set(xlabel=f"{cont_col}")
-            caption = f"{ylab} {plot_df.shape[0]}. {cat_str} ({cats}):" f" {es}"
+            caption = f"{ylab} {plot_df.shape[0]}. {cat_str} ({max_cat}):" f" {es}"
             ax.figure.suptitle(caption)
         return ax, caption
 

diff --git a/nkululeko/utils/stats.py b/nkululeko/utils/stats.py
@@ -1,7 +1,8 @@
-import math
 from itertools import combinations
+import math
 
 import numpy as np
+import pandas as pd
 
 
 def check_na(a):
@@ -14,9 +15,8 @@ def check_na(a):
         return a
 
 
-def cohen_d(d1, d2):
-    """
-    Compute Cohen's d from two distributions of real valued arrays.
+def cohen_d(d1: np.array, d2: np.array) -> float:
+    """Compute Cohen's d from two distributions of real valued arrays.
 
     Args:
         d1: one array
@@ -50,7 +50,9 @@ def all_combinations(items_list):
     return result
 
 
-def get_effect_size(df, target, variable):
+def get_effect_size(
+    df: pd.DataFrame, target: str, variable: str
+) -> tuple[str, str, dict]:
     """Get the effect size as Cohen's D.
 
     Effect size is computed  from a real numbered variable on a categorical target.
@@ -68,21 +70,21 @@ def get_effect_size(df, target, variable):
     for c in categories:
         cats[c] = df[df[target] == c][variable].values
     combos = all_combinations(categories)
-    results = {}
+    results = {categories[0]: 0}
     if len(categories) == 1:
         cat_s = cohens_D_to_string(0)
-        return categories[0], cat_s, 0
+        return categories[0], cat_s, results
     else:
         for combo in combos:
             one = combo[0]
             other = combo[1]
             results[f"{one}-{other}"] = cohen_d(cats[one], cats[other])
         max_cat = max(results, key=results.get)
         cat_s = cohens_D_to_string(float(results[max_cat]))
-    return max_cat, cat_s, results[max_cat]
+    return max_cat, cat_s, results
 
 
-def cohens_D_to_string(val):
+def cohens_D_to_string(val: float) -> str:
     if val < 0.2:
         rval = "no effect"
     elif val < 0.2:

diff --git a/nkululeko/utils/util.py b/nkululeko/utils/util.py
@@ -160,6 +160,21 @@ def get_pred_name(self):
         pred_name = self.get_model_description()
         return f"{results_dir}/pred_{target}_{pred_name}.csv"
 
+    def print_results_to_store(self, name: str, contents: str) -> str:
+        """Append contents to a result file.
+
+        Args:
+            name (str): the (sub) name of the file
+            contents (str): the text appended to the file
+
+        NOTE(review): annotated -> str, but no value is returned yet.
+        """
+        results_dir = self.get_path("res_dir")
+        pred_name = self.get_model_description()
+        path = os.path.join(results_dir, f"{name}_{pred_name}.txt")
+        with open(path, "a") as f:  # append mode: repeated calls accumulate
+            f.write(contents)
+
     def is_categorical(self, pd_series):
         """Check if a dataframe column is categorical."""
         return pd_series.dtype.name == "object" or isinstance(