Small changes according to PR review

GispoCoding · Oct 26, 2023 · 20ac841 · 20ac841
1 parent 5df1336
commit 20ac841
Show file tree

Hide file tree

Showing 4 changed files with 86 additions and 49 deletions.
diff --git a/docs/statistical_analyses/statistical_testing.md b/docs/statistical_analyses/statistical_testing.md
@@ -1,3 +1,3 @@
-# Statistical testing
+# Statistical (hypothesis) testing
 
-::: eis_toolkit.statistical_analyses.statistical_testing
+::: eis_toolkit.statistical_analyses.statistical_tests
diff --git a/eis_toolkit/checks/dataframe.py b/eis_toolkit/checks/dataframe.py
@@ -32,3 +32,15 @@ def check_columns_numeric(df: pd.DataFrame, columns: Sequence[str]) -> bool:
     """
     columns_numeric = df.columns.select_dtypes(include="number").columns.to_list()
     return all(column in columns_numeric for column in columns)
+
+
+def check_empty_dataframe(df: pd.DataFrame) -> bool:
+    """Check if the dataframe is empty.
+
+    Args:
+        df: Dataframe to be checked.
+
+    Returns:
+        True if dataframe is empty, otherwise False.
+    """
+    return df.empty
diff --git a/eis_toolkit/statistical_analyses/statistical_tests.py b/eis_toolkit/statistical_analyses/statistical_tests.py
@@ -1,70 +1,78 @@
 import pandas as pd
 from beartype import beartype
-from beartype.typing import Literal, Optional, Sequence, Tuple
+from beartype.typing import Literal, Optional, Sequence
 from scipy.stats import chi2_contingency, shapiro
 
 from eis_toolkit import exceptions
+from eis_toolkit.checks.dataframe import check_columns_valid, check_empty_dataframe
 
 
 @beartype
-def check_empty_dataframe(data: pd.DataFrame):
-    """Check if the input dataframe is empty.
-
-    Args:
-        data: Input DataFrame
-
-    Raises:
-        EmptyDataFrameException: The input DataFrame is empty.
-    """
-    if data.empty:
-        raise exceptions.EmptyDataFrameException("The input DataFrame is empty.")
+def chi_square_test(data: pd.DataFrame, target_column: str, columns: Sequence[str] = None) -> dict:
+    """Compute Chi-square test for independence on the input data.
 
-
-@beartype
-def chi_square_test(data: pd.DataFrame, target_column: str) -> Sequence[Tuple[float, float, int]]:
-    """Compute Chi-square test for independence on categorical data.
+    The null hypothesis is that each tested variable is independent of the target variable. It is assumed that the
+    variables are categorical, i.e. strings, booleans or integers, but not floats.
 
     Args:
-        data: DataFrame containing the input data.
+        data: Dataframe containing the input data
         target_column: Variable against which independence of other variables is tested.
+        columns: Variables that are tested against the variable in target_column. If None, every column is used.
 
     Raises:
-        InvalidParameterValueException: The target_column is not in input DataFrame.
+        EmptyDataFrameException: The input Dataframe is empty.
+        InvalidParameterValueException: The target_column is not in input Dataframe or invalid column is provided.
 
     Returns:
         Test statistics for each variable (except target_column).
     """
-    check_empty_dataframe(data)
-
-    if target_column not in data.columns:
-        raise exceptions.InvalidParameterValueException("Target column not found in the DataFrame.")
-
-    statistics = []
-    for column in data.columns:
+    if check_empty_dataframe(data):
+        raise exceptions.EmptyDataFrameException("The input Dataframe is empty.")
+
+    if not check_columns_valid(data, target_column):
+        raise exceptions.InvalidParameterValueException("Target column not found in the Dataframe.")
+
+    if columns is not None:
+        invalid_columns = [column for column in columns if column not in data.columns]
+        if any(invalid_columns):
+            raise exceptions.InvalidParameterValueException(
+                f"The following variables are not in the dataframe: {invalid_columns}"
+            )
+    else:
+        columns = data.columns
+
+    statistics = {}
+    for column in columns:
         if column != target_column:
             contingency_table = pd.crosstab(data[target_column], data[column])
             chi_square, p_value, degrees_of_freedom, _ = chi2_contingency(contingency_table)
-            statistics.append((chi_square, p_value, degrees_of_freedom))
+            statistics[column] = (chi_square, p_value, degrees_of_freedom)
 
     return statistics
 
 
 @beartype
-def normality_test(data: pd.DataFrame) -> Sequence[Tuple[float, float]]:
-    """Compute Shapiro-Wilk test for normality on numeric input data.
+def normality_test(data: pd.DataFrame) -> dict:
+    """Compute Shapiro-Wilk test for normality on the input data.
+
+    Null hypothesis: the data is normally distributed. It is assumed that the data is numeric, i.e. integers or floats.
 
     Args:
-        data: DataFrame containing the input data.
+        data: Dataframe containing the input data.
 
     Returns:
         Test statistics for each variable.
+
+    Raises:
+        EmptyDataFrameException: The input Dataframe is empty.
     """
-    check_empty_dataframe(data)
+    if check_empty_dataframe(data):
+        raise exceptions.EmptyDataFrameException("The input Dataframe is empty.")
 
-    statistics = []
+    statistics = {}
     for column in data.columns:
         statistic, p_value = shapiro(data[column])
-        statistics.append((statistic, p_value))
+        statistics[column] = (statistic, p_value)
 
     return statistics
 
@@ -75,49 +83,61 @@ def correlation_matrix(
     correlation_method: Literal["pearson", "kendall", "spearman"] = "pearson",
     min_periods: Optional[int] = None,
 ) -> pd.DataFrame:
-    """Compute correlation matrix on numeric input data.
+    """Compute correlation matrix on the input data.
+
+    It is assumed that the data is numeric, i.e. integers or floats.
 
     Args:
-        data: DataFrame containing the input data.
+        data: Dataframe containing the input data.
         correlation_method: 'pearson', 'kendall', or 'spearman'. Defaults to 'pearson'.
         min_periods: Minimum number of observations required per pair of columns to have valid result. Optional.
 
     Raises:
+        EmptyDataFrameException: The input Dataframe is empty.
         InvalidParameterValueException: min_periods argument is used with method 'kendall'.
 
     Returns:
-        Correlation matrix
+        Dataframe containing the correlation matrix
     """
-    check_empty_dataframe(data)
+    if check_empty_dataframe(data):
+        raise exceptions.EmptyDataFrameException("The input Dataframe is empty.")
 
     if correlation_method == "kendall" and min_periods is not None:
         raise exceptions.InvalidParameterValueException(
             "The argument min_periods is available only with correlation methods 'pearson' and 'spearman'."
         )
 
-    return data.corr(method=correlation_method, min_periods=min_periods)
+    matrix = data.corr(method=correlation_method, min_periods=min_periods)
+
+    return matrix
 
 
 @beartype
 def covariance_matrix(
     data: pd.DataFrame, min_periods: Optional[int] = None, delta_degrees_of_freedom: int = 1
 ) -> pd.DataFrame:
-    """Compute covariance matrix on numeric input data.
+    """Compute covariance matrix on the input data.
+
+    It is assumed that the data is numeric, i.e. integers or floats.
 
     Args:
-        data: DataFrame containing the input data.
+        data: Dataframe containing the input data.
         min_periods: Minimum number of observations required per pair of columns to have valid result. Optional.
         delta_degrees_of_freedom: Delta degrees of freedom used for computing covariance matrix. Defaults to 1.
 
     Raises:
+        EmptyDataFrameException: The input Dataframe is empty.
         InvalidParameterValueException: Provided value for delta_degrees_of_freedom is negative.
 
     Returns:
-        Covariance matrix
+        Dataframe containing the covariance matrix
     """
-    check_empty_dataframe(data)
+    if check_empty_dataframe(data):
+        raise exceptions.EmptyDataFrameException("The input Dataframe is empty.")
 
     if delta_degrees_of_freedom < 0:
         raise exceptions.InvalidParameterValueException("Delta degrees of freedom must be non-negative.")
 
-    return data.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom)
+    matrix = data.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom)
+
+    return matrix
diff --git a/tests/statistical_analyses/statistical_tests_test.py b/tests/statistical_analyses/statistical_tests_test.py
@@ -5,7 +5,6 @@
 
 from eis_toolkit import exceptions
 from eis_toolkit.statistical_analyses.statistical_tests import (
-    check_empty_dataframe,
     chi_square_test,
     correlation_matrix,
     covariance_matrix,
@@ -20,14 +19,14 @@
 
 def test_chi_square_test():
     """Test that returned statistics for independence are correct."""
-    output_statistics = chi_square_test(data=categorical_data, target_column=target_column)
-    np.testing.assert_array_equal((output_statistics[0]), (0.0, 1.0, 1))
+    output_statistics = chi_square_test(data=categorical_data, target_column=target_column, columns=("f"))
+    np.testing.assert_array_equal((output_statistics["f"]), (0.0, 1.0, 1))
 
 
 def test_normality_test():
     """Test that returned statistics for normality are correct."""
     output_statistics = normality_test(data=numeric_data)
-    np.testing.assert_array_almost_equal(output_statistics[0], (0.72863, 0.02386), decimal=5)
+    np.testing.assert_array_almost_equal(output_statistics["a"], (0.72863, 0.02386), decimal=5)
 
 
 def test_correlation_matrix():
@@ -62,7 +61,13 @@ def test_empty_df():
     """Test that empty DataFrame raises the correct exception."""
     empty_df = pd.DataFrame()
     with pytest.raises(exceptions.EmptyDataFrameException):
-        check_empty_dataframe(data=empty_df)
+        normality_test(data=empty_df)
+
+
+def test_invalid_columns():
+    """Test that an invalid column name in columns raises the correct exception."""
+    with pytest.raises(exceptions.InvalidParameterValueException):
+        chi_square_test(data=categorical_data, target_column=target_column, columns=["f", "x"])
 
 
 def test_invalid_target_column():