Skip to content

Commit

Permalink
Small changes according to PR review
Browse files Browse the repository at this point in the history
  • Loading branch information
msorvoja committed Oct 26, 2023
1 parent 5df1336 commit 20ac841
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 49 deletions.
4 changes: 2 additions & 2 deletions docs/statistical_analyses/statistical_testing.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# Statistical testing
# Statistical (hypothesis) testing

::: eis_toolkit.statistical_analyses.statistical_testing
::: eis_toolkit.statistical_analyses.statistical_tests
12 changes: 12 additions & 0 deletions eis_toolkit/checks/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,15 @@ def check_columns_numeric(df: pd.DataFrame, columns: Sequence[str]) -> bool:
"""
columns_numeric = df.columns.select_dtypes(include="number").columns.to_list()
return all(column in columns_numeric for column in columns)


def check_empty_dataframe(df: pd.DataFrame) -> bool:
"""Check if the dataframe is empty.
Args:
df: Dataframe to be checked.
Return:
True if dataframe is empty, otherwise False.
"""
return df.empty
104 changes: 62 additions & 42 deletions eis_toolkit/statistical_analyses/statistical_tests.py
Original file line number Diff line number Diff line change
@@ -1,70 +1,78 @@
import pandas as pd
from beartype import beartype
from beartype.typing import Literal, Optional, Sequence, Tuple
from beartype.typing import Literal, Optional, Sequence
from scipy.stats import chi2_contingency, shapiro

from eis_toolkit import exceptions
from eis_toolkit.checks.dataframe import check_columns_valid, check_empty_dataframe


@beartype
def check_empty_dataframe(data: pd.DataFrame):
"""Check if the input dataframe is empty.
Args:
data: Input DataFrame
Raises:
EmptyDataFrameException: The input DataFrame is empty.
"""
if data.empty:
raise exceptions.EmptyDataFrameException("The input DataFrame is empty.")
def chi_square_test(data: pd.DataFrame, target_column: str, columns: Sequence[str] = None) -> dict:
"""Compute Chi-square test for independence on the input data.

@beartype
def chi_square_test(data: pd.DataFrame, target_column: str) -> Sequence[Tuple[float, float, int]]:
"""Compute Chi-square test for independence on categorical data.
It is assumed that the variables in the input data are independent and that they are categorical, i.e. strings,
booleans or integers, but not floats.
Args:
data: DataFrame containing the input data.
data: Dataframe containing the input data
target_column: Variable against which independence of other variables is tested.
columns: Variables that are tested against the variable in target_column. If None, every column is used.
Raises:
InvalidParameterValueException: The target_column is not in input DataFrame.
EmptyDataFrameException: The input Dataframe is empty.
InvalidParameterValueException: The target_column is not in input Dataframe or invalid column is provided.
Returns:
Test statistics for each variable (except target_column).
"""
check_empty_dataframe(data)

if target_column not in data.columns:
raise exceptions.InvalidParameterValueException("Target column not found in the DataFrame.")

statistics = []
for column in data.columns:
if check_empty_dataframe(data):
raise exceptions.EmptyDataFrameException("The input Dataframe is empty.")

if not check_columns_valid(data, target_column):
raise exceptions.InvalidParameterValueException("Target column not found in the Dataframe.")

if columns is not None:
invalid_columns = [column for column in columns if column not in data.columns]
if any(invalid_columns):
raise exceptions.InvalidParameterValueException(
f"The following variables are not in the dataframe: {invalid_columns}"
)
else:
columns = data.columns

statistics = {}
for column in columns:
if column != target_column:
contingency_table = pd.crosstab(data[target_column], data[column])
chi_square, p_value, degrees_of_freedom, _ = chi2_contingency(contingency_table)
statistics.append((chi_square, p_value, degrees_of_freedom))
statistics[column] = (chi_square, p_value, degrees_of_freedom)

return statistics


@beartype
def normality_test(data: pd.DataFrame) -> Sequence[Tuple[float, float]]:
"""Compute Shapiro-Wilk test for normality on numeric input data.
def normality_test(data: pd.DataFrame) -> dict:
"""Compute Shapiro-Wilk test for normality on the input data.
It is assumed that the input data is normally distributed and numeric, i.e. integers or floats.
Args:
data: DataFrame containing the input data.
data: Dataframe containing the input data.
Returns:
Test statistics for each variable.
Raises:
EmptyDataFrameException: The input Dataframe is empty.
"""
check_empty_dataframe(data)
if check_empty_dataframe(data):
raise exceptions.EmptyDataFrameException("The input Dataframe is empty.")

statistics = []
statistics = {}
for column in data.columns:
statistic, p_value = shapiro(data[column])
statistics.append((statistic, p_value))
statistics[column] = (statistic, p_value)

return statistics

Expand All @@ -75,49 +83,61 @@ def correlation_matrix(
correlation_method: Literal["pearson", "kendall", "spearman"] = "pearson",
min_periods: Optional[int] = None,
) -> pd.DataFrame:
"""Compute correlation matrix on numeric input data.
"""Compute correlation matrix on the input data.
It is assumed that the data is numeric, i.e. integers or floats.
Args:
data: DataFrame containing the input data.
data: Dataframe containing the input data.
correlation_method: 'pearson', 'kendall', or 'spearman'. Defaults to 'pearson'.
min_periods: Minimum number of observations required per pair of columns to have valid result. Optional.
Raises:
EmptyDataFrameException: The input Dataframe is empty.
InvalidParameterValueException: min_periods argument is used with method 'kendall'.
Returns:
Correlation matrix
Dataframe containing the correlation matrix
"""
check_empty_dataframe(data)
if check_empty_dataframe(data):
raise exceptions.EmptyDataFrameException("The input Dataframe is empty.")

if correlation_method == "kendall" and min_periods is not None:
raise exceptions.InvalidParameterValueException(
"The argument min_periods is available only with correlation methods 'pearson' and 'spearman'."
)

return data.corr(method=correlation_method, min_periods=min_periods)
matrix = data.corr(method=correlation_method, min_periods=min_periods)

return matrix


@beartype
def covariance_matrix(
data: pd.DataFrame, min_periods: Optional[int] = None, delta_degrees_of_freedom: int = 1
) -> pd.DataFrame:
"""Compute covariance matrix on numeric input data.
"""Compute covariance matrix on the input data.
It is assumed that the data is numeric, i.e. integers or floats.
Args:
data: DataFrame containing the input data.
data: Dataframe containing the input data.
min_periods: Minimum number of observations required per pair of columns to have valid result. Optional.
delta_degrees_of_freedom: Delta degrees of freedom used for computing covariance matrix. Defaults to 1.
Raises:
InvalidParameterValueException: Provided value for delta_degrees_of_freedom is negative.
Returns:
Covariance matrix
EmptyDataFrameException: The input Dataframe is empty.
Dataframe containing the covariance matrix
"""
check_empty_dataframe(data)
if check_empty_dataframe(data):
raise exceptions.EmptyDataFrameException("The input Dataframe is empty.")

if delta_degrees_of_freedom < 0:
raise exceptions.InvalidParameterValueException("Delta degrees of freedom must be non-negative.")

return data.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom)
matrix = data.cov(min_periods=min_periods, ddof=delta_degrees_of_freedom)

return matrix
15 changes: 10 additions & 5 deletions tests/statistical_analyses/statistical_tests_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

from eis_toolkit import exceptions
from eis_toolkit.statistical_analyses.statistical_tests import (
check_empty_dataframe,
chi_square_test,
correlation_matrix,
covariance_matrix,
Expand All @@ -20,14 +19,14 @@

def test_chi_square_test():
"""Test that returned statistics for independence are correct."""
output_statistics = chi_square_test(data=categorical_data, target_column=target_column)
np.testing.assert_array_equal((output_statistics[0]), (0.0, 1.0, 1))
output_statistics = chi_square_test(data=categorical_data, target_column=target_column, columns=("f"))
np.testing.assert_array_equal((output_statistics["f"]), (0.0, 1.0, 1))


def test_normality_test():
"""Test that returned statistics for normality are correct."""
output_statistics = normality_test(data=numeric_data)
np.testing.assert_array_almost_equal(output_statistics[0], (0.72863, 0.02386), decimal=5)
np.testing.assert_array_almost_equal(output_statistics["a"], (0.72863, 0.02386), decimal=5)


def test_correlation_matrix():
Expand Down Expand Up @@ -62,7 +61,13 @@ def test_empty_df():
"""Test that empty DataFrame raises the correct exception."""
empty_df = pd.DataFrame()
with pytest.raises(exceptions.EmptyDataFrameException):
check_empty_dataframe(data=empty_df)
normality_test(data=empty_df)


def test_invalid_columns():
"""Test that invalid column name in raises the correct exception."""
with pytest.raises(exceptions.InvalidParameterValueException):
chi_square_test(data=categorical_data, target_column=target_column, columns=["f", "x"])


def test_invalid_target_column():
Expand Down

0 comments on commit 20ac841

Please sign in to comment.