Commit
reimplemented multiple tests, improved type hints, linted and formatted code with ruff
maximtrp committed Oct 20, 2024
1 parent c4da3ca commit d018ba3
Showing 5 changed files with 1,492 additions and 814 deletions.
2 changes: 2 additions & 0 deletions README.rst
@@ -21,6 +21,8 @@
 .. image:: https://img.shields.io/conda/vn/conda-forge/scikit-posthocs.svg
    :target: https://anaconda.org/conda-forge/scikit-posthocs
 
+===============
+
 **scikit-posthocs** is a Python package that provides post hoc tests for
 pairwise multiple comparisons that are usually performed in statistical
 data analysis to assess the differences between group levels if a statistically
43 changes: 32 additions & 11 deletions scikit_posthocs/__init__.py
@@ -1,22 +1,43 @@
-__version__ = '0.9.1'
+__version__ = "0.10.0"
 
 from scikit_posthocs._global import global_simes_test, global_f_test
 from scikit_posthocs._omnibus import test_osrt, test_durbin, test_mackwolfe
 
 from scikit_posthocs._posthocs import (
-    posthoc_anderson, posthoc_conover, posthoc_conover_friedman,
-    posthoc_dscf, posthoc_dunn, posthoc_durbin,
-    posthoc_mannwhitney, posthoc_miller_friedman, posthoc_nemenyi,
-    posthoc_nemenyi_friedman, posthoc_npm_test, posthoc_quade,
-    posthoc_scheffe, posthoc_siegel_friedman, posthoc_tamhane,
-    posthoc_ttest, posthoc_tukey, posthoc_tukey_hsd,
-    posthoc_vanwaerden, posthoc_wilcoxon, posthoc_dunnett,
-    __convert_to_df, __convert_to_block_df,
+    posthoc_anderson,
+    posthoc_conover,
+    posthoc_conover_friedman,
+    posthoc_dscf,
+    posthoc_dunn,
+    posthoc_durbin,
+    posthoc_mannwhitney,
+    posthoc_miller_friedman,
+    posthoc_nemenyi,
+    posthoc_nemenyi_friedman,
+    posthoc_npm_test,
+    posthoc_quade,
+    posthoc_scheffe,
+    posthoc_siegel_friedman,
+    posthoc_tamhane,
+    posthoc_ttest,
+    posthoc_tukey,
+    posthoc_tukey_hsd,
+    posthoc_vanwaerden,
+    posthoc_wilcoxon,
+    posthoc_dunnett,
+    __convert_to_df,
+    __convert_to_block_df,
 )
 
 from scikit_posthocs._plotting import (
-    sign_array, sign_plot, sign_table, critical_difference_diagram,
+    sign_array,
+    sign_plot,
+    sign_table,
+    critical_difference_diagram,
 )
 from scikit_posthocs._outliers import (
-    outliers_gesd, outliers_grubbs, outliers_iqr, outliers_tietjen,
+    outliers_gesd,
+    outliers_grubbs,
+    outliers_iqr,
+    outliers_tietjen,
 )
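Note: besides the version bump to 0.10.0, the import block above is purely reformatted (one name per line); the public API is unchanged. A minimal usage sketch against the flat namespace (the data here are illustrative, not from the commit):

import scikit_posthocs as sp

# Three groups of observations; the exported post hoc tests accept this shape.
x = [[22, 23, 35], [60, 59, 54], [98, 78, 50]]
p_values = sp.posthoc_dunn(x)  # symmetric pandas DataFrame of pairwise p-values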
147 changes: 81 additions & 66 deletions scikit_posthocs/_omnibus.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-from typing import Union, List, Tuple
+from typing import Optional, Union, List, Tuple, cast
 import itertools as it
 import numpy as np
 import scipy.stats as ss
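A side note on this typing change (my gloss, not part of the diff): PEP 484 deprecates implicit Optional, so a parameter defaulting to None must be declared Optional explicitly, and cast() is used further down to narrow a groupby result for the type checker. A minimal sketch of both idioms:

from typing import Optional, cast

def pick(col: Optional[str] = None) -> str:
    # Explicit Optional: strict type checkers reject `col: str = None`.
    return col if col is not None else "default"

value: object = "narrow me"
s = cast(str, value)  # cast() is a no-op at runtime; it only informs the checker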
@@ -10,14 +10,14 @@
 
 
 def test_mackwolfe(
-        data: Union[List, np.ndarray, DataFrame],
-        val_col: str = None,
-        group_col: str = None,
-        p: int = None,
-        n_perm: int = 100,
-        sort: bool = False) -> Tuple[float, float]:
-
-    '''Mack-Wolfe Test for Umbrella Alternatives.
+    data: Union[List, np.ndarray, DataFrame],
+    val_col: Optional[str] = None,
+    group_col: Optional[str] = None,
+    p: Optional[int] = None,
+    n_perm: int = 100,
+    sort: bool = False,
+) -> Tuple[float, float]:
+    """Mack-Wolfe Test for Umbrella Alternatives.
 
     In dose-finding studies one may assume an increasing treatment effect with
     increasing dose level. However, the test subject may actually succumb to
@@ -71,25 +71,26 @@ def test_mackwolfe(
     --------
     >>> x = [[22, 23, 35], [60, 59, 54], [98, 78, 50], [60, 82, 59], [22, 44, 33], [23, 21, 25]]
     >>> sp.test_mackwolfe(x)
-    '''
+    """
     x, _val_col, _group_col = __convert_to_df(data, val_col, group_col)
 
     if not sort:
         x[_group_col] = Categorical(
-            x[_group_col], categories=x[_group_col].unique(), ordered=True)
+            x[_group_col], categories=x[_group_col].unique(), ordered=True
+        )
         x.sort_values(by=[_group_col], ascending=True, inplace=True)
 
     k = x[_group_col].unique().size
 
     if p and p > k:
         print("Selected 'p' > number of groups:", str(p), " > ", str(k))
-        return False
+        return (np.nan, np.nan)
     elif p is not None and p < 1:
         print("Selected 'p' < 1: ", str(p))
-        return False
+        return (np.nan, np.nan)
 
     Rij = x[_val_col].rank()
-    n = x.groupby(_group_col)[_val_col].count()
+    n = cast(Series, x.groupby(_group_col)[_val_col].count())
 
     def _fn(Ri, Rj):
         return np.sum(Ri.apply(lambda x: Rj[Rj > x].size))
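One behavioral fix in the hunk above: an invalid peak index p used to return a bare False, breaking the annotated Tuple[float, float] contract; it now returns (nan, nan). A sketch of what a caller sees after this commit (data follow the docstring example):

import scikit_posthocs as sp

x = [[22, 23, 35], [60, 59, 54], [98, 78, 50], [60, 82, 59], [22, 44, 33], [23, 21, 25]]
p_value, stat = sp.test_mackwolfe(x, p=10)  # p exceeds the number of groups
# A warning is printed and both values are nan; tuple unpacking no longer fails.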
@@ -100,44 +101,53 @@ def _ustat(Rij, g, k):
 
         for i in range(k):
             for j in range(i):
-                U[i, j] = _fn(Rij[x[_group_col] == levels[i]], Rij[x[_group_col] == levels[j]])
-                U[j, i] = _fn(Rij[x[_group_col] == levels[j]], Rij[x[_group_col] == levels[i]])
+                U[i, j] = _fn(
+                    Rij[x[_group_col] == levels[i]], Rij[x[_group_col] == levels[j]]
+                )
+                U[j, i] = _fn(
+                    Rij[x[_group_col] == levels[j]], Rij[x[_group_col] == levels[i]]
+                )
 
         return U
 
-    def _ap(p, U):
-        tmp1 = 0.
+    def _ap(p, U) -> float:
+        tmp1 = 0.0
         if p > 0:
             for i in range(p):
-                for j in range(i+1, p+1):
+                for j in range(i + 1, p + 1):
                     tmp1 += U[i, j]
-        tmp2 = 0.
+        tmp2 = 0.0
         if p < k:
             for i in range(p, k):
-                for j in range(i+1, k):
+                for j in range(i + 1, k):
                     tmp2 += U[j, i]
 
         return tmp1 + tmp2
 
-    def _n1(p, n):
-        return np.sum(n[:p+1])
+    def _n1(p: int, n: Series) -> float:
+        return np.sum(n[: p + 1])
 
-    def _n2(p, n):
+    def _n2(p: int, n: Series) -> float:
         return np.sum(n[p:k])
 
-    def _mean_at(p, n):
+    def _mean_at(p, n) -> float:
         N1 = _n1(p, n)
         N2 = _n2(p, n)
-        return (N1**2. + N2**2. - np.sum(n**2.) - n.iloc[p]**2.)/4.
+        return (N1**2.0 + N2**2.0 - np.sum(n**2.0) - n.iloc[p] ** 2.0) / 4.0
 
-    def _var_at(p, n):
+    def _var_at(p: int, n: Series) -> float:
         N1 = _n1(p, n)
         N2 = _n2(p, n)
         N = np.sum(n)
 
-        var = (2. * (N1**3 + N2**3) + 3. * (N1**2 + N2**2) -
-               np.sum(n**2 * (2*n + 3.)) - n.iloc[p]**2. * (2. * n.iloc[p] + 3.) +
-               12. * n.iloc[p] * N1 * N2 - 12. * n.iloc[p] ** 2. * N) / 72.
+        var = (
+            2.0 * (N1**3 + N2**3)
+            + 3.0 * (N1**2 + N2**2)
+            - np.sum(n**2 * (2 * n + 3.0))
+            - n.iloc[p] ** 2.0 * (2.0 * n.iloc[p] + 3.0)
+            + 12.0 * n.iloc[p] * N1 * N2
+            - 12.0 * n.iloc[p] ** 2.0 * N
+        ) / 72.0
        return var
 
     if p:
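For readers of the reformatted helpers above: _fn counts, for each rank in one group, how many ranks in the other group exceed it, the Mann-Whitney-style count that fills the U matrix in _ustat. A standalone sketch of that counting with illustrative values:

import pandas as pd

Ri = pd.Series([1.0, 4.0])       # ranks in group i
Rj = pd.Series([2.0, 3.0, 5.0])  # ranks in group j
# Equivalent to _fn(Ri, Rj): for each rank in Ri, count the larger ranks in Rj.
U_ij = int(Ri.apply(lambda r: Rj[Rj > r].size).sum())  # (3 above 1) + (1 above 4) = 4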
@@ -147,37 +157,37 @@ def _var_at(p, n):
         est = _ap(p, U)
         mean = _mean_at(p, n)
         sd = np.sqrt(_var_at(p, n))
-        stat = (est - mean)/sd
-        p_value = ss.norm.sf(stat)
+        stat = (est - mean) / sd
+        p_value = ss.norm.sf(stat).item()
     else:
         U = _ustat(Rij, x[_group_col], k)
         Ap = np.array([_ap(i, U) for i in range(k)]).ravel()
         mean = np.array([_mean_at(i, n) for i in range(k)]).ravel()
         var = np.array([_var_at(i, n) for i in range(k)]).ravel()
         A = (Ap - mean) / np.sqrt(var)
-        stat = np.max(A)
+        stat = float(np.max(A))
 
         mt = []
         for _ in range(n_perm):
-
             ix = Series(np.random.permutation(Rij))
             uix = _ustat(ix, x[_group_col], k)
             apix = np.array([_ap(i, uix) for i in range(k)])
             astarix = (apix - mean) / np.sqrt(var)
             mt.append(np.max(astarix))
 
         mt = np.array(mt)
-        p_value = mt[mt > stat] / n_perm
+        p_value = mt[mt > stat].size / n_perm
 
     return p_value, stat
 
 
 def test_osrt(
-        data: Union[List, np.ndarray, DataFrame],
-        val_col: str = None,
-        group_col: str = None,
-        sort: bool = False) -> Tuple[float, float, int]:
-    '''Hayter's one-sided studentised range test (OSRT)
+    data: Union[List, np.ndarray, DataFrame],
+    val_col: Optional[str] = None,
+    group_col: Optional[str] = None,
+    sort: bool = False,
+) -> Tuple[float, float, int]:
+    """Hayter's one-sided studentised range test (OSRT)
 
     Tests a hypothesis against an ordered alternative for normal data with
     equal variances [1]_.
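The most substantive fix in the hunk above is the permutation p-value: the old expression divided the surviving values themselves by n_perm, yielding an array, whereas the proportion of permuted maxima exceeding the observed statistic is what is wanted. A standalone illustration:

import numpy as np

mt = np.array([1.2, 3.4, 0.7, 2.9])  # max statistics from 4 permutations
stat, n_perm = 2.5, 4
old = mt[mt > stat] / n_perm         # array([0.85, 0.725]) -- not a p-value
new = mt[mt > stat].size / n_perm    # 0.5: the exceedance proportion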
@@ -223,11 +233,13 @@ def test_osrt(
     >>> x = pd.DataFrame({"a": [1,2,3,5,1], "b": [12,31,54,62,12], "c": [10,12,6,74,11]})
     >>> x = x.melt(var_name='groups', value_name='values')
     >>> sp.test_osrt(x, val_col='values', group_col='groups')
-    '''
+    """
     x, _val_col, _group_col = __convert_to_df(data, val_col, group_col)
 
     if not sort:
-        x[_group_col] = Categorical(x[_group_col], categories=x[_group_col].unique(), ordered=True)
+        x[_group_col] = Categorical(
+            x[_group_col], categories=x[_group_col].unique(), ordered=True
+        )
 
     x.sort_values(by=[_group_col], ascending=True, inplace=True)
     groups = np.unique(x[_group_col])
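The ordered Categorical above (now merely rewrapped by ruff) is what lets sort=False preserve the groups' order of first appearance instead of sorting alphabetically. A quick sketch of the mechanism:

import pandas as pd

s = pd.Series(["b", "a", "c", "a"])
cat = pd.Categorical(s, categories=s.unique(), ordered=True)
# Categories are ['b', 'a', 'c'], so sorting follows that order, not alphabetical:
print(pd.Series(cat).sort_values().tolist())  # ['b', 'a', 'a', 'c']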
@@ -245,13 +257,13 @@ def test_osrt(
     for i in range(k):
         for j in range(ni.iloc[i]):
             c += 1
-            sigma2 += (x[_val_col].iat[c] - xi[i])**2. / df
+            sigma2 += (x[_val_col].iat[c] - xi[i]) ** 2.0 / df
 
     sigma = np.sqrt(sigma2)
 
     def compare(i, j):
         dif = xi.loc[groups[j]] - xi.loc[groups[i]]
-        A = sigma / np.sqrt(2.) * np.sqrt(1. / ni[groups[j]] + 1. / ni[groups[i]])
+        A = sigma / np.sqrt(2.0) * np.sqrt(1.0 / ni[groups[j]] + 1.0 / ni[groups[i]])
         qval = np.abs(dif) / A
         return qval
 
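Tying the OSRT hunks together, a usage sketch taken from the docstring example above; I am assuming the same (p-value, statistic, dof) return order that test_durbin uses below, matching the annotated Tuple[float, float, int]:

import pandas as pd
import scikit_posthocs as sp

x = pd.DataFrame({"a": [1, 2, 3, 5, 1], "b": [12, 31, 54, 62, 12], "c": [10, 12, 6, 74, 11]})
x = x.melt(var_name="groups", value_name="values")
p_value, stat, dof = sp.test_osrt(x, val_col="values", group_col="groups")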
@@ -267,14 +279,14 @@ def compare(i, j):
 
 
 def test_durbin(
-        data: Union[List, np.ndarray, DataFrame],
-        y_col: Union[str, int] = None,
-        block_col: Union[str, int] = None,
-        group_col: Union[str, int] = None,
-        melted: bool = False,
-        sort: bool = True) -> Tuple[float, float, int]:
-
-    '''Durbin's test whether k groups (or treatments) in a two-way
+    data: Union[List, np.ndarray, DataFrame],
+    y_col: Optional[Union[str, int]] = None,
+    block_col: Optional[Union[str, int]] = None,
+    group_col: Optional[Union[str, int]] = None,
+    melted: bool = False,
+    sort: bool = True,
+) -> Tuple[float, float, int]:
+    """Durbin's test whether k groups (or treatments) in a two-way
     balanced incomplete block design (BIBD) have identical effects. See
     references for additional information [1]_, [2]_.
 
@@ -331,11 +343,15 @@ def test_durbin(
     --------
     >>> x = np.array([[31,27,24],[31,28,31],[45,29,46],[21,18,48],[42,36,46],[32,17,40]])
     >>> sp.test_durbin(x)
-    '''
+    """
     if melted and not all([block_col, group_col, y_col]):
-        raise ValueError('block_col, group_col, y_col should be explicitly specified if using melted data')
+        raise ValueError(
+            "block_col, group_col, y_col should be explicitly specified if using melted data"
+        )
 
-    x, _y_col, _group_col, _block_col = __convert_to_block_df(data, y_col, group_col, block_col, melted)
+    x, _y_col, _group_col, _block_col = __convert_to_block_df(
+        data, y_col, group_col, block_col, melted
+    )
 
     groups = x[_group_col].unique()
     blocks = x[_block_col].unique()
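The melted-data guard above requires all three column names. A hedged sketch of a melted call (the column names here are illustrative, not from the commit):

import pandas as pd
import scikit_posthocs as sp

df = pd.DataFrame({
    "block": [1, 1, 1, 2, 2, 2],
    "group": ["a", "b", "c", "a", "b", "c"],
    "y": [31, 27, 24, 31, 28, 31],
})
# Omitting any of the three column names here raises the ValueError above.
sp.test_durbin(df, y_col="y", block_col="block", group_col="group", melted=True)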
@@ -350,21 +366,20 @@
     r = np.unique(x.groupby(_group_col).count())
     k = np.unique(x.groupby(_block_col).count())
     if r.size > 1 and k.size > 1:
-        raise ValueError('Data appear to be unbalanced. Please correct your input data')
+        raise ValueError("Data appear to be unbalanced. Please correct your input data")
     else:
-        r = r.item()
-        k = k.item()
-
-    x['y_ranks'] = x.groupby(_block_col)[_y_col].rank()
-    x['y_ranks_sum'] = x.groupby(_group_col)['y_ranks'].sum()
+        r = float(r.item())
+        k = float(k.item())
+    x["y_ranks"] = x.groupby(_block_col)[_y_col].rank()
+    x["y_ranks_sum"] = x.groupby(_group_col)["y_ranks"].sum()
 
-    A = np.sum(x['y_ranks'] ** 2.)
-    C = (b * k * (k + 1)**2.) / 4.
-    D = np.sum(x['y_ranks_sum'] ** 2.) - r * C
-    T1 = (t - 1.) / (A - C) * D
+    A = float(np.sum(x["y_ranks"] ** 2.0))
+    C = float(b * k * (k + 1) ** 2.0) / 4.0
+    D = float(np.sum(x["y_ranks_sum"] ** 2.0)) - r * C
+    T1 = (t - 1.0) / (A - C) * D
 
     stat = T1
     df = t - 1
-    pval = ss.chi2.sf(stat, df)
+    pval = ss.chi2.sf(stat, df).item()
 
     return pval, stat, df
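In symbols, the statistic assembled above is T1 = (t - 1) * D / (A - C), where A is the sum of squared within-block ranks, C = b * k * (k + 1)^2 / 4 is the correction term, and D is the sum of squared per-group rank sums minus r * C; T1 is referred to a chi-square distribution with t - 1 degrees of freedom. The docstring example exercises it end to end:

import numpy as np
import scikit_posthocs as sp

x = np.array([[31, 27, 24], [31, 28, 31], [45, 29, 46],
              [21, 18, 48], [42, 36, 46], [32, 17, 40]])
pval, stat, df = sp.test_durbin(x)  # chi-square p-value, T1, and t - 1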