diff --git a/src/obscure_stats/association/association.py b/src/obscure_stats/association/association.py
index b5ed465..9ed0b4c 100644
--- a/src/obscure_stats/association/association.py
+++ b/src/obscure_stats/association/association.py
@@ -267,6 +267,7 @@ def tanimoto_similarity(x: np.ndarray, y: np.ndarray) -> float:
 
     It is very similar to Jaccard or Cosine similarity
     but differs in how dot product is normalized.
+    This version is designed for numeric values rather than sets.
 
     Parameters
     ----------
diff --git a/src/obscure_stats/central_tendency/central_tendency.py b/src/obscure_stats/central_tendency/central_tendency.py
index 604fe78..a26e21a 100644
--- a/src/obscure_stats/central_tendency/central_tendency.py
+++ b/src/obscure_stats/central_tendency/central_tendency.py
@@ -14,7 +14,7 @@ def midrange(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose midrange is desired.
+        Input array.
 
     Returns
     -------
@@ -40,7 +40,7 @@ def midhinge(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose midhinge is desired.
+        Input array.
 
     Returns
     -------
@@ -65,7 +65,7 @@ def trimean(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose trimean is desired.
+        Input array.
 
     Returns
     -------
@@ -92,7 +92,7 @@ def contraharmonic_mean(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose contraharmonic mean is desired.
+        Input array.
 
     Returns
     -------
@@ -116,7 +116,7 @@ def midmean(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose interquartile mean is desired.
+        Input array.
 
     Returns
     -------
@@ -164,9 +164,11 @@ def hodges_lehmann_sen_location(x: np.ndarray) -> float:
     This implementation uses cartesian product, so the time and memory
     complexity are N^2. It is best to not use it on large arrays.
     """
-    walsh_sums = np.asarray(x).reshape(-1, 1) + np.asarray(x).reshape(1, -1)
-    mask = np.triu_indices(len(x), 1)  # we need only upper trianle without diagonal
-    return np.nanmedian(walsh_sums[mask]) * 0.5
+    # In the original paper, the authors suggest using only the upper triangle
+    # of the cartesian product, but this implementation uses the
+    # whole matrix, which is equivalent.
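+    # np.meshgrid with sparse=True returns two broadcastable arrays of shape
+    # (1, n) and (n, 1), so the full n x n matrix of Walsh sums is only
+    # materialized by the addition below.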
+    product = np.meshgrid(x, x, sparse=True)
+    return np.nanmedian(product[0] + product[1]) * 0.5
 
 
 def standard_trimmed_harrell_davis_quantile(x: np.ndarray, q: float = 0.5) -> float:
diff --git a/src/obscure_stats/dispersion/__init__.py b/src/obscure_stats/dispersion/__init__.py
index 037a39d..d648e0e 100644
--- a/src/obscure_stats/dispersion/__init__.py
+++ b/src/obscure_stats/dispersion/__init__.py
@@ -5,8 +5,6 @@
     coefficient_of_variation,
     dispersion_ratio,
     efficiency,
-    hoover_index,
-    jains_fairness_index,
     lloyds_index,
     morisita_index,
     quartile_coefficient_of_dispersion,
@@ -20,12 +18,10 @@
     "coefficient_of_variation",
     "dispersion_ratio",
     "efficiency",
-    "hoover_index",
     "lloyds_index",
     "morisita_index",
     "quartile_coefficient_of_dispersion",
     "sqad",
     "studentized_range",
     "robust_coefficient_of_variation",
-    "jains_fairness_index",
 ]
diff --git a/src/obscure_stats/dispersion/dispersion.py b/src/obscure_stats/dispersion/dispersion.py
index 48bf421..1ca430a 100644
--- a/src/obscure_stats/dispersion/dispersion.py
+++ b/src/obscure_stats/dispersion/dispersion.py
@@ -190,34 +190,6 @@ def dispersion_ratio(x: np.ndarray) -> float:
     return np.nanmean(x) / (stats.gmean(x, nan_policy="omit") + EPS)
 
 
-def hoover_index(x: np.ndarray) -> float:
-    """Calculate Hoover index.
-
-    It is also known as the Robin Hood index, Schutz index or Pietra ratio.
-
-    Mostly used as measure of income inequality.
-    A value of 0 represents total equality, and 1 represents perfect inequality.
-    In general - measure of uniformity of the distribution.
-
-    Parameters
-    ----------
-    x : array_like
-        Input array.
-
-    Returns
-    -------
-    hi : float or array_like.
-        The value of the Hoover index.
-
-    References
-    ----------
-    Hoover Jr, E. M. (1936).
-    The Measurement of Industrial Localization.
-    Review of Economics and Statistics, 18, No. 162-71.
-    """
-    return 0.5 * np.nansum(x - np.nanmean(x)) / np.nansum(x)
-
-
 def lloyds_index(x: np.ndarray) -> float:
     """Calculate Lloyd's index of mean crowding.
 
@@ -296,35 +268,3 @@ def sqad(x: np.ndarray) -> float:
     med = np.nanmedian(x)
     # constant value to maximize efficiency for normal distribution
     return np.nanquantile(np.abs(x - med), q=0.682689492137086)
-
-
-def jains_fairness_index(x: np.ndarray) -> float:
-    """Calculate Jain's Fairness Index.
-
-    Jain's Fairness Index is a fairness measures commonly used in network engineering.
-    The result ranges from 1/n (worst case) to 1 (best case),
-    and it is maximum when all users receive the same allocation.
-    In general - measure of uniformity of the distribution.
-
-    Parameters
-    ----------
-    x : array_like
-        Input array.
-
-    Returns
-    -------
-    jfi : float or array_like.
-        The value of the coefficient of variation.
-
-    References
-    ----------
-    Jain, R.; Chiu, D. M.; Hawe, W. (1984).
-    A Quantitative Measure of Fairness and Discrimination
-    for Resource Allocation in Shared Computer Systems.
-    DEC Research Report TR-301.
-    """
-    cv = coefficient_of_variation(x)
-    if cv is np.inf:
-        warnings.warn("CV is inf, Jain's Index is not defined.", stacklevel=2)
-        return np.inf
-    return 1.0 / (1.0 + cv**2)
diff --git a/src/obscure_stats/kurtosis/kurtosis.py b/src/obscure_stats/kurtosis/kurtosis.py
index de8a989..045c565 100644
--- a/src/obscure_stats/kurtosis/kurtosis.py
+++ b/src/obscure_stats/kurtosis/kurtosis.py
@@ -11,7 +11,7 @@ def moors_kurt(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose Moor's kurtosis is desired.
+        Input array.
 
     Returns
     -------
@@ -35,7 +35,7 @@ def moors_octile_kurt(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose Moor's octile kurtosis is desired.
+        Input array.
 
     Returns
     -------
@@ -64,7 +64,7 @@ def hogg_kurt(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose Hogg's kurtosis coefficient is desired.s
+        Input array.
 
     Returns
     -------
@@ -96,7 +96,7 @@ def crow_siddiqui_kurt(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose Crow & Siddiqui kurtosis coefficient is desired.
+        Input array.
 
     Returns
     -------
@@ -122,7 +122,7 @@ def reza_ma_kurt(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose Reza & Ma kurtosis coefficient is desired.
+        Input array.
 
     Returns
     -------
diff --git a/src/obscure_stats/skewness/__init__.py b/src/obscure_stats/skewness/__init__.py
index 600b7c9..f119628 100644
--- a/src/obscure_stats/skewness/__init__.py
+++ b/src/obscure_stats/skewness/__init__.py
@@ -2,6 +2,7 @@
 
 from .skewness import (
     auc_skew_gamma,
+    bickel_mode_skew,
     bowley_skew,
     forhad_shorna_rank_skew,
     groeneveld_skew,
@@ -26,4 +27,5 @@
     "pearson_median_skew",
     "pearson_mode_skew",
     "pearson_halfmode_skew",
+    "bickel_mode_skew",
 ]
diff --git a/src/obscure_stats/skewness/skewness.py b/src/obscure_stats/skewness/skewness.py
index f91c9df..a6c1000 100644
--- a/src/obscure_stats/skewness/skewness.py
+++ b/src/obscure_stats/skewness/skewness.py
@@ -16,7 +16,7 @@ def pearson_mode_skew(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose Pearson's mode skew coefficient is desired.
+        Input array.
 
     Returns
     -------
@@ -44,7 +44,7 @@ def pearson_halfmode_skew(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose Pearson's mode skew coefficient is desired.
+        Input array.
 
     Returns
     -------
@@ -63,13 +63,36 @@ def pearson_halfmode_skew(x: np.ndarray) -> float:
     return (mean - mode) / std
 
 
+def bickel_mode_skew(x: np.ndarray) -> float:
+    """Calculate robust mode skew with half sample mode.
+
+    Parameters
+    ----------
+    x : array_like
+        Input array.
+
+    Returns
+    -------
+    bmods : float or array_like.
+        The value of Bickel's mode skew coefficient.
+
+    References
+    ----------
+    Bickel, D. R. (2002).
+    Robust estimators of the mode and skewness of continuous data.
+    Computational Statistics & Data Analysis, Elsevier, 39(2), 153-163.
+    """
+    mode = half_sample_mode(x)
+    return np.nanmean(np.sign(x - mode))
+
+
 def pearson_median_skew(x: np.ndarray) -> float:
     """Calculatie Pearson's median skew coefficient.
 
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose Pearson's median skew coefficient is desired.
+        Input array.
 
     Returns
     -------
@@ -94,7 +117,7 @@ def medeen_skew(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose Medeen's skewness statistic is desired.
+        Input array.
 
     Returns
     -------
@@ -121,7 +144,7 @@ def bowley_skew(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose Bowley's skewness coefficinet is desired.
+        Input array.
 
     Returns
     -------
@@ -147,7 +170,7 @@ def groeneveld_skew(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose Groeneveld's skewness coefficinet is desired.
+        Input array.
 
     Returns
     -------
@@ -175,7 +198,7 @@ def kelly_skew(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose Kelly's skewness coefficinet is desired.
+        Input array.
 
     Returns
     -------
@@ -200,8 +223,7 @@ def hossain_adnan_skew(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose Houssain and Adnan skewness coefficient
-        is desired.
+        Input array.
 
     Returns
     -------
@@ -226,8 +248,7 @@ def forhad_shorna_rank_skew(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose Forhad-Shorna coefficient of Rank Skewness
-        is desired.
+        Input array.
 
     Returns
     -------
@@ -268,7 +289,7 @@ def auc_skew_gamma(x: np.ndarray, dp: float = 0.01) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose AUC Bowley skewness is desired.
+        Input array.
     dp : float, default = 0.01
         Step used in calculating area under the curve (integrating).
 
     Returns
     -------
@@ -297,7 +318,7 @@ def wauc_skew_gamma(x: np.ndarray, dp: float = 0.01) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose AUC Bowley skewness is desired.
+        Input array.
     dp : float, default = 0.01
         Step used in calculating area under the curve (integrating).
diff --git a/src/obscure_stats/variation/variation.py b/src/obscure_stats/variation/variation.py
index 32da536..fe03ba0 100644
--- a/src/obscure_stats/variation/variation.py
+++ b/src/obscure_stats/variation/variation.py
@@ -19,7 +19,7 @@ def mod_vr(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose mode variation ratio is desired.
+        Input array.
 
     Returns
     -------
@@ -49,7 +49,7 @@ def range_vr(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose range variation ratio is desired.
+        Input array.
 
     Returns
     -------
@@ -79,7 +79,7 @@ def gibbs_m1(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose Gibbs M1 index is desired.
+        Input array.
 
     Returns
     -------
@@ -120,7 +120,7 @@ def gibbs_m2(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose Gibbs M2 index is desired.
+        Input array.
 
     Returns
     -------
@@ -149,7 +149,7 @@ def b_index(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose B index is desired.
+        Input array.
 
     Returns
     -------
@@ -178,7 +178,7 @@ def ada_index(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose ADA index is desired.
+        Input array.
 
     Returns
     -------
@@ -209,7 +209,7 @@ def extropy(x: np.ndarray) -> float:
     Parameters
     ----------
     x : array_like
-        Array containing numbers whose extropy is desired.
+        Input array.
 
     Returns
     -------
diff --git a/tests/conftest.py b/tests/conftest.py
index dc9759e..e03a5c3 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -88,3 +88,145 @@ def c_array_nan(c_array_obj: np.ndarray) -> np.ndarray:
     temp = c_array_obj.copy()
     temp[1] = None
     return temp
+
+
+@pytest.fixture(scope="session")
+def rank_skewness_test_data() -> np.ndarray:
+    """Test data from the paper for Rank Skew."""
+    return np.asarray(
+        (
+            73.3,
+            80.5,
+            50.4,
+            64.8,
+            74.0,
+            72.8,
+            72.0,
+            59.7,
+            90.9,
+            76.9,
+            71.4,
+            45.6,
+            77.5,
+            60.6,
+            67.5,
+            54.6,
+            71.0,
+            66.0,
+            71.0,
+            74.0,
+            72.7,
+            73.6,
+            97.5,
+            89.6,
+            70.5,
+            78.1,
+            84.6,
+            92.5,
+            76.9,
+            76.9,
+            59.0,
+            82.4,
+            56.8,
+            83.0,
+            76.5,
+            72.6,
+            65.9,
+            70.0,
+            130.0,
+            76.9,
+            88.2,
+            63.4,
+            123.7,
+            65.6,
+            80.2,
+            84.7,
+            82.6,
+            76.5,
+            80.6,
+            72.3,
+            99.6,
+            80.7,
+            73.3,
+            77.4,
+            68.1,
+            74.6,
+            70.5,
+            58.8,
+            93.7,
+            61.3,
+            76.9,
+            78.2,
+            85.4,
+            72.2,
+            100.0,
+            55.7,
+            79.3,
+            109.0,
+            84.4,
+            76.4,
+            86.4,
+            67.7,
+            74.0,
+            92.3,
+            76.9,
+            64.5,
+            88.7,
+            72.4,
+            65.7,
+            73.6,
+            79.6,
+            64.1,
+            76.9,
+            68.6,
+            73.2,
+            66.3,
+            70.0,
+            91.9,
+            55.5,
+            100.0,
+            79.6,
+            72.7,
+            78.1,
+            68.3,
+            65.9,
+            74.0,
+            67.3,
+            66.3,
+            96.0,
+            73.8,
+            70.0,
+            50.5,
+            73.0,
+            55.0,
+            80.0,
+            84.0,
+            50.9,
+        ),
+    )
+
+
+@pytest.fixture(scope="session")
+def thdme_test_data() -> np.ndarray:
+    """Test data from the paper for the trimmed Harrell-Davis median."""
+    return np.asarray(
+        (-0.565, -0.106, -0.095, 0.363, 0.404, 0.633, 1.371, 1.512, 2.018, 100_000),
+    )
+
+
+@pytest.fixture(scope="session")
+def hls_test_data() -> np.ndarray:
+    """Test data from the paper for Hodges-Lehmann-Sen estimator."""
+    return np.asarray((1, 5, 2, 2, 7, 4, 1, 6))
+
+
+@pytest.fixture(scope="session")
+def hsm_test_data() -> np.ndarray:
+    """Test data for Half Sample Mode."""
+    return np.asarray((1, 2, 2, 2, 7, 4, 1, 6))
+
+
+@pytest.fixture(scope="session")
+def hls_test_data_big() -> list[int]:
+    """Test data with very large values for the Hodges-Lehmann-Sen estimator."""
+    return [10**100, 10**100, 2, 2, 7, 4, 1, 6]
diff --git a/tests/test_central_tendency.py b/tests/test_central_tendency.py
index e85f565..fafe8f2 100644
--- a/tests/test_central_tendency.py
+++ b/tests/test_central_tendency.py
@@ -45,22 +45,18 @@ def test_mock_aggregation_functions(
     func(data)
 
 
-def test_thdm() -> None:
+def test_thdm(thdme_test_data: np.ndarray) -> None:
     """Simple tets case for correctness of Trimmed Harrel Davis median."""
-    x = np.asarray(
-        (-0.565, -0.106, -0.095, 0.363, 0.404, 0.633, 1.371, 1.512, 2.018, 100_000),
-    )
-    result = standard_trimmed_harrell_davis_quantile(x)
+    result = standard_trimmed_harrell_davis_quantile(thdme_test_data)
     if result != pytest.approx(0.6268, rel=1e-4):
         msg = "Results from the test and paper do not match."
         raise ValueError(msg)
 
 
-def test_edge_cases() -> None:
+def test_edge_cases(x_array_float: np.ndarray) -> None:
     """Simple tets case for edge cases."""
-    x = np.asarray([1])
-    result = standard_trimmed_harrell_davis_quantile(x)
-    if result != pytest.approx(1.0, rel=1e-4):
+    result = standard_trimmed_harrell_davis_quantile(x_array_float[:1])
+    if result != pytest.approx(x_array_float[0], rel=1e-4):
         msg = "Result does not match expected output."
         raise ValueError(msg)
 
@@ -73,31 +69,33 @@ def test_q_in_sthdq(x_array_float: np.ndarray) -> None:
         standard_trimmed_harrell_davis_quantile(x_array_float, q=0)
 
 
-def test_hls() -> None:
+def test_hls(hls_test_data: np.ndarray, hls_test_data_big: list[int]) -> None:
     """Simple tets case for correctness of Hodges-Lehmann-Sen."""
-    x = np.asarray((1, 5, 2, 2, 7, 4, 1, 6))
-    result = hodges_lehmann_sen_location(x)
+    result = hodges_lehmann_sen_location(hls_test_data)
     if result != pytest.approx(3.5):
         msg = "Results from the test and paper do not match."
         raise ValueError(msg)
+    result = hodges_lehmann_sen_location(hls_test_data_big)  # type: ignore[arg-type]
+    if result != pytest.approx(5.75):
+        msg = "Results from the test and paper do not match."
+        raise ValueError(msg)
 
 
-def test_hsm() -> None:
+def test_hsm(hsm_test_data: np.ndarray) -> None:
     """Simple tets case for correctness of Half Sample Mode."""
-    x = np.asarray((1, 2, 2, 2, 7, 4, 1, 6))
-    result = half_sample_mode(x)
+    result = half_sample_mode(hsm_test_data)
     if result != pytest.approx(2.0):
         msg = "Results from the test and paper do not match."
         raise ValueError(msg)
-    result = half_sample_mode(x[:3])
+    result = half_sample_mode(hsm_test_data[:3])
     if result != pytest.approx(2):
         msg = "Results from the test and paper do not match."
         raise ValueError(msg)
-    result = half_sample_mode(x[1:4])
+    result = half_sample_mode(hsm_test_data[1:4])
     if result != pytest.approx(2):
         msg = "Results from the test and paper do not match."
         raise ValueError(msg)
-    result = half_sample_mode(x[2:5])
+    result = half_sample_mode(hsm_test_data[2:5])
     if result != pytest.approx(2):
         msg = "Results from the test and paper do not match."
         raise ValueError(msg)
diff --git a/tests/test_dispersion.py b/tests/test_dispersion.py
index 15f27e5..497d019 100644
--- a/tests/test_dispersion.py
+++ b/tests/test_dispersion.py
@@ -9,8 +9,6 @@
     coefficient_of_variation,
     dispersion_ratio,
     efficiency,
-    hoover_index,
-    jains_fairness_index,
    lloyds_index,
     morisita_index,
     quartile_coefficient_of_dispersion,
@@ -28,13 +26,11 @@
         robust_coefficient_of_variation,
         dispersion_ratio,
         efficiency,
-        hoover_index,
         lloyds_index,
         morisita_index,
         quartile_coefficient_of_dispersion,
         sqad,
         studentized_range,
-        jains_fairness_index,
     ],
 )
 @pytest.mark.parametrize(
@@ -59,7 +55,6 @@ def test_mock_aggregation_functions(
         robust_coefficient_of_variation,
         dispersion_ratio,
         efficiency,
-        hoover_index,
         lloyds_index,
         morisita_index,
         quartile_coefficient_of_dispersion,
@@ -85,7 +80,6 @@ def test_dispersion_sensibility(func: typing.Callable, seed: int) -> None:
         robust_coefficient_of_variation,
         quartile_coefficient_of_dispersion,
         efficiency,
-        jains_fairness_index,
     ],
 )
 def test_cv_corner_cases(func: typing.Callable) -> None:
@@ -105,13 +99,11 @@ def test_cv_corner_cases(func: typing.Callable) -> None:
         robust_coefficient_of_variation,
         dispersion_ratio,
         efficiency,
-        hoover_index,
         lloyds_index,
         morisita_index,
         quartile_coefficient_of_dispersion,
         sqad,
         studentized_range,
-        jains_fairness_index,
     ],
 )
 def test_statistic_with_nans(
diff --git a/tests/test_skewness.py b/tests/test_skewness.py
index 5bb759b..9b174e6 100644
--- a/tests/test_skewness.py
+++ b/tests/test_skewness.py
@@ -6,6 +6,7 @@ import pytest
 
 from obscure_stats.skewness import (
     auc_skew_gamma,
+    bickel_mode_skew,
     bowley_skew,
     forhad_shorna_rank_skew,
     groeneveld_skew,
@@ -33,6 +34,7 @@
         pearson_median_skew,
         pearson_mode_skew,
         pearson_halfmode_skew,
+        bickel_mode_skew,
     ],
 )
 @pytest.mark.parametrize(
@@ -63,6 +65,7 @@ def test_mock_aggregation_functions(
         pearson_median_skew,
         pearson_mode_skew,
         pearson_halfmode_skew,
+        bickel_mode_skew,
     ],
 )
 @pytest.mark.parametrize("seed", [1, 42, 99])
@@ -76,120 +79,11 @@ def test_skew_sensibility(func: typing.Callable, seed: int) -> None:
         raise ValueError(msg)
 
 
-def test_rank_skew() -> None:
+def test_rank_skew(rank_skewness_test_data: np.ndarray) -> None:
     """Simple tets case for correctness of Rank skewness coefficient."""
-    x = np.asarray(
-        (
-            73.3,
-            80.5,
-            50.4,
-            64.8,
-            74.0,
-            72.8,
-            72.0,
-            59.7,
-            90.9,
-            76.9,
-            71.4,
-            45.6,
-            77.5,
-            60.6,
-            67.5,
-            54.6,
-            71.0,
-            66.0,
-            71.0,
-            74.0,
-            72.7,
-            73.6,
-            97.5,
-            89.6,
-            70.5,
-            78.1,
-            84.6,
-            92.5,
-            76.9,
-            76.9,
-            59.0,
-            82.4,
-            56.8,
-            83.0,
-            76.5,
-            72.6,
-            65.9,
-            70.0,
-            130.0,
-            76.9,
-            88.2,
-            63.4,
-            123.7,
-            65.6,
-            80.2,
-            84.7,
-            82.6,
-            76.5,
-            80.6,
-            72.3,
-            99.6,
-            80.7,
-            73.3,
-            77.4,
-            68.1,
-            74.6,
-            70.5,
-            58.8,
-            93.7,
-            61.3,
-            76.9,
-            78.2,
-            85.4,
-            72.2,
-            100.0,
-            55.7,
-            79.3,
-            109.0,
-            84.4,
-            76.4,
-            86.4,
-            67.7,
-            74.0,
-            92.3,
-            76.9,
-            64.5,
-            88.7,
-            72.4,
-            65.7,
-            73.6,
-            79.6,
-            64.1,
-            76.9,
-            68.6,
-            73.2,
-            66.3,
-            70.0,
-            91.9,
-            55.5,
-            100.0,
-            79.6,
-            72.7,
-            78.1,
-            68.3,
-            65.9,
-            74.0,
-            67.3,
-            66.3,
-            96.0,
-            73.8,
-            70.0,
-            50.5,
-            73.0,
-            55.0,
-            80.0,
-            84.0,
-            50.9,
-        ),
-    )
-    if forhad_shorna_rank_skew(x) != pytest.approx(0.93809, rel=1e-4):
+    if forhad_shorna_rank_skew(rank_skewness_test_data) != pytest.approx(
+        0.93809, rel=1e-4
+    ):
         msg = "Results from the test and paper do not match."
         raise ValueError(msg)
 
@@ -208,6 +102,7 @@
         pearson_median_skew,
         pearson_mode_skew,
         pearson_halfmode_skew,
+        bickel_mode_skew,
     ],
 )
 def test_statistic_with_nans(
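A quick sanity check (not part of the patch, using plain NumPy) that the new full-matrix meshgrid computation of the Hodges-Lehmann-Sen location agrees with the removed upper-triangle Walsh-sum computation on the sample used in test_hls; the variable names below are illustrative only.

import numpy as np

x = np.asarray((1, 5, 2, 2, 7, 4, 1, 6))  # same values as the hls_test_data fixture

# Removed implementation: median of Walsh sums from the upper triangle (no diagonal).
walsh_sums = x.reshape(-1, 1) + x.reshape(1, -1)
old_estimate = np.nanmedian(walsh_sums[np.triu_indices(len(x), 1)]) * 0.5

# New implementation: median over the whole cartesian product via sparse meshgrid.
grid = np.meshgrid(x, x, sparse=True)
new_estimate = np.nanmedian(grid[0] + grid[1]) * 0.5

print(old_estimate, new_estimate)  # both print 3.5, the value asserted in test_hls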