diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 04146a8..daeab97 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1,4 +1,4 @@ -name: Build and test [Python 3.6, 3.7, 3.8] +name: Build and test [Python 3.7, 3.8, 3.9] on: [push, pull_request] @@ -7,7 +7,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.7, 3.8, 3.9] steps: - name: Checkout diff --git a/.github/workflows/check_jupyterbook.yml b/.github/workflows/check_jupyterbook.yml index d2174d5..49f4e0d 100644 --- a/.github/workflows/check_jupyterbook.yml +++ b/.github/workflows/check_jupyterbook.yml @@ -15,7 +15,7 @@ jobs: with: activate-environment: microdf environment-file: environment.yml - python-version: 3.8 + python-version: 3.9 auto-activate-base: false - name: Build # Build Jupyter Book diff --git a/.github/workflows/deploy_jupyterbook.yml b/.github/workflows/deploy_jupyterbook.yml index 677b49a..f78dcc5 100644 --- a/.github/workflows/deploy_jupyterbook.yml +++ b/.github/workflows/deploy_jupyterbook.yml @@ -18,7 +18,7 @@ jobs: with: activate-environment: microdf environment-file: environment.yml - python-version: 3.8 + python-version: 3.9 auto-activate-base: false - name: Build diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index c887c4e..133ddf6 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.8] + python-version: [3.9] steps: - name: Checkout diff --git a/environment.yml b/environment.yml index 7a104f5..13c28af 100644 --- a/environment.yml +++ b/environment.yml @@ -1,7 +1,6 @@ name: microdf channels: - conda-forge -- PSLmodels dependencies: - codecov - flake8 @@ -12,7 +11,6 @@ dependencies: - pytest - seaborn - setuptools -- taxcalc - pip: - matplotlib-label-lines - jupyter-book diff --git a/microdf/inequality.py b/microdf/inequality.py index 765a451..8678d30 100644 --- a/microdf/inequality.py +++ b/microdf/inequality.py @@ -3,7 +3,7 @@ import microdf as mdf -def gini(df, col, w=None, negatives=None): +def gini(df, col, w=None, negatives=None, groupby=None): """Calculates Gini index. :param df: DataFrame. @@ -16,96 +16,116 @@ def gini(df, col, w=None, negatives=None): when this minimum is negative. That is, it adds the absolute minimum value. Defaults to None, which leaves negative values as they are. + :param groupby: Column, or list of columns, to group by. :returns: A float, the Gini index. """ - # Requires float numpy arrays (not pandas Series or lists) to work. - x = np.array(df[col]).astype("float") - if negatives == "zero": - x[x < 0] = 0 - if negatives == "shift" and np.amin(x) < 0: - x -= np.amin(x) - if w is not None: - w = np.array(df[w]).astype("float") - sorted_indices = np.argsort(x) - sorted_x = x[sorted_indices] - sorted_w = w[sorted_indices] - cumw = np.cumsum(sorted_w) - cumxw = np.cumsum(sorted_x * sorted_w) - return np.sum(cumxw[1:] * cumw[:-1] - cumxw[:-1] * cumw[1:]) / ( - cumxw[-1] * cumw[-1] - ) - else: - sorted_x = np.sort(x) - n = len(x) - cumxw = np.cumsum(sorted_x) - # The above formula, with all weights equal to 1 simplifies to: - return (n + 1 - 2 * np.sum(cumxw) / cumxw[-1]) / n - - -def top_x_pct_share(df, col, top_x_pct, w=None): + + def _gini(df, col, w=None, negatives=None): + # Requires float numpy arrays (not pandas Series or lists) to work. + x = np.array(df[col]).astype("float") + if negatives == "zero": + x[x < 0] = 0 + if negatives == "shift" and np.amin(x) < 0: + x -= np.amin(x) + if w is not None: + w = np.array(df[w]).astype("float") + sorted_indices = np.argsort(x) + sorted_x = x[sorted_indices] + sorted_w = w[sorted_indices] + cumw = np.cumsum(sorted_w) + cumxw = np.cumsum(sorted_x * sorted_w) + return np.sum(cumxw[1:] * cumw[:-1] - cumxw[:-1] * cumw[1:]) / ( + cumxw[-1] * cumw[-1] + ) + else: + sorted_x = np.sort(x) + n = len(x) + cumxw = np.cumsum(sorted_x) + # The above formula, with all weights equal to 1 simplifies to: + return (n + 1 - 2 * np.sum(cumxw) / cumxw[-1]) / n + + if groupby is None: + return _gini(df, col, w, negatives) + return df.groupby(groupby).apply(lambda x: _gini(x, col, w, negatives)) + + +def top_x_pct_share(df, col, top_x_pct, w=None, groupby=None): """Calculates top x% share. :param df: DataFrame. :param col: Name of column in df representing value. :param top_x_pct: Decimal between 0 and 1 of the top %, e.g. 0.1, 0.001. :param w: Column representing weight in df. + :param groupby: Column, or list of columns, to group by. :returns: The share of w-weighted val held by the top x%. """ - threshold = mdf.weighted_quantile(df, col, w, 1 - top_x_pct) - top_x_pct_sum = mdf.weighted_sum(df[df[col] >= threshold], col, w) - total_sum = mdf.weighted_sum(df, col, w) - return top_x_pct_sum / total_sum + + def _top_x_pct_share(df, col, top_x_pct, w=None): + threshold = mdf.weighted_quantile(df, col, w, 1 - top_x_pct) + top_x_pct_sum = mdf.weighted_sum(df[df[col] >= threshold], col, w) + total_sum = mdf.weighted_sum(df, col, w) + return top_x_pct_sum / total_sum + + if groupby is None: + return _top_x_pct_share(df, col, top_x_pct, w) + return df.groupby(groupby).apply( + lambda x: _top_x_pct_share(x, col, top_x_pct, w) + ) -def bottom_x_pct_share(df, col, bottom_x_pct, w=None): +def bottom_x_pct_share(df, col, bottom_x_pct, w=None, groupby=None): """Calculates bottom x% share. :param df: DataFrame. :param col: Name of column in df representing value. :param bottom_x_pct: Decimal between 0 and 1 of the top %, e.g. 0.1, 0.001. :param w: Column representing weight in df. + :param groupby: Column, or list of columns, to group by. :returns: The share of w-weighted val held by the bottom x%. """ - return 1 - top_x_pct_share(df, col, 1 - bottom_x_pct, w, top=False) + return 1 - top_x_pct_share(df, col, 1 - bottom_x_pct, w, groupby) -def bottom_50_pct_share(df, col, w=None): +def bottom_50_pct_share(df, col, w=None, groupby=None): """Calculates bottom 50% share. :param df: DataFrame. :param col: Name of column in df representing value. :param w: Column representing weight in df. + :param groupby: Column, or list of columns, to group by. :returns: The share of w-weighted val held by the bottom 50%. """ - return bottom_x_pct_share(df, col, 0.5, w) + return bottom_x_pct_share(df, col, 0.5, w, groupby) -def top_50_pct_share(df, col, w=None): +def top_50_pct_share(df, col, w=None, groupby=None): """Calculates top 50% share. :param df: DataFrame. :param col: Name of column in df representing value. :param w: Column representing weight in df. + :param groupby: Column, or list of columns, to group by. :returns: The share of w-weighted val held by the top 50%. """ - return top_x_pct_share(df, col, 0.5, w) + return top_x_pct_share(df, col, 0.5, w, groupby) -def top_10_pct_share(df, col, w=None): +def top_10_pct_share(df, col, w=None, groupby=None): """Calculates top 10% share. :param df: DataFrame. :param col: Name of column in df representing value. :param w: Column representing weight in df. + :param groupby: Column, or list of columns, to group by. :returns: The share of w-weighted val held by the top 10%. """ - return top_x_pct_share(df, col, 0.1, w) + return top_x_pct_share(df, col, 0.1, w, groupby) def top_1_pct_share(df, col, w=None): @@ -114,32 +134,37 @@ def top_1_pct_share(df, col, w=None): :param df: DataFrame. :param col: Name of column in df representing value. :param w: Column representing weight in df. + :param groupby: Column, or list of columns, to group by. :returns: The share of w-weighted val held by the top 1%. """ return top_x_pct_share(df, col, 0.01, w) -def top_0_1_pct_share(df, col, w=None): +def top_0_1_pct_share(df, col, w=None, groupby=None): """Calculates top 0.1% share. :param df: DataFrame. :param col: Name of column in df representing value. :param w: Column representing weight in df. + :param groupby: Column, or list of columns, to group by. :returns: The share of w-weighted val held by the top 0.1%. """ - return top_x_pct_share(df, col, 0.001, w) + return top_x_pct_share(df, col, 0.001, w, groupby) -def t10_b50(df, col, w=None): +def t10_b50(df, col, w=None, groupby=None): """Calculates ratio between the top 10% and bottom 50% shares. :param df: DataFrame. :param col: Name of column in df representing value. :param w: Column representing weight in df. + :param groupby: Column, or list of columns, to group by. :returns: The share of w-weighted val held by the top 10% divided by the share of w-weighted val held by the bottom 50%. """ - return top_10_pct_share(df, col, w) / bottom_50_pct_share(df, col, w) + t10 = top_10_pct_share(df, col, w, groupby) + b50 = bottom_50_pct_share(df, col, w, groupby) + return t10 / b50 diff --git a/microdf/tests/test_weighted.py b/microdf/tests/test_weighted.py index c481b4f..bc23251 100644 --- a/microdf/tests/test_weighted.py +++ b/microdf/tests/test_weighted.py @@ -49,3 +49,14 @@ def test_weighted_sum(): # Test grouped. mdf.weighted_sum(dfg, "x", "w", "g") mdf.weighted_sum(dfg, ["x", "y"], "w", "g") + + +def test_gini(): + # Unweighted + mdf.gini(df, "x") + # Weighted + mdf.gini(df, "x", "w") + # Unweighted, grouped + mdf.gini(dfg, "x", groupby="g") + # Weighted, grouped + mdf.gini(dfg, "x", "w", groupby="g")