Add groupby to inequality metrics #158

Merged 10 commits on Jan 23, 2021
4 changes: 2 additions & 2 deletions .github/workflows/build_and_test.yml
@@ -1,4 +1,4 @@
name: Build and test [Python 3.6, 3.7, 3.8]
name: Build and test [Python 3.7, 3.8, 3.9]

on: [push, pull_request]

@@ -7,7 +7,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
python-version: [3.7, 3.8, 3.9]

steps:
- name: Checkout
2 changes: 1 addition & 1 deletion .github/workflows/check_jupyterbook.yml
@@ -15,7 +15,7 @@ jobs:
with:
activate-environment: microdf
environment-file: environment.yml
python-version: 3.8
python-version: 3.9
auto-activate-base: false

- name: Build # Build Jupyter Book
2 changes: 1 addition & 1 deletion .github/workflows/deploy_jupyterbook.yml
@@ -18,7 +18,7 @@ jobs:
with:
activate-environment: microdf
environment-file: environment.yml
python-version: 3.8
python-version: 3.9
auto-activate-base: false

- name: Build
2 changes: 1 addition & 1 deletion .github/workflows/linting.yml
@@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.8]
python-version: [3.9]

steps:
- name: Checkout
2 changes: 0 additions & 2 deletions environment.yml
@@ -1,7 +1,6 @@
name: microdf
channels:
- conda-forge
- PSLmodels
dependencies:
- codecov
- flake8
@@ -12,7 +11,6 @@ dependencies:
- pytest
- seaborn
- setuptools
- taxcalc
- pip:
- matplotlib-label-lines
- jupyter-book
109 changes: 67 additions & 42 deletions microdf/inequality.py
@@ -3,7 +3,7 @@
import microdf as mdf


def gini(df, col, w=None, negatives=None):
def gini(df, col, w=None, negatives=None, groupby=None):
"""Calculates Gini index.

:param df: DataFrame.
@@ -16,96 +16,116 @@ def gini(df, col, w=None, negatives=None):
when this minimum is negative. That is, it adds the absolute
minimum value.
Defaults to None, which leaves negative values as they are.
:param groupby: Column, or list of columns, to group by.
:returns: A float, the Gini index.

"""
# Requires float numpy arrays (not pandas Series or lists) to work.
x = np.array(df[col]).astype("float")
if negatives == "zero":
x[x < 0] = 0
if negatives == "shift" and np.amin(x) < 0:
x -= np.amin(x)
if w is not None:
w = np.array(df[w]).astype("float")
sorted_indices = np.argsort(x)
sorted_x = x[sorted_indices]
sorted_w = w[sorted_indices]
cumw = np.cumsum(sorted_w)
cumxw = np.cumsum(sorted_x * sorted_w)
return np.sum(cumxw[1:] * cumw[:-1] - cumxw[:-1] * cumw[1:]) / (
cumxw[-1] * cumw[-1]
)
else:
sorted_x = np.sort(x)
n = len(x)
cumxw = np.cumsum(sorted_x)
# The above formula, with all weights equal to 1 simplifies to:
return (n + 1 - 2 * np.sum(cumxw) / cumxw[-1]) / n


def top_x_pct_share(df, col, top_x_pct, w=None):

def _gini(df, col, w=None, negatives=None):
# Requires float numpy arrays (not pandas Series or lists) to work.
x = np.array(df[col]).astype("float")
if negatives == "zero":
x[x < 0] = 0
if negatives == "shift" and np.amin(x) < 0:
x -= np.amin(x)
if w is not None:
w = np.array(df[w]).astype("float")
sorted_indices = np.argsort(x)
sorted_x = x[sorted_indices]
sorted_w = w[sorted_indices]
cumw = np.cumsum(sorted_w)
cumxw = np.cumsum(sorted_x * sorted_w)
return np.sum(cumxw[1:] * cumw[:-1] - cumxw[:-1] * cumw[1:]) / (
cumxw[-1] * cumw[-1]
)
else:
sorted_x = np.sort(x)
n = len(x)
cumxw = np.cumsum(sorted_x)
# The above formula, with all weights equal to 1 simplifies to:
return (n + 1 - 2 * np.sum(cumxw) / cumxw[-1]) / n

if groupby is None:
return _gini(df, col, w, negatives)
return df.groupby(groupby).apply(lambda x: _gini(x, col, w, negatives))


def top_x_pct_share(df, col, top_x_pct, w=None, groupby=None):
"""Calculates top x% share.

:param df: DataFrame.
:param col: Name of column in df representing value.
:param top_x_pct: Decimal between 0 and 1 of the top %, e.g. 0.1, 0.001.
:param w: Column representing weight in df.
:param groupby: Column, or list of columns, to group by.
:returns: The share of w-weighted val held by the top x%.

"""
threshold = mdf.weighted_quantile(df, col, w, 1 - top_x_pct)
top_x_pct_sum = mdf.weighted_sum(df[df[col] >= threshold], col, w)
total_sum = mdf.weighted_sum(df, col, w)
return top_x_pct_sum / total_sum

def _top_x_pct_share(df, col, top_x_pct, w=None):
threshold = mdf.weighted_quantile(df, col, w, 1 - top_x_pct)
top_x_pct_sum = mdf.weighted_sum(df[df[col] >= threshold], col, w)
total_sum = mdf.weighted_sum(df, col, w)
return top_x_pct_sum / total_sum

if groupby is None:
return _top_x_pct_share(df, col, top_x_pct, w)
return df.groupby(groupby).apply(
lambda x: _top_x_pct_share(x, col, top_x_pct, w)
)


def bottom_x_pct_share(df, col, bottom_x_pct, w=None):
def bottom_x_pct_share(df, col, bottom_x_pct, w=None, groupby=None):
"""Calculates bottom x% share.

:param df: DataFrame.
:param col: Name of column in df representing value.
:param bottom_x_pct: Decimal between 0 and 1 of the bottom %, e.g. 0.1, 0.001.
:param w: Column representing weight in df.
:param groupby: Column, or list of columns, to group by.
:returns: The share of w-weighted val held by the bottom x%.

"""
return 1 - top_x_pct_share(df, col, 1 - bottom_x_pct, w, top=False)
return 1 - top_x_pct_share(df, col, 1 - bottom_x_pct, w, groupby)


def bottom_50_pct_share(df, col, w=None):
def bottom_50_pct_share(df, col, w=None, groupby=None):
"""Calculates bottom 50% share.

:param df: DataFrame.
:param col: Name of column in df representing value.
:param w: Column representing weight in df.
:param groupby: Column, or list of columns, to group by.
:returns: The share of w-weighted val held by the bottom 50%.

"""
return bottom_x_pct_share(df, col, 0.5, w)
return bottom_x_pct_share(df, col, 0.5, w, groupby)


def top_50_pct_share(df, col, w=None):
def top_50_pct_share(df, col, w=None, groupby=None):
"""Calculates top 50% share.

:param df: DataFrame.
:param col: Name of column in df representing value.
:param w: Column representing weight in df.
:param groupby: Column, or list of columns, to group by.
:returns: The share of w-weighted val held by the top 50%.

"""
return top_x_pct_share(df, col, 0.5, w)
return top_x_pct_share(df, col, 0.5, w, groupby)


def top_10_pct_share(df, col, w=None):
def top_10_pct_share(df, col, w=None, groupby=None):
"""Calculates top 10% share.

:param df: DataFrame.
:param col: Name of column in df representing value.
:param w: Column representing weight in df.
:param groupby: Column, or list of columns, to group by.
:returns: The share of w-weighted val held by the top 10%.

"""
return top_x_pct_share(df, col, 0.1, w)
return top_x_pct_share(df, col, 0.1, w, groupby)


def top_1_pct_share(df, col, w=None):
@@ -114,32 +134,37 @@ def top_1_pct_share(df, col, w=None):
:param df: DataFrame.
:param col: Name of column in df representing value.
:param w: Column representing weight in df.
:param groupby: Column, or list of columns, to group by.
:returns: The share of w-weighted val held by the top 1%.

"""
return top_x_pct_share(df, col, 0.01, w)


def top_0_1_pct_share(df, col, w=None):
def top_0_1_pct_share(df, col, w=None, groupby=None):
"""Calculates top 0.1% share.

:param df: DataFrame.
:param col: Name of column in df representing value.
:param w: Column representing weight in df.
:param groupby: Column, or list of columns, to group by.
:returns: The share of w-weighted val held by the top 0.1%.

"""
return top_x_pct_share(df, col, 0.001, w)
return top_x_pct_share(df, col, 0.001, w, groupby)


def t10_b50(df, col, w=None):
def t10_b50(df, col, w=None, groupby=None):
"""Calculates ratio between the top 10% and bottom 50% shares.

:param df: DataFrame.
:param col: Name of column in df representing value.
:param w: Column representing weight in df.
:param groupby: Column, or list of columns, to group by.
:returns: The share of w-weighted val held by the top 10% divided by
the share of w-weighted val held by the bottom 50%.

"""
return top_10_pct_share(df, col, w) / bottom_50_pct_share(df, col, w)
t10 = top_10_pct_share(df, col, w, groupby)
b50 = bottom_50_pct_share(df, col, w, groupby)
return t10 / b50
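
For reference, a minimal usage sketch of the new groupby parameter (not part of the diff; the DataFrame, column names, and values below are hypothetical). Without groupby these functions return a float; with groupby they go through df.groupby(groupby).apply(...), so they return a pandas Series indexed by the group column(s).

import pandas as pd
import microdf as mdf

# Hypothetical data: income and survey weight by state.
df = pd.DataFrame({
    "income": [10, 20, 30, 40, 100, 200],
    "weight": [1, 2, 1, 2, 1, 2],
    "state": ["CA", "CA", "CA", "NY", "NY", "NY"],
})

# Ungrouped: a single float for the whole DataFrame.
overall_gini = mdf.gini(df, "income", w="weight")

# Grouped: a pandas Series with one value per state.
gini_by_state = mdf.gini(df, "income", w="weight", groupby="state")
t10_by_state = mdf.top_10_pct_share(df, "income", w="weight", groupby="state")
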
11 changes: 11 additions & 0 deletions microdf/tests/test_weighted.py
@@ -49,3 +49,14 @@ def test_weighted_sum():
# Test grouped.
mdf.weighted_sum(dfg, "x", "w", "g")
mdf.weighted_sum(dfg, ["x", "y"], "w", "g")


def test_gini():
# Unweighted
mdf.gini(df, "x")
# Weighted
mdf.gini(df, "x", "w")
# Unweighted, grouped
mdf.gini(dfg, "x", groupby="g")
# Weighted, grouped
mdf.gini(dfg, "x", "w", groupby="g")