Skip to content

Commit

Permalink
Merge pull request #158 from MaxGhenis/master
Browse files Browse the repository at this point in the history
Add groupby to inequality metrics
  • Loading branch information
MaxGhenis authored Jan 23, 2021
2 parents a2422e8 + 223683f commit 3a7bfb4
Show file tree
Hide file tree
Showing 7 changed files with 83 additions and 49 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Build and test [Python 3.6, 3.7, 3.8]
name: Build and test [Python 3.7, 3.8, 3.9]

on: [push, pull_request]

Expand All @@ -7,7 +7,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6, 3.7, 3.8]
python-version: [3.7, 3.8, 3.9]

steps:
- name: Checkout
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/check_jupyterbook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
with:
activate-environment: microdf
environment-file: environment.yml
python-version: 3.8
python-version: 3.9
auto-activate-base: false

- name: Build # Build Jupyter Book
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/deploy_jupyterbook.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
with:
activate-environment: microdf
environment-file: environment.yml
python-version: 3.8
python-version: 3.9
auto-activate-base: false

- name: Build
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/linting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.8]
python-version: [3.9]

steps:
- name: Checkout
Expand Down
2 changes: 0 additions & 2 deletions environment.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
name: microdf
channels:
- conda-forge
- PSLmodels
dependencies:
- codecov
- flake8
Expand All @@ -12,7 +11,6 @@ dependencies:
- pytest
- seaborn
- setuptools
- taxcalc
- pip:
- matplotlib-label-lines
- jupyter-book
109 changes: 67 additions & 42 deletions microdf/inequality.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import microdf as mdf


def gini(df, col, w=None, negatives=None):
def gini(df, col, w=None, negatives=None, groupby=None):
"""Calculates Gini index.
:param df: DataFrame.
Expand All @@ -16,96 +16,116 @@ def gini(df, col, w=None, negatives=None):
when this minimum is negative. That is, it adds the absolute
minimum value.
Defaults to None, which leaves negative values as they are.
:param groupby: Column, or list of columns, to group by.
:returns: A float, the Gini index, or a Series of Gini indices (one per group) if groupby is specified.
"""
# Requires float numpy arrays (not pandas Series or lists) to work.
x = np.array(df[col]).astype("float")
if negatives == "zero":
x[x < 0] = 0
if negatives == "shift" and np.amin(x) < 0:
x -= np.amin(x)
if w is not None:
w = np.array(df[w]).astype("float")
sorted_indices = np.argsort(x)
sorted_x = x[sorted_indices]
sorted_w = w[sorted_indices]
cumw = np.cumsum(sorted_w)
cumxw = np.cumsum(sorted_x * sorted_w)
return np.sum(cumxw[1:] * cumw[:-1] - cumxw[:-1] * cumw[1:]) / (
cumxw[-1] * cumw[-1]
)
else:
sorted_x = np.sort(x)
n = len(x)
cumxw = np.cumsum(sorted_x)
# The above formula, with all weights equal to 1 simplifies to:
return (n + 1 - 2 * np.sum(cumxw) / cumxw[-1]) / n


def top_x_pct_share(df, col, top_x_pct, w=None):

def _gini(df, col, w=None, negatives=None):
# Requires float numpy arrays (not pandas Series or lists) to work.
x = np.array(df[col]).astype("float")
if negatives == "zero":
x[x < 0] = 0
if negatives == "shift" and np.amin(x) < 0:
x -= np.amin(x)
if w is not None:
w = np.array(df[w]).astype("float")
sorted_indices = np.argsort(x)
sorted_x = x[sorted_indices]
sorted_w = w[sorted_indices]
cumw = np.cumsum(sorted_w)
cumxw = np.cumsum(sorted_x * sorted_w)
return np.sum(cumxw[1:] * cumw[:-1] - cumxw[:-1] * cumw[1:]) / (
cumxw[-1] * cumw[-1]
)
else:
sorted_x = np.sort(x)
n = len(x)
cumxw = np.cumsum(sorted_x)
# The above formula, with all weights equal to 1 simplifies to:
return (n + 1 - 2 * np.sum(cumxw) / cumxw[-1]) / n

if groupby is None:
return _gini(df, col, w, negatives)
return df.groupby(groupby).apply(lambda x: _gini(x, col, w, negatives))


def top_x_pct_share(df, col, top_x_pct, w=None, groupby=None):
"""Calculates top x% share.
:param df: DataFrame.
:param col: Name of column in df representing value.
:param top_x_pct: Decimal between 0 and 1 of the top %, e.g. 0.1, 0.001.
:param w: Column representing weight in df.
:param groupby: Column, or list of columns, to group by.
:returns: The share of w-weighted val held by the top x%.
"""
threshold = mdf.weighted_quantile(df, col, w, 1 - top_x_pct)
top_x_pct_sum = mdf.weighted_sum(df[df[col] >= threshold], col, w)
total_sum = mdf.weighted_sum(df, col, w)
return top_x_pct_sum / total_sum

def _top_x_pct_share(df, col, top_x_pct, w=None):
threshold = mdf.weighted_quantile(df, col, w, 1 - top_x_pct)
top_x_pct_sum = mdf.weighted_sum(df[df[col] >= threshold], col, w)
total_sum = mdf.weighted_sum(df, col, w)
return top_x_pct_sum / total_sum

if groupby is None:
return _top_x_pct_share(df, col, top_x_pct, w)
return df.groupby(groupby).apply(
lambda x: _top_x_pct_share(x, col, top_x_pct, w)
)


def bottom_x_pct_share(df, col, bottom_x_pct, w=None):
def bottom_x_pct_share(df, col, bottom_x_pct, w=None, groupby=None):
"""Calculates bottom x% share.
:param df: DataFrame.
:param col: Name of column in df representing value.
:param bottom_x_pct: Decimal between 0 and 1 of the bottom %, e.g. 0.1, 0.001.
:param w: Column representing weight in df.
:param groupby: Column, or list of columns, to group by.
:returns: The share of w-weighted val held by the bottom x%.
"""
return 1 - top_x_pct_share(df, col, 1 - bottom_x_pct, w, top=False)
return 1 - top_x_pct_share(df, col, 1 - bottom_x_pct, w, groupby)


def bottom_50_pct_share(df, col, w=None):
def bottom_50_pct_share(df, col, w=None, groupby=None):
"""Calculates bottom 50% share.
:param df: DataFrame.
:param col: Name of column in df representing value.
:param w: Column representing weight in df.
:param groupby: Column, or list of columns, to group by.
:returns: The share of w-weighted val held by the bottom 50%.
"""
return bottom_x_pct_share(df, col, 0.5, w)
return bottom_x_pct_share(df, col, 0.5, w, groupby)


def top_50_pct_share(df, col, w=None):
def top_50_pct_share(df, col, w=None, groupby=None):
"""Calculates top 50% share.
:param df: DataFrame.
:param col: Name of column in df representing value.
:param w: Column representing weight in df.
:param groupby: Column, or list of columns, to group by.
:returns: The share of w-weighted val held by the top 50%.
"""
return top_x_pct_share(df, col, 0.5, w)
return top_x_pct_share(df, col, 0.5, w, groupby)


def top_10_pct_share(df, col, w=None):
def top_10_pct_share(df, col, w=None, groupby=None):
"""Calculates top 10% share.
:param df: DataFrame.
:param col: Name of column in df representing value.
:param w: Column representing weight in df.
:param groupby: Column, or list of columns, to group by.
:returns: The share of w-weighted val held by the top 10%.
"""
return top_x_pct_share(df, col, 0.1, w)
return top_x_pct_share(df, col, 0.1, w, groupby)


def top_1_pct_share(df, col, w=None):
Expand All @@ -114,32 +134,37 @@ def top_1_pct_share(df, col, w=None):
:param df: DataFrame.
:param col: Name of column in df representing value.
:param w: Column representing weight in df.
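:param groupby: Column, or list of columns, to group by. NOTE(review): the signature shown for top_1_pct_share does not accept groupby, and its return does not forward it to top_x_pct_share; confirm and fix.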
:returns: The share of w-weighted val held by the top 1%.
"""
return top_x_pct_share(df, col, 0.01, w)


def top_0_1_pct_share(df, col, w=None):
def top_0_1_pct_share(df, col, w=None, groupby=None):
"""Calculates top 0.1% share.
:param df: DataFrame.
:param col: Name of column in df representing value.
:param w: Column representing weight in df.
:param groupby: Column, or list of columns, to group by.
:returns: The share of w-weighted val held by the top 0.1%.
"""
return top_x_pct_share(df, col, 0.001, w)
return top_x_pct_share(df, col, 0.001, w, groupby)


def t10_b50(df, col, w=None):
def t10_b50(df, col, w=None, groupby=None):
"""Calculates ratio between the top 10% and bottom 50% shares.
:param df: DataFrame.
:param col: Name of column in df representing value.
:param w: Column representing weight in df.
:param groupby: Column, or list of columns, to group by.
:returns: The share of w-weighted val held by the top 10% divided by
the share of w-weighted val held by the bottom 50%.
"""
return top_10_pct_share(df, col, w) / bottom_50_pct_share(df, col, w)
t10 = top_10_pct_share(df, col, w, groupby)
b50 = bottom_50_pct_share(df, col, w, groupby)
return t10 / b50
11 changes: 11 additions & 0 deletions microdf/tests/test_weighted.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,14 @@ def test_weighted_sum():
# Test grouped.
mdf.weighted_sum(dfg, "x", "w", "g")
mdf.weighted_sum(dfg, ["x", "y"], "w", "g")


def test_gini():
# Unweighted
mdf.gini(df, "x")
# Weighted
mdf.gini(df, "x", "w")
# Unweighted, grouped
mdf.gini(dfg, "x", groupby="g")
# Weighted, grouped
mdf.gini(dfg, "x", "w", groupby="g")

0 comments on commit 3a7bfb4

Please sign in to comment.