diff --git a/RELEASES.md b/RELEASES.md index 29156cb94..2371b4e10 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -4,13 +4,15 @@ Go [here](https://github.com/open-source-economics/Tax-Calculator/pulls?q=is%3Apr+is%3Aclosed) for a complete commit history. -Release 0.Y.Z on 2017-??-?? +Release 0.11.0 on 2017-??-?? ---------------------------- (last merged pull request is [#xxxx](https://github.com/open-source-economics/Tax-Calculator/pull/xxxx)) **API Changes** -- None +- Revise dropq distribution and difference tables used by TaxBrain + [[#1537](https://github.com/open-source-economics/Tax-Calculator/pull/1537) + by Anderson Frailey and Martin Holmer] **New Features** - None diff --git a/taxcalc/dropq/dropq.py b/taxcalc/dropq/dropq.py index 913e1511e..b649b21a9 100644 --- a/taxcalc/dropq/dropq.py +++ b/taxcalc/dropq/dropq.py @@ -1,6 +1,11 @@ """ The dropq functions are used by TaxBrain to call Tax-Calculator in order -to maintain the privacy of the micro data being used by TaxBrain. +to maintain the privacy of the IRS-SOI PUF data being used by TaxBrain. +This is done by "fuzzing" reform results for several randomly selected +filing units in each table cell. The filing units randomly selected +differ for each policy reform and the "fuzzing" involves replacing the +post-reform tax results for the selected units with their pre-reform +tax results. 
""" # CODING-STYLE CHECKS: # pep8 --ignore=E402 dropq.py @@ -20,7 +25,7 @@ # specify constants PLAN_COLUMN_TYPES = [float] * len(TABLE_LABELS) -DIFF_COLUMN_TYPES = [int, int, int, float, float, str, str, str] +DIFF_COLUMN_TYPES = [int, int, int, float, float, str, str, str, str] DECILE_ROW_NAMES = ['perc0-10', 'perc10-20', 'perc20-30', 'perc30-40', 'perc40-50', 'perc50-60', 'perc60-70', 'perc70-80', @@ -102,98 +107,111 @@ def run_nth_year_tax_calc_model(year_n, start_year, np.random.seed(seed) # pylint: disable=no-member # construct dropq summary results from raw results - (m2_dec, m1_dec, df_dec, pdf_dec, cdf_dec, - m2_bin, m1_bin, df_bin, pdf_bin, cdf_bin, - itax_sumd, ptax_sumd, comb_sumd, - itax_sum1, ptax_sum1, comb_sum1, - itax_sum2, ptax_sum2, comb_sum2) = dropq_summary(rawres1, rawres2, mask) - - # construct DataFrames containing selected summary results - totsd = [itax_sumd, ptax_sumd, comb_sumd] - fiscal_tots_diff = pd.DataFrame(data=totsd, index=TOTAL_ROW_NAMES) - - tots1 = [itax_sum1, ptax_sum1, comb_sum1] - fiscal_tots_baseline = pd.DataFrame(data=tots1, index=TOTAL_ROW_NAMES) - - tots2 = [itax_sum2, ptax_sum2, comb_sum2] - fiscal_tots_reform = pd.DataFrame(data=tots2, index=TOTAL_ROW_NAMES) - - # remove negative incomes from selected summary results - df_bin.drop(df_bin.index[0], inplace=True) - pdf_bin.drop(pdf_bin.index[0], inplace=True) - cdf_bin.drop(cdf_bin.index[0], inplace=True) - m2_bin.drop(m2_bin.index[0], inplace=True) - m1_bin.drop(m1_bin.index[0], inplace=True) + (dist1_dec, dist2_dec, + diff_itax_dec, diff_ptax_dec, diff_comb_dec, + dist1_bin, dist2_bin, + diff_itax_bin, diff_ptax_bin, diff_comb_bin, + aggr_itax_d, aggr_ptax_d, aggr_comb_d, + aggr_itax_1, aggr_ptax_1, aggr_comb_1, + aggr_itax_2, aggr_ptax_2, aggr_comb_2) = dropq_summary(rawres1, + rawres2, + mask) + + # construct DataFrames containing aggregate tax totals + # ... 
for reform-minus-baseline difference + aggrd = [aggr_itax_d, aggr_ptax_d, aggr_comb_d] + aggr_d = pd.DataFrame(data=aggrd, index=TOTAL_ROW_NAMES) + # ... for baseline + aggr1 = [aggr_itax_1, aggr_ptax_1, aggr_comb_1] + aggr_1 = pd.DataFrame(data=aggr1, index=TOTAL_ROW_NAMES) + # ... for reform + aggr2 = [aggr_itax_2, aggr_ptax_2, aggr_comb_2] + aggr_2 = pd.DataFrame(data=aggr2, index=TOTAL_ROW_NAMES) elapsed_time = time.time() - start_time print('elapsed time for this run: ', elapsed_time) - def append_year(tdf): + def append_year(pdf): """ - append_year embedded function + append_year embedded function revises all column names in pdf """ - tdf.columns = [str(col) + '_{}'.format(year_n) for col in tdf.columns] - return tdf + pdf.columns = [str(col) + '_{}'.format(year_n) for col in pdf.columns] + return pdf # optionally return non-JSON results if not return_json: - return (append_year(m2_dec), append_year(m1_dec), append_year(df_dec), - append_year(pdf_dec), append_year(cdf_dec), - append_year(m2_bin), append_year(m1_bin), append_year(df_bin), - append_year(pdf_bin), append_year(cdf_bin), - append_year(fiscal_tots_diff), - append_year(fiscal_tots_baseline), - append_year(fiscal_tots_reform)) - - # optionally construct JSON results - decile_row_names_i = [x + '_' + str(year_n) for x in DECILE_ROW_NAMES] - m2_dec_table_i = create_json_table(m2_dec, - row_names=decile_row_names_i, - column_types=PLAN_COLUMN_TYPES) - m1_dec_table_i = create_json_table(m1_dec, - row_names=decile_row_names_i, - column_types=PLAN_COLUMN_TYPES) - df_dec_table_i = create_json_table(df_dec, - row_names=decile_row_names_i, - column_types=DIFF_COLUMN_TYPES) - pdf_dec_table_i = create_json_table(pdf_dec, - row_names=decile_row_names_i, - column_types=DIFF_COLUMN_TYPES) - cdf_dec_table_i = create_json_table(cdf_dec, - row_names=decile_row_names_i, - column_types=DIFF_COLUMN_TYPES) - bin_row_names_i = [x + '_' + str(year_n) for x in BIN_ROW_NAMES] - m2_bin_table_i = create_json_table(m2_bin, - 
row_names=bin_row_names_i, - column_types=PLAN_COLUMN_TYPES) - m1_bin_table_i = create_json_table(m1_bin, - row_names=bin_row_names_i, - column_types=PLAN_COLUMN_TYPES) - df_bin_table_i = create_json_table(df_bin, - row_names=bin_row_names_i, - column_types=DIFF_COLUMN_TYPES) - pdf_bin_table_i = create_json_table(pdf_bin, - row_names=bin_row_names_i, - column_types=DIFF_COLUMN_TYPES) - cdf_bin_table_i = create_json_table(cdf_bin, - row_names=bin_row_names_i, - column_types=DIFF_COLUMN_TYPES) - total_row_names_i = [x + '_' + str(year_n) for x in TOTAL_ROW_NAMES] - fiscal_yr_total_df = create_json_table(fiscal_tots_diff, - row_names=total_row_names_i) - fiscal_yr_total_df = dict((k, v[0]) for k, v in fiscal_yr_total_df.items()) - fiscal_yr_total_bl = create_json_table(fiscal_tots_baseline, - row_names=total_row_names_i) - fiscal_yr_total_bl = dict((k, v[0]) for k, v in fiscal_yr_total_bl.items()) - fiscal_yr_total_rf = create_json_table(fiscal_tots_reform, - row_names=total_row_names_i) - fiscal_yr_total_rf = dict((k, v[0]) for k, v in fiscal_yr_total_rf.items()) + return (append_year(dist2_dec), + append_year(dist1_dec), + append_year(diff_itax_dec), + append_year(diff_ptax_dec), + append_year(diff_comb_dec), + append_year(dist2_bin), + append_year(dist1_bin), + append_year(diff_itax_bin), + append_year(diff_ptax_bin), + append_year(diff_comb_bin), + append_year(aggr_d), + append_year(aggr_1), + append_year(aggr_2)) + + # optionally construct JSON results tables for year n + dec_row_names_n = [x + '_' + str(year_n) for x in DECILE_ROW_NAMES] + dist2_dec_table_n = create_json_table(dist2_dec, + row_names=dec_row_names_n, + column_types=PLAN_COLUMN_TYPES) + dist1_dec_table_n = create_json_table(dist1_dec, + row_names=dec_row_names_n, + column_types=PLAN_COLUMN_TYPES) + diff_itax_dec_table_n = create_json_table(diff_itax_dec, + row_names=dec_row_names_n, + column_types=DIFF_COLUMN_TYPES) + diff_ptax_dec_table_n = create_json_table(diff_ptax_dec, + 
row_names=dec_row_names_n, + column_types=DIFF_COLUMN_TYPES) + diff_comb_dec_table_n = create_json_table(diff_comb_dec, + row_names=dec_row_names_n, + column_types=DIFF_COLUMN_TYPES) + bin_row_names_n = [x + '_' + str(year_n) for x in BIN_ROW_NAMES] + dist2_bin_table_n = create_json_table(dist2_bin, + row_names=bin_row_names_n, + column_types=PLAN_COLUMN_TYPES) + dist1_bin_table_n = create_json_table(dist1_bin, + row_names=bin_row_names_n, + column_types=PLAN_COLUMN_TYPES) + diff_itax_bin_table_n = create_json_table(diff_itax_bin, + row_names=bin_row_names_n, + column_types=DIFF_COLUMN_TYPES) + diff_ptax_bin_table_n = create_json_table(diff_ptax_bin, + row_names=bin_row_names_n, + column_types=DIFF_COLUMN_TYPES) + diff_comb_bin_table_n = create_json_table(diff_comb_bin, + row_names=bin_row_names_n, + column_types=DIFF_COLUMN_TYPES) + total_row_names_n = [x + '_' + str(year_n) for x in TOTAL_ROW_NAMES] + aggr_d_table_n = create_json_table(aggr_d, + row_names=total_row_names_n) + aggr_d_table_n = dict((k, v[0]) for k, v in aggr_d_table_n.items()) + aggr_1_table_n = create_json_table(aggr_1, + row_names=total_row_names_n) + aggr_1_table_n = dict((k, v[0]) for k, v in aggr_1_table_n.items()) + aggr_2_table_n = create_json_table(aggr_2, + row_names=total_row_names_n) + aggr_2_table_n = dict((k, v[0]) for k, v in aggr_2_table_n.items()) # return JSON results - return (m2_dec_table_i, m1_dec_table_i, df_dec_table_i, pdf_dec_table_i, - cdf_dec_table_i, m2_bin_table_i, m1_bin_table_i, df_bin_table_i, - pdf_bin_table_i, cdf_bin_table_i, fiscal_yr_total_df, - fiscal_yr_total_bl, fiscal_yr_total_rf) + return (dist2_dec_table_n, + dist1_dec_table_n, + diff_itax_dec_table_n, + diff_ptax_dec_table_n, + diff_comb_dec_table_n, + dist2_bin_table_n, + dist1_bin_table_n, + diff_itax_bin_table_n, + diff_ptax_bin_table_n, + diff_comb_bin_table_n, + aggr_d_table_n, + aggr_1_table_n, + aggr_2_table_n) def run_nth_year_gdp_elast_model(year_n, start_year, @@ -218,10 +236,10 @@ def 
run_nth_year_gdp_elast_model(year_n, start_year, # return gdp_effect results if return_json: gdp_df = pd.DataFrame(data=[gdp_effect], columns=['col0']) - gdp_elast_names_i = [x + '_' + str(year_n) + gdp_elast_names_n = [x + '_' + str(year_n) for x in GDP_ELAST_ROW_NAMES] gdp_elast_total = create_json_table(gdp_df, - row_names=gdp_elast_names_i, + row_names=gdp_elast_names_n, num_decimals=5) gdp_elast_total = dict((k, v[0]) for k, v in gdp_elast_total.items()) return gdp_elast_total diff --git a/taxcalc/dropq/dropq_utils.py b/taxcalc/dropq/dropq_utils.py index 848054f96..8646b2290 100644 --- a/taxcalc/dropq/dropq_utils.py +++ b/taxcalc/dropq/dropq_utils.py @@ -8,19 +8,13 @@ import copy import hashlib import numpy as np -import pandas as pd from taxcalc import (Policy, Records, Calculator, Consumption, Behavior, Growfactors, Growdiff) -from taxcalc.utils import (add_income_bins, add_weighted_income_bins, - means_and_comparisons, get_sums, - weighted, weighted_avg_allcols, - create_distribution_table, results, +from taxcalc.utils import (add_income_bins, add_quantile_bins, results, + create_difference_table, create_distribution_table, STATS_COLUMNS, TABLE_COLUMNS, WEBAPP_INCOME_BINS) -EPSILON = 1e-3 - - def check_years(start_year, year_n): """ Ensure start_year and year_n values are consistent with Policy constants. 
@@ -121,13 +115,13 @@ def dropq_calculate(year_n, start_year, calc1p.increment_year() calc1p.calc_all() assert calc1p.current_year == start_year - # compute mask that shows which of the calc1 and calc1p results differ - # mask is true if a filing unit's tax liability changed after a dollar - # was added to the filing unit's income + # compute mask showing which of the calc1 and calc1p results differ; + # mask is true if a filing unit's income tax liability changed after + # a dollar was added to the filing unit's wage and salary income res1 = results(calc1.records) res1p = results(calc1p.records) mask = np.logical_not( # pylint: disable=no-member - np.isclose(res1.iitax, res1p.iitax, atol=EPSILON, rtol=0.0) + np.isclose(res1.iitax, res1p.iitax, atol=0.001, rtol=0.0) ) else: mask = None @@ -210,286 +204,193 @@ def random_seed_from_subdict(subdict): return seed % np.iinfo(np.uint32).max # pylint: disable=no-member -NUM_TO_DROP = 3 +NUM_TO_FUZZ = 3 def chooser(agg): """ - This is a transformation function that should be called on each group. - It is assumed that the chunk 'agg' is a chunk of the 'mask' column. - This chooser selects NUM_TO_DROP of those mask indices with the output for - those NUM_TO_DROP indices being zero and the output for all the other - indices being one. + This is a transformation function that should be called on each group + (that is, each cell in a table). It is assumed that the chunk 'agg' is + a chunk of the 'mask' column. This chooser selects NUM_TO_FUZZ of those + mask indices with the output for those NUM_TO_FUZZ indices being zero and + the output for all the other indices being one. 
""" # select indices of records with change in tax liability after # a one dollar increase in income indices = np.where(agg) - if len(indices[0]) >= NUM_TO_DROP: + if len(indices[0]) >= NUM_TO_FUZZ: choices = np.random.choice(indices[0], # pylint: disable=no-member - size=NUM_TO_DROP, replace=False) + size=NUM_TO_FUZZ, replace=False) else: msg = ('Not enough differences in income tax when adding ' 'one dollar for chunk with name: {}') raise ValueError(msg.format(agg.name)) - # drop chosen records + # mark the records chosen to be fuzzed ans = [1] * len(agg) for idx in choices: ans[idx] = 0 return ans -def drop_records(df1, df2, mask): +def fuzz_df2_records(df1, df2, mask): """ - Modify df1 and df2 by adding statistical fuzz for data privacy. + Modify df2 by adding random fuzz for data privacy. Parameters ---------- df1: Pandas DataFrame - contains results for the standard plan X and X'. + contains results for the baseline plan df2: Pandas DataFrame - contains results for the user-specified plan (Plan Y). + contains results for the reform plan mask: boolean numpy array - contains info about whether or not each element of X and X' are same + contains info about whether or not each row might be fuzzed Returns ------- - fuzzed_df1: Pandas DataFrame - - fuzzed_df2: Pandas DataFrame + fuzzed df2: Pandas DataFrame Notes ----- This function groups both DataFrames based on the web application's - income groupings (both weighted decile and income bins), and then - pseudo-randomly picks NUM_TO_DROP records to 'drop' within each bin. - We keep track of the NUM_TO_DROP dropped records in both group-by - strategies and then use these 'flag' columns to modify all - columns of interest, creating new '_dec' columns for - statistics based on weighted deciles and '_bin' columns for - statitistics based on income bins. 
Lastly we calculate - individual income tax differences, payroll tax differences, and - combined tax differences between the baseline and reform - for the two groupings. + income groupings (both decile and income bins), and then randomly + selects NUM_TO_FUZZ records to fuzz within each bin. The fuzzing + involves overwriting df2 columns in cols_to_fuzz with df1 values. """ - # perform all statistics on (Y + X') - X - - # Group first + # nested function that does the fuzzing + def fuzz(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz): + """ + Fuzz some df2 records in each bin defined by bin_type and imeasure. + The fuzzed records have their post-reform tax results (in df2) + set to their pre-reform tax results (in df1). + """ + # pylint: disable=too-many-arguments + assert bin_type == 'dec' or bin_type == 'bin' or bin_type == 'agg' + if bin_type == 'dec': + df2 = add_quantile_bins(df2, imeasure, 10) + elif bin_type == 'bin': + df2 = add_income_bins(df2, imeasure, bins=WEBAPP_INCOME_BINS) + else: + df2 = add_quantile_bins(df2, imeasure, 1) + gdf2 = df2.groupby('bins') + df2['nofuzz'] = gdf2['mask'].transform(chooser) + for col in cols_to_fuzz: + df2[col + suffix] = (df2[col] * df2['nofuzz'] - + df1[col] * df2['nofuzz'] + df1[col]) + # main logic of fuzz_df2_records + cols_to_skip = set(['num_returns_ItemDed', 'num_returns_StandardDed', + 'num_returns_AMT', 's006']) + columns_to_fuzz = (set(TABLE_COLUMNS) | set(STATS_COLUMNS)) - cols_to_skip df2['mask'] = mask - df1['mask'] = mask - - df2 = add_weighted_income_bins(df2) - df1 = add_weighted_income_bins(df1) - gp2_dec = df2.groupby('bins') - - df2 = add_income_bins(df2, bins=WEBAPP_INCOME_BINS) - df1 = add_income_bins(df1, bins=WEBAPP_INCOME_BINS) - gp2_bin = df2.groupby('bins') - - # Transform to get the 'flag' column that marks dropped records in each bin - df2['flag_dec'] = gp2_dec['mask'].transform(chooser) - df2['flag_bin'] = gp2_bin['mask'].transform(chooser) - - # first calculate all of X' - 
columns_to_make_noisy = set(TABLE_COLUMNS) | set(STATS_COLUMNS) - # these don't exist yet - columns_to_make_noisy.remove('num_returns_ItemDed') - columns_to_make_noisy.remove('num_returns_StandardDed') - columns_to_make_noisy.remove('num_returns_AMT') - for col in columns_to_make_noisy: - df2[col + '_dec'] = (df2[col] * df2['flag_dec'] - - df1[col] * df2['flag_dec'] + df1[col]) - df2[col + '_bin'] = (df2[col] * df2['flag_bin'] - - df1[col] * df2['flag_bin'] + df1[col]) - - # Difference in plans - # Positive values are the magnitude of the tax increase - # Negative values are the magnitude of the tax decrease - df2['tax_diff_dec'] = df2['iitax_dec'] - df1['iitax'] - df2['tax_diff_bin'] = df2['iitax_bin'] - df1['iitax'] - df2['payrolltax_diff_dec'] = df2['payrolltax_dec'] - df1['payrolltax'] - df2['payrolltax_diff_bin'] = df2['payrolltax_bin'] - df1['payrolltax'] - df2['combined_diff_dec'] = df2['combined_dec'] - df1['combined'] - df2['combined_diff_bin'] = df2['combined_bin'] - df1['combined'] - - return df1, df2 + # always use expanded income in df1 baseline to groupby into bins + df2['expanded_income_baseline'] = df1['expanded_income'] + fuzz(df1, df2, 'dec', 'expanded_income_baseline', '_xdec', columns_to_fuzz) + fuzz(df1, df2, 'bin', 'expanded_income_baseline', '_xbin', columns_to_fuzz) + fuzz(df1, df2, 'agg', 'expanded_income_baseline', '_agg', columns_to_fuzz) + return df2 def dropq_summary(df1, df2, mask): """ - df1 contains raw results for the standard plan X and X' - df2 contains raw results the user-specified plan (Plan Y) - mask is the boolean mask where X and X' match + df1 contains raw results for baseline plan + df2 contains raw results for reform plan + mask is the boolean array specifying which rows might be fuzzed """ # pylint: disable=too-many-locals - df1, df2 = drop_records(df1, df2, mask) - - # Totals for diff between baseline and reform - dec_sum = (df2['tax_diff_dec'] * df2['s006']).sum() - bin_sum = (df2['tax_diff_bin'] * df2['s006']).sum() - 
pr_dec_sum = (df2['payrolltax_diff_dec'] * df2['s006']).sum() - pr_bin_sum = (df2['payrolltax_diff_bin'] * df2['s006']).sum() - combined_dec_sum = (df2['combined_diff_dec'] * df2['s006']).sum() - combined_bin_sum = (df2['combined_diff_bin'] * df2['s006']).sum() - - # Totals for baseline - sum_baseline = (df1['iitax'] * df1['s006']).sum() - pr_sum_baseline = (df1['payrolltax'] * df1['s006']).sum() - combined_sum_baseline = (df1['combined'] * df1['s006']).sum() - - # Totals for reform - sum_reform = (df2['iitax_dec'] * df2['s006']).sum() - pr_sum_reform = (df2['payrolltax_dec'] * df2['s006']).sum() - combined_sum_reform = (df2['combined_dec'] * df2['s006']).sum() - - # Create difference tables, grouped by deciles and bins - diffs_dec = dropq_diff_table(df1, df2, - groupby='weighted_deciles', - res_col='tax_diff', - diff_col='iitax', - suffix='_dec', wsum=dec_sum) - - diffs_bin = dropq_diff_table(df1, df2, - groupby='webapp_income_bins', - res_col='tax_diff', - diff_col='iitax', - suffix='_bin', wsum=bin_sum) - - pr_diffs_dec = dropq_diff_table(df1, df2, - groupby='weighted_deciles', - res_col='payrolltax_diff', - diff_col='payrolltax', - suffix='_dec', wsum=pr_dec_sum) - - pr_diffs_bin = dropq_diff_table(df1, df2, - groupby='webapp_income_bins', - res_col='payrolltax_diff', - diff_col='payrolltax', - suffix='_bin', wsum=pr_bin_sum) - - comb_diffs_dec = dropq_diff_table(df1, df2, - groupby='weighted_deciles', - res_col='combined_diff', - diff_col='combined', - suffix='_dec', wsum=combined_dec_sum) - - comb_diffs_bin = dropq_diff_table(df1, df2, - groupby='webapp_income_bins', - res_col='combined_diff', - diff_col='combined', - suffix='_bin', wsum=combined_bin_sum) - - m1_dec = create_distribution_table(df1, groupby='weighted_deciles', - result_type='weighted_sum') - - m2_dec = dropq_dist_table(df2, groupby='weighted_deciles', - result_type='weighted_sum', suffix='_dec') - - m1_bin = create_distribution_table(df1, groupby='webapp_income_bins', - 
result_type='weighted_sum') - - m2_bin = dropq_dist_table(df2, groupby='webapp_income_bins', - result_type='weighted_sum', suffix='_bin') - - return (m2_dec, m1_dec, diffs_dec, pr_diffs_dec, comb_diffs_dec, - m2_bin, m1_bin, diffs_bin, pr_diffs_bin, comb_diffs_bin, - dec_sum, pr_dec_sum, combined_dec_sum, - sum_baseline, pr_sum_baseline, combined_sum_baseline, - sum_reform, pr_sum_reform, combined_sum_reform) - - -def dropq_diff_table(df1, df2, groupby, res_col, diff_col, suffix, wsum): - """ - Create and return dropq difference table. - """ - # pylint: disable=too-many-arguments,too-many-locals - if groupby == "weighted_deciles": - gdf = add_weighted_income_bins(df2, num_bins=10) - elif groupby == "small_income_bins": - gdf = add_income_bins(df2, compare_with="soi") - elif groupby == "large_income_bins": - gdf = add_income_bins(df2, compare_with="tpc") - elif groupby == "webapp_income_bins": - gdf = add_income_bins(df2, compare_with="webapp") - else: - err = ("groupby must be either 'weighted_deciles' or " - "'small_income_bins' or 'large_income_bins' or " - "'webapp_income_bins'") - raise ValueError(err) - # Difference in plans - # Positive values are the magnitude of the tax increase - # Negative values are the magnitude of the tax decrease - df2[res_col + suffix] = df2[diff_col + suffix] - df1[diff_col] - diffs = means_and_comparisons(res_col + suffix, - gdf.groupby('bins', as_index=False), - wsum + EPSILON) - sum_row = get_sums(diffs)[diffs.columns] - diffs = diffs.append(sum_row) # pylint: disable=redefined-variable-type - pd.options.display.float_format = '{:8,.0f}'.format - srs_inc = ["{0:.2f}%".format(val * 100) for val in diffs['perc_inc']] - diffs['perc_inc'] = pd.Series(srs_inc, index=diffs.index) - srs_cut = ["{0:.2f}%".format(val * 100) for val in diffs['perc_cut']] - diffs['perc_cut'] = pd.Series(srs_cut, index=diffs.index) - srs_change = ["{0:.2f}%".format(val * 100) for val in - diffs['share_of_change']] - diffs['share_of_change'] = 
pd.Series(srs_change, index=diffs.index) - # columns containing weighted values relative to the binning mechanism - non_sum_cols = [x for x in diffs.columns if 'mean' in x or 'perc' in x] - for col in non_sum_cols: - diffs.loc['sums', col] = 'n/a' - return diffs - - -def dropq_dist_table(resdf, groupby, result_type, suffix): - """ - Create and return dropq distribution table. - """ - # pylint: disable=too-many-locals - res = resdf - c04470_s = 'c04470' + suffix - c00100_s = 'c00100' + suffix - c09600_s = 'c09600' + suffix - standard_s = 'standard' + suffix - s006_s = 's006' + suffix - returns_ided_s = 'num_returns_ItemDed' + suffix - returns_sded_s = 'num_returns_StandardDed' + suffix - returns_amt_s = 'num_returns_AMT' + suffix - res[c04470_s] = res[c04470_s].where(((res[c00100_s] > 0) & - (res[c04470_s] > res[standard_s])), 0) - res[returns_ided_s] = res[s006_s].where(((res[c00100_s] > 0) & - (res[c04470_s] > 0)), 0) - res[returns_sded_s] = res[s006_s].where(((res[c00100_s] > 0) & - (res[standard_s] > 0)), 0) - res[returns_amt_s] = res[s006_s].where(res[c09600_s] > 0, 0) - if groupby == "weighted_deciles": - dframe = add_weighted_income_bins(res, num_bins=10) - elif groupby == "small_income_bins": - dframe = add_income_bins(res, compare_with="soi") - elif groupby == "large_income_bins": - dframe = add_income_bins(res, compare_with="tpc") - elif groupby == "webapp_income_bins": - dframe = add_income_bins(res, compare_with="webapp") - else: - err = ("groupby must be either 'weighted_deciles' or " - "'small_income_bins' or 'large_income_bins' or " - "'webapp_income_bins'") - raise ValueError(err) - pd.options.display.float_format = '{:8,.0f}'.format - if result_type == "weighted_sum": - dframe = weighted(dframe, [col + suffix for col in STATS_COLUMNS]) - gby_bins = dframe.groupby('bins', as_index=False) - gp_mean = gby_bins[[col + suffix for col in TABLE_COLUMNS]].sum() - gp_mean.drop('bins', axis=1, inplace=True) - sum_row = get_sums(dframe)[[col + suffix for col 
in TABLE_COLUMNS]] - elif result_type == "weighted_avg": - gp_mean = weighted_avg_allcols(dframe, - [col + suffix for col in TABLE_COLUMNS]) - all_sums = get_sums(dframe, not_available=True) - sum_row = all_sums[[col + suffix for col in TABLE_COLUMNS]] - else: - err = ("result_type must be either 'weighted_sum' or " - "'weighted_avg'") - raise ValueError(err) - return gp_mean.append(sum_row) + df2 = fuzz_df2_records(df1, df2, mask) + + # tax difference totals between reform and baseline + tdiff = df2['iitax_agg'] - df1['iitax'] + aggr_itax_d = (tdiff * df2['s006']).sum() + tdiff = df2['payrolltax_agg'] - df1['payrolltax'] + aggr_ptax_d = (tdiff * df2['s006']).sum() + tdiff = df2['combined_agg'] - df1['combined'] + aggr_comb_d = (tdiff * df2['s006']).sum() + + # totals for baseline + aggr_itax_1 = (df1['iitax'] * df1['s006']).sum() + aggr_ptax_1 = (df1['payrolltax'] * df1['s006']).sum() + aggr_comb_1 = (df1['combined'] * df1['s006']).sum() + + # totals for reform + aggr_itax_2 = (df2['iitax_agg'] * df2['s006']).sum() + aggr_ptax_2 = (df2['payrolltax_agg'] * df2['s006']).sum() + aggr_comb_2 = (df2['combined_agg'] * df2['s006']).sum() + + # create difference tables grouped by deciles and bins + df2['iitax'] = df2['iitax_xdec'] + diff_itax_dec = create_difference_table(df1, df2, + groupby='weighted_deciles', + income_measure='expanded_income', + tax_to_diff='iitax') + df2['payrolltax'] = df2['payrolltax_xdec'] + diff_ptax_dec = create_difference_table(df1, df2, + groupby='weighted_deciles', + income_measure='expanded_income', + tax_to_diff='payrolltax') + df2['combined'] = df2['combined_xdec'] + diff_comb_dec = create_difference_table(df1, df2, + groupby='weighted_deciles', + income_measure='expanded_income', + tax_to_diff='combined') + df2['iitax'] = df2['iitax_xbin'] + diff_itax_bin = create_difference_table(df1, df2, + groupby='webapp_income_bins', + income_measure='expanded_income', + tax_to_diff='iitax') + df2['payrolltax'] = df2['payrolltax_xbin'] + diff_ptax_bin 
= create_difference_table(df1, df2, + groupby='webapp_income_bins', + income_measure='expanded_income', + tax_to_diff='payrolltax') + df2['combined'] = df2['combined_xbin'] + diff_comb_bin = create_difference_table(df1, df2, + groupby='webapp_income_bins', + income_measure='expanded_income', + tax_to_diff='combined') + + # create distribution tables grouped by deciles and bins + dist1_dec = create_distribution_table(df1, groupby='weighted_deciles', + income_measure='expanded_income', + result_type='weighted_sum') + dist1_bin = create_distribution_table(df1, groupby='webapp_income_bins', + income_measure='expanded_income', + result_type='weighted_sum') + suffix = '_xdec' + df2_cols_with_suffix = [c for c in list(df2) if c.endswith(suffix)] + for col in df2_cols_with_suffix: + root_col_name = col.replace(suffix, '') + df2[root_col_name] = df2[col] + df2['expanded_income_baseline'] = df1['expanded_income'] + dist2_dec = \ + create_distribution_table(df2, groupby='weighted_deciles', + income_measure='expanded_income_baseline', + result_type='weighted_sum') + suffix = '_xbin' + df2_cols_with_suffix = [c for c in list(df2) if c.endswith(suffix)] + for col in df2_cols_with_suffix: + root_col_name = col.replace(suffix, '') + df2[root_col_name] = df2[col] + df2['expanded_income_baseline'] = df1['expanded_income'] + dist2_bin = \ + create_distribution_table(df2, groupby='webapp_income_bins', + income_measure='expanded_income_baseline', + result_type='weighted_sum') + + # remove negative-income bin from each bin result + dist1_bin.drop(dist1_bin.index[0], inplace=True) + dist2_bin.drop(dist2_bin.index[0], inplace=True) + diff_itax_bin.drop(diff_itax_bin.index[0], inplace=True) + diff_ptax_bin.drop(diff_ptax_bin.index[0], inplace=True) + diff_comb_bin.drop(diff_comb_bin.index[0], inplace=True) + + # return tuple of summary results + return (dist1_dec, dist2_dec, + diff_itax_dec, diff_ptax_dec, diff_comb_dec, + dist1_bin, dist2_bin, + diff_itax_bin, diff_ptax_bin, diff_comb_bin, + 
aggr_itax_d, aggr_ptax_d, aggr_comb_d, + aggr_itax_1, aggr_ptax_1, aggr_comb_1, + aggr_itax_2, aggr_ptax_2, aggr_comb_2) diff --git a/taxcalc/reforms/earnings_shifting.py b/taxcalc/reforms/earnings_shifting.py index 1dfaf0594..8039f3c3c 100644 --- a/taxcalc/reforms/earnings_shifting.py +++ b/taxcalc/reforms/earnings_shifting.py @@ -286,9 +286,8 @@ def write_decile_table(dfx): Write to stdout the distributional table specified in dfx DataFrame. """ # create expanded-income decile information - dfx = add_weighted_income_bins(dfx, num_bins=10, - income_measure='expanded_income', - weight_by_income_measure=False) + dfx = add_quantile_bins(dfx, 'expanded_income', 10, + weight_by_income_measure=False) gdfx = dfx.groupby('bins', as_index=False) rtns_series = gdfx.apply(unweighted_sum, 's006') xinc_series = gdfx.apply(weighted_sum, 'expanded_income') diff --git a/taxcalc/taxcalcio.py b/taxcalc/taxcalcio.py index 964fa07d7..e0cff28ec 100644 --- a/taxcalc/taxcalcio.py +++ b/taxcalc/taxcalcio.py @@ -21,7 +21,7 @@ from taxcalc.utils import (delete_file, ce_aftertax_income, atr_graph_data, mtr_graph_data, xtr_graph_plot, write_graph_file, - add_weighted_income_bins, + add_quantile_bins, unweighted_sum, weighted_sum) @@ -442,9 +442,8 @@ def write_decile_table(dfx, tfile, tkind='Totals'): """ Write to tfile the tkind decile table using dfx DataFrame. 
""" - dfx = add_weighted_income_bins(dfx, num_bins=10, - income_measure='expanded_income', - weight_by_income_measure=False) + dfx = add_quantile_bins(dfx, 'expanded_income', 10, + weight_by_income_measure=False) gdfx = dfx.groupby('bins', as_index=False) rtns_series = gdfx.apply(unweighted_sum, 's006') xinc_series = gdfx.apply(weighted_sum, 'expanded_income') diff --git a/taxcalc/tests/test_calculate.py b/taxcalc/tests/test_calculate.py index e3b844d05..3be32d134 100644 --- a/taxcalc/tests/test_calculate.py +++ b/taxcalc/tests/test_calculate.py @@ -184,10 +184,12 @@ def test_Calculator_create_distribution_table(cps_subsample): 'Combined Payroll and Individual Income Tax Liabilities'] dt1 = create_distribution_table(calc.records, groupby="weighted_deciles", + income_measure='expanded_income', result_type="weighted_sum") dt1.columns = dist_labels dt2 = create_distribution_table(calc.records, groupby="small_income_bins", + income_measure='expanded_income', result_type="weighted_avg") assert isinstance(dt1, pd.DataFrame) assert isinstance(dt2, pd.DataFrame) @@ -265,7 +267,9 @@ def test_Calculator_create_difference_table(cps_subsample): calc2.calc_all() # create difference table and check that it is a Pandas DataFrame dtable = create_difference_table(calc1.records, calc2.records, - groupby="weighted_deciles") + groupby='weighted_deciles', + income_measure='expanded_income', + tax_to_diff='payrolltax') assert isinstance(dtable, pd.DataFrame) diff --git a/taxcalc/tests/test_dropq.py b/taxcalc/tests/test_dropq.py index 5cf9f0f1a..c8792e9ca 100644 --- a/taxcalc/tests/test_dropq.py +++ b/taxcalc/tests/test_dropq.py @@ -2,15 +2,12 @@ test_dropq.py uses only PUF input data because the dropq algorithm is designed to work exclusively with private IRS-SOI PUF input data. 
""" -import os -import six import numpy as np import pandas as pd import pytest from taxcalc.dropq.dropq_utils import * from taxcalc.dropq import * from taxcalc import (Policy, Records, Calculator, - create_difference_table, multiyear_diagnostic_table, results) @@ -77,8 +74,7 @@ def test_run_nth_year_value_errors(puf_subsample): @pytest.mark.requires_pufcsv @pytest.mark.parametrize('resjson', [True, False]) def test_run_tax_calc_model(puf_subsample, resjson): - usermods = USER_MODS - res = run_nth_year_tax_calc_model(2, 2016, puf_subsample, usermods, + res = run_nth_year_tax_calc_model(2, 2016, puf_subsample, USER_MODS, return_json=resjson) assert len(res) == 13 dump = False # set to True in order to dump returned results and fail test @@ -154,61 +150,6 @@ def test_create_json_table(): create_json_table(dframe) -@pytest.mark.requires_pufcsv -@pytest.mark.parametrize('groupby, result_type', - [('small_income_bins', 'weighted_sum'), - ('large_income_bins', 'weighted_sum'), - ('large_income_bins', 'weighted_avg'), - ('other_income_bins', 'weighted_avg'), - ('large_income_bins', 'other_avg')]) -def test_dropq_dist_table(groupby, result_type, puf_subsample): - calc = Calculator(policy=Policy(), records=Records(data=puf_subsample)) - calc.calc_all() - res = results(calc.records) - mask = np.ones(len(res.index)) - (res, _) = drop_records(res, res, mask) - if groupby == 'other_income_bins' or result_type == 'other_avg': - with pytest.raises(ValueError): - dropq_dist_table(res, groupby=groupby, - result_type=result_type, suffix='_bin') - else: - dropq_dist_table(res, groupby=groupby, - result_type=result_type, suffix='_bin') - - -@pytest.mark.requires_pufcsv -@pytest.mark.parametrize('groupby, res_column', - [('weighted_deciles', 'tax_diff'), - ('webapp_income_bins', 'tax_diff'), - ('small_income_bins', 'tax_diff'), - ('large_income_bins', 'tax_diff'), - ('other_deciles', 'tax_diff')]) -def test_dropq_diff_table(groupby, res_column, puf_subsample): - recs1 = 
Records(data=puf_subsample) - calc1 = Calculator(policy=Policy(), records=recs1) - recs2 = Records(data=puf_subsample) - pol2 = Policy() - pol2.implement_reform(USER_MODS['policy']) - calc2 = Calculator(policy=pol2, records=recs2) - calc1.calc_all() - calc2.calc_all() - res1 = results(calc1.records) - res2 = results(calc2.records) - assert len(res1.index) == len(res2.index) - mask = np.ones(len(res1.index)) - (res1, res2) = drop_records(res1, res2, mask) - dec_sum = (res2['tax_diff_dec'] * res2['s006']).sum() - if groupby == 'other_deciles': - with pytest.raises(ValueError): - dropq_diff_table(res1, res2, groupby=groupby, - res_col=res_column, diff_col='iitax', - suffix='_dec', wsum=dec_sum) - else: - dropq_diff_table(res1, res2, groupby=groupby, - res_col=res_column, diff_col='iitax', - suffix='_dec', wsum=dec_sum) - - @pytest.mark.requires_pufcsv def test_with_pufcsv(puf_fullsample): # specify usermods dictionary in code @@ -285,71 +226,3 @@ def test_reform_warnings_errors(): msg_dict = reform_warnings_errors(bad2_mods) assert len(msg_dict['warnings']) == 0 assert len(msg_dict['errors']) > 0 - - -@pytest.mark.requires_pufcsv -def test_dropq_diff_vs_util_diff(puf_subsample): - recs1 = Records(data=puf_subsample) - calc1 = Calculator(policy=Policy(), records=recs1) - recs2 = Records(data=puf_subsample) - pol2 = Policy() - pol2.implement_reform(USER_MODS['policy']) - calc2 = Calculator(policy=pol2, records=recs2) - calc1.advance_to_year(2016) - calc2.advance_to_year(2016) - calc1.calc_all() - calc2.calc_all() - # generate diff table using utility function - udf = create_difference_table(calc1.records, calc2.records, - groupby='weighted_deciles', - income_measure='expanded_income', - tax_to_present='iitax') - assert isinstance(udf, pd.DataFrame) - # generate diff table using dropq functions without dropping any records - res1 = results(calc1.records) - res2 = results(calc2.records) - res2['iitax_dec'] = res2['iitax'] # TODO: ??? drop ??? 
- res2['tax_diff_dec'] = res2['iitax'] - res1['iitax'] # TODO: ??? drop ??? - qdf = dropq_diff_table(res1, res2, - groupby='weighted_deciles', - res_col='tax_diff', - diff_col='iitax', - suffix='_dec', - wsum=(res2['tax_diff_dec'] * res2['s006']).sum()) - assert isinstance(qdf, pd.DataFrame) - # check that each element in the two DataFrames are the same - if 'aftertax_perc' not in list(qdf): - qdf = qdf.assign(aftertax_perc=['-0.00%', - '0.00%', - '0.00%', - '0.00%', - '0.00%', - '0.00%', - '0.34%', - '0.90%', - '1.51%', - '2.69%', - 'n/a']) - assert udf.shape[0] == qdf.shape[0] # same number of rows - assert udf.shape[1] == qdf.shape[1] # same number of cols - for col in list(qdf): - for row in range(0, qdf.shape[0]): - same = False - qel_str_type = isinstance(qdf[col][row], six.string_types) - uel_str_type = isinstance(udf[col][row], six.string_types) - assert qel_str_type == uel_str_type - if qel_str_type: - same = qdf[col][row] == udf[col][row] - else: - qel_flt_type = isinstance(qdf[col][row], float) - uel_flt_type = isinstance(udf[col][row], float) - assert qel_flt_type == uel_flt_type - if qel_flt_type: - same = np.allclose([qdf[col][row]], [udf[col][row]]) - if not same: - msg = '{} {} : [{}] {} [{}] {}'.format(col, row, - qdf[col][row], - type(qdf[col][row]), - udf[col][row], - type(udf[col][row])) - assert msg == 'qdf element not equal to udf element' diff --git a/taxcalc/tests/test_utils.py b/taxcalc/tests/test_utils.py index 31b28fd18..be5f46b74 100644 --- a/taxcalc/tests/test_utils.py +++ b/taxcalc/tests/test_utils.py @@ -21,8 +21,8 @@ weighted_count, weighted_sum, weighted_mean, wage_weighted, agi_weighted, expanded_income_weighted, - weighted_perc_inc, weighted_perc_dec, - add_income_bins, add_weighted_income_bins, + weighted_perc_inc, weighted_perc_cut, + add_income_bins, add_quantile_bins, multiyear_diagnostic_table, mtr_graph_data, atr_graph_data, xtr_graph_plot, write_graph_file, @@ -72,7 +72,9 @@ def test_create_tables(cps_subsample): # test 
creating various difference tables diff = create_difference_table(calc1.records, calc2.records, - groupby='large_income_bins') + groupby='large_income_bins', + income_measure='expanded_income', + tax_to_diff='combined') assert isinstance(diff, pd.DataFrame) expected = ['0.00%', '0.01%', @@ -85,10 +87,12 @@ def test_create_tables(cps_subsample): '0.78%', '0.27%', 'n/a'] - assert np.array_equal(diff['aftertax_perc'], expected) + assert np.array_equal(diff['perc_aftertax'], expected) diff = create_difference_table(calc1.records, calc2.records, - groupby='webapp_income_bins') + groupby='webapp_income_bins', + income_measure='expanded_income', + tax_to_diff='iitax') assert isinstance(diff, pd.DataFrame) expected = ['0.00%', '0.01%', @@ -103,10 +107,12 @@ def test_create_tables(cps_subsample): '0.08%', '0.07%', 'n/a'] - assert np.array_equal(diff['aftertax_perc'], expected) + assert np.array_equal(diff['perc_aftertax'], expected) diff = create_difference_table(calc1.records, calc2.records, - groupby='small_income_bins') + groupby='small_income_bins', + income_measure='expanded_income', + tax_to_diff='iitax') assert isinstance(diff, pd.DataFrame) expected = ['0.00%', '0.01%', @@ -128,10 +134,12 @@ def test_create_tables(cps_subsample): '0.02%', '0.00%', 'n/a'] - assert np.array_equal(diff['aftertax_perc'], expected) + assert np.array_equal(diff['perc_aftertax'], expected) diff = create_difference_table(calc1.records, calc2.records, - groupby='weighted_deciles') + groupby='weighted_deciles', + income_measure='expanded_income', + tax_to_diff='combined') assert isinstance(diff, pd.DataFrame) expected = ['0.00%', '0.02%', @@ -144,38 +152,103 @@ def test_create_tables(cps_subsample): '0.91%', '0.50%', 'n/a'] - assert np.array_equal(diff['aftertax_perc'], expected) + assert np.array_equal(diff['perc_aftertax'], expected) with pytest.raises(ValueError): create_difference_table(calc1.records, calc2.records, - groupby='bad_bins') + groupby='bad_bins', + 
income_measure='expanded_income', + tax_to_diff='iitax') # test creating various distribution tables + dist = create_distribution_table(calc2.records, + groupby='weighted_deciles', + income_measure='expanded_income', + result_type='weighted_sum') + assert isinstance(dist, pd.DataFrame) + expected = [-8851215, + -99666120, + -123316561, + -85895787, + -47357458, + 207462144, + 443391189, + 978487989, + 1709504845, + 7631268907, + 10605027933] + assert np.allclose(dist['iitax'], expected, + atol=0.5, rtol=0.0) + expected = [1202, + 1688, + 13506, + 18019, + 30130, + 48244, + 80994, + 112788, + 131260, + 146001, + 583832] + assert np.allclose(dist['num_returns_ItemDed'].tolist(), expected, + atol=0.5, rtol=0.0) + + dist = create_distribution_table(calc2.records, + groupby='webapp_income_bins', + income_measure='expanded_income', + result_type='weighted_sum') + assert isinstance(dist, pd.DataFrame) + expected = [-103274, + -83144506, + -152523834, + -129881470, + 85802556, + 255480678, + 832529135, + 1066963515, + 3023956558, + 2876331264, + 1008672459, + 1820944852, + 10605027933] + assert np.allclose(dist['iitax'], expected, + atol=0.5, rtol=0.0) + expected = [0, + 1202, + 22654, + 31665, + 30547, + 49851, + 124786, + 97349, + 160147, + 56806, + 5803, + 3023, + 583832] + assert np.allclose(dist['num_returns_ItemDed'].tolist(), expected, + atol=0.5, rtol=0.0) + + setattr(calc2.records, 'expanded_income_baseline', + getattr(calc2.records, 'expanded_income')) + dist = create_distribution_table(calc2.records, + groupby='webapp_income_bins', + income_measure='expanded_income_baseline', + result_type='weighted_sum') + assert isinstance(dist, pd.DataFrame) + with pytest.raises(ValueError): create_distribution_table(calc2.records, groupby='small_income_bins', + income_measure='expanded_income', result_type='bad_result_type') with pytest.raises(ValueError): create_distribution_table(calc2.records, groupby='bad_bins', + income_measure='expanded_income', 
result_type='weighted_sum') - dist = create_distribution_table(calc2.records, - groupby='small_income_bins', - result_type='weighted_sum', - baseline_obj=calc1.records, diffs=True) - assert isinstance(dist, pd.DataFrame) - calc1.increment_year() - with pytest.raises(ValueError): - create_difference_table(calc1.records, calc2.records, - groupby='large_income_bins') - with pytest.raises(ValueError): - create_distribution_table(calc2.records, - groupby='small_income_bins', - result_type='weighted_sum', - baseline_obj=calc1.records, diffs=True) - def test_weighted_count_lt_zero(): df1 = pd.DataFrame(data=DATA, columns=['tax_diff', 's006', 'label']) @@ -262,10 +335,10 @@ def test_weighted_perc_inc(): pd.util.testing.assert_series_equal(exp, diffs) -def test_weighted_perc_dec(): +def test_weighted_perc_cut(): dfx = pd.DataFrame(data=DATA, columns=['tax_diff', 's006', 'label']) grouped = dfx.groupby('label') - diffs = grouped.apply(weighted_perc_dec, 'tax_diff') + diffs = grouped.apply(weighted_perc_cut, 'tax_diff') exp = pd.Series(data=[4. 
/ 12., 0.0], index=['a', 'b']) exp.index.name = 'label' pd.util.testing.assert_series_equal(exp, diffs) @@ -277,16 +350,18 @@ def test_weighted_perc_dec(): def test_add_income_bins(): dta = np.arange(1, 1e6, 5000) dfx = pd.DataFrame(data=dta, columns=['expanded_income']) - bins = [-1e99, 0, 9999, 19999, 29999, 39999, 49999, 74999, 99999, - 200000, 1e99] - dfr = add_income_bins(dfx, compare_with='tpc', bins=None) + bins = [-9e99, 0, 9999, 19999, 29999, 39999, 49999, 74999, 99999, + 200000, 9e99] + dfr = add_income_bins(dfx, 'expanded_income', bin_type='tpc', bins=None, + right=True) groupedr = dfr.groupby('bins') idx = 1 for name, _ in groupedr: assert name.closed == 'right' assert abs(name.right - bins[idx]) < EPSILON idx += 1 - dfl = add_income_bins(dfx, compare_with='tpc', bins=None, right=False) + dfl = add_income_bins(dfx, 'expanded_income', bin_type='tpc', bins=None, + right=False) groupedl = dfl.groupby('bins') idx = 1 for name, _ in groupedl: @@ -298,17 +373,17 @@ def test_add_income_bins(): def test_add_income_bins_soi(): dta = np.arange(1, 1e6, 5000) dfx = pd.DataFrame(data=dta, columns=['expanded_income']) - bins = [-1e99, 0, 4999, 9999, 14999, 19999, 24999, 29999, 39999, + bins = [-9e99, 0, 4999, 9999, 14999, 19999, 24999, 29999, 39999, 49999, 74999, 99999, 199999, 499999, 999999, 1499999, - 1999999, 4999999, 9999999, 1e99] - dfr = add_income_bins(dfx, compare_with='soi', bins=None) + 1999999, 4999999, 9999999, 9e99] + dfr = add_income_bins(dfx, 'expanded_income', bin_type='soi', right=True) groupedr = dfr.groupby('bins') idx = 1 for name, _ in groupedr: assert name.closed == 'right' assert abs(name.right - bins[idx]) < EPSILON idx += 1 - dfl = add_income_bins(dfx, compare_with='soi', bins=None, right=False) + dfl = add_income_bins(dfx, 'expanded_income', bin_type='soi', right=False) groupedl = dfl.groupby('bins') idx = 1 for name, _ in groupedl: @@ -320,15 +395,15 @@ def test_add_income_bins_soi(): def test_add_exp_income_bins(): dta = np.arange(1, 1e6, 
5000) dfx = pd.DataFrame(data=dta, columns=['expanded_income']) - bins = [-1e99, 0, 4999, 9999, 14999, 19999, 29999, 32999, 43999, 1e99] - dfr = add_income_bins(dfx, bins=bins) + bins = [-9e99, 0, 4999, 9999, 14999, 19999, 29999, 32999, 43999, 9e99] + dfr = add_income_bins(dfx, 'expanded_income', bins=bins, right=True) groupedr = dfr.groupby('bins') idx = 1 for name, _ in groupedr: assert name.closed == 'right' assert abs(name.right - bins[idx]) < EPSILON idx += 1 - dfl = add_income_bins(dfx, bins=bins, right=False) + dfl = add_income_bins(dfx, 'expanded_income', bins=bins, right=False) groupedl = dfl.groupby('bins') idx = 1 for name, _ in groupedl: @@ -341,21 +416,24 @@ def test_add_income_bins_raises(): dta = np.arange(1, 1e6, 5000) dfx = pd.DataFrame(data=dta, columns=['expanded_income']) with pytest.raises(ValueError): - dfx = add_income_bins(dfx, compare_with='stuff') + dfx = add_income_bins(dfx, 'expanded_income', bin_type='stuff') -def test_add_weighted_income_bins(): +def test_add_quantile_bins(): dfx = pd.DataFrame(data=DATA, columns=['expanded_income', 's006', 'label']) - dfb = add_weighted_income_bins(dfx, num_bins=100) + dfb = add_quantile_bins(dfx, 'expanded_income', 100, + weight_by_income_measure=False) bin_labels = dfb['bins'].unique() default_labels = set(range(1, 101)) for lab in bin_labels: assert lab in default_labels # custom labels - dfb = add_weighted_income_bins(dfx, weight_by_income_measure=True) + dfb = add_quantile_bins(dfx, 'expanded_income', 100, + weight_by_income_measure=True) assert 'bins' in dfb custom_labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'] - dfb = add_weighted_income_bins(dfx, labels=custom_labels) + dfb = add_quantile_bins(dfx, 'expanded_income', 10, + labels=custom_labels) assert 'bins' in dfb bin_labels = dfb['bins'].unique() for lab in bin_labels: @@ -368,13 +446,16 @@ def test_dist_table_sum_row(cps_subsample): calc.calc_all() tb1 = create_distribution_table(calc.records, groupby='small_income_bins', + 
income_measure='expanded_income', result_type='weighted_sum') tb2 = create_distribution_table(calc.records, groupby='large_income_bins', + income_measure='expanded_income', result_type='weighted_sum') assert np.allclose(tb1[-1:], tb2[-1:]) tb3 = create_distribution_table(calc.records, groupby='small_income_bins', + income_measure='expanded_income', result_type='weighted_avg') assert isinstance(tb3, pd.DataFrame) @@ -394,43 +475,22 @@ def test_diff_table_sum_row(cps_subsample): calc2.calc_all() # create two difference tables and compare their content tdiff1 = create_difference_table(calc1.records, calc2.records, - groupby='small_income_bins') + groupby='small_income_bins', + income_measure='expanded_income', + tax_to_diff='iitax') tdiff2 = create_difference_table(calc1.records, calc2.records, - groupby='large_income_bins') + groupby='large_income_bins', + income_measure='expanded_income', + tax_to_diff='iitax') non_digit_cols = ['mean', 'perc_inc', 'perc_cut', 'share_of_change', - 'aftertax_perc'] - digit_cols = [x for x in tdiff1.columns.tolist() if - x not in non_digit_cols] + 'perc_aftertax'] + digit_cols = [c for c in list(tdiff1) if c not in non_digit_cols] assert np.allclose(tdiff1[digit_cols][-1:], tdiff2[digit_cols][-1:]) np.testing.assert_array_equal(tdiff1[non_digit_cols][-1:], tdiff2[non_digit_cols][-1:]) -def test_row_classifier(cps_subsample): - # create a current-law Policy object and Calculator calc1 - policy1 = Policy() - records1 = Records.cps_constructor(data=cps_subsample) - calc1 = Calculator(policy=policy1, records=records1) - calc1.calc_all() - calc1_s006 = create_distribution_table(calc1.records, - groupby='webapp_income_bins', - result_type='weighted_sum').s006 - # create a policy-reform Policy object and Calculator calc2 - reform = {2013: {'_ALD_StudentLoan_hc': [1]}} - policy2 = Policy() - policy2.implement_reform(reform) - records2 = Records.cps_constructor(data=cps_subsample) - calc2 = Calculator(policy=policy2, records=records2) - 
calc2.calc_all() - calc2_s006 = create_distribution_table(calc2.records, - groupby='webapp_income_bins', - result_type='weighted_sum', - baseline_obj=calc1.records).s006 - # use weighted sum of weights in each cell to check classifer - assert np.allclose(calc1_s006, calc2_s006, atol=0.00, rtol=0.0) - - def test_mtr_graph_data(cps_subsample): calc = Calculator(policy=Policy(), records=Records.cps_constructor(data=cps_subsample)) @@ -455,11 +515,6 @@ def test_mtr_graph_data(cps_subsample): income_measure='wages', dollar_weighting=True) assert isinstance(gdata, dict) - with pytest.raises(ValueError): - calcx = Calculator(policy=Policy(), - records=Records.cps_constructor(data=cps_subsample)) - calcx.advance_to_year(2020) - gdata = mtr_graph_data(calcx, calc) def test_atr_graph_data(cps_subsample): diff --git a/taxcalc/utils.py b/taxcalc/utils.py index f8348ce04..f57e5bf74 100644 --- a/taxcalc/utils.py +++ b/taxcalc/utils.py @@ -23,7 +23,7 @@ weighted_count, weighted_mean, wage_weighted, agi_weighted, expanded_income_weighted, - weighted_perc_inc, weighted_perc_dec, + weighted_perc_inc, weighted_perc_cut, EPSILON) @@ -50,20 +50,26 @@ 'Combined Payroll and Individual Income Tax Liabilities'] # Following list is used in our difference table to label its columns. 
-DIFF_TABLE_LABELS = ['Tax Units with Tax Cut', 'Tax Units with Tax Increase', - 'Count', 'Average Tax Change', 'Total Tax Difference', - 'Percent with Tax Increase', 'Percent with Tax Decrease', - 'Share of Overall Change'] +DIFF_TABLE_LABELS = ['Tax Units with Tax Cut', + 'Tax Units with Tax Increase', + 'Count', + 'Average Tax Change', + 'Total Tax Difference', + 'Percent with Tax Increase', + 'Percent with Tax Decrease', + 'Share of Overall Change', + 'Change as % of Aftertax Income'] -LARGE_INCOME_BINS = [-1e99, 0, 9999, 19999, 29999, 39999, 49999, 74999, 99999, - 200000, 1e99] -SMALL_INCOME_BINS = [-1e99, 0, 4999, 9999, 14999, 19999, 24999, 29999, 39999, - 49999, 74999, 99999, 199999, 499999, 999999, 1499999, - 1999999, 4999999, 9999999, 1e99] +WEBAPP_INCOME_BINS = [-9e99, 0, 9999, 19999, 29999, 39999, 49999, 74999, 99999, + 199999, 499999, 1000000, 9e99] + +LARGE_INCOME_BINS = [-9e99, 0, 9999, 19999, 29999, 39999, 49999, 74999, 99999, + 200000, 9e99] -WEBAPP_INCOME_BINS = [-1e99, 0, 9999, 19999, 29999, 39999, 49999, 74999, 99999, - 199999, 499999, 1000000, 1e99] +SMALL_INCOME_BINS = [-9e99, 0, 4999, 9999, 14999, 19999, 24999, 29999, 39999, + 49999, 74999, 99999, 199999, 499999, 999999, 1499999, + 1999999, 4999999, 9999999, 9e99] def unweighted_sum(pdf, col_name): @@ -80,13 +86,15 @@ def weighted_sum(pdf, col_name): return (pdf[col_name] * pdf['s006']).sum() -def add_weighted_income_bins(pdf, num_bins=10, labels=None, - income_measure='expanded_income', - weight_by_income_measure=False): +def add_quantile_bins(pdf, income_measure, num_bins, + weight_by_income_measure=False, labels=None): """ Add a column of income bins to specified Pandas DataFrame, pdf, with - the new column being named 'bins'. Assumes that specified pdf contains - columns for the specified income_measure and for sample weights, s006. + the new column being named 'bins'. 
The bins hold equal number of
+    filing units when weight_by_income_measure=False or equal number of
+    income dollars when weight_by_income_measure=True.  Assumes that
+    specified pdf contains columns for the specified income_measure and
+    for sample weights, s006.
     """
     pdf.sort_values(by=income_measure, inplace=True)
     if weight_by_income_measure:
@@ -109,78 +117,51 @@ def add_weighted_income_bins(pdf, num_bins=10, labels=None,
     return pdf
 
 
-def add_income_bins(pdf, compare_with='soi', bins=None, right=True,
-                    income_measure='expanded_income'):
+def add_income_bins(pdf, income_measure,
+                    bin_type='soi', bins=None, right=True):
     """
-    Add a column of income bins of income_measure using pandas 'cut'.
-    This will serve as a 'grouper' later on.
+    Add a column of income bins of income_measure using Pandas 'cut' function.
 
     Parameters
     ----------
-    pdf: Pandas DataFrame object
+    pdf: Pandas DataFrame
         the object to which we are adding bins
 
-    compare_with: String, optional
-        options for input: 'tpc', 'soi', 'webapp'
-        determines which types of bins will be added
+    income_measure: String
+        specifies income variable used to construct bins
+
+    bin_type: String, optional
+        options for input: 'webapp', 'tpc', 'soi'
         default: 'soi'
 
-    bins: iterable of scalars, optional income breakpoints.
-        Follows pandas convention. The breakpoint is inclusive if
-        right=True. This argument overrides any choice of compare_with.
+    bins: iterable of scalars, optional income breakpoints
+        follows Pandas convention; the breakpoint is inclusive if
+        right=True; this argument overrides the bin_type argument
 
     right : bool, optional
-        Indicates whether the bins include the rightmost edge or not.
-        If right == True (the default), then the bins [1,2,3,4]
-        indicate (1,2], (2,3], (3,4].
+        indicates whether the bins include the rightmost edge or not;
+        if right == True (the default), then bins=[1,2,3,4] implies
+        this bin grouping (1,2], (2,3], (3,4]
 
     Returns
     -------
-    pdf: Pandas DataFrame object
-        the original input that bins have been added to
+    pdf: Pandas DataFrame
+        the original input plus the added 'bins' column
     """
     if not bins:
-        if compare_with == 'tpc':
+        if bin_type == 'webapp':
+            bins = WEBAPP_INCOME_BINS
+        elif bin_type == 'tpc':
             bins = LARGE_INCOME_BINS
-        elif compare_with == 'soi':
+        elif bin_type == 'soi':
             bins = SMALL_INCOME_BINS
-        elif compare_with == 'webapp':
-            bins = WEBAPP_INCOME_BINS
         else:
-            msg = 'Unknown compare_with arg {0}'.format(compare_with)
+            msg = 'Unknown bin_type argument {}'.format(bin_type)
             raise ValueError(msg)
-    # Groupby income_measure bins
     pdf['bins'] = pd.cut(pdf[income_measure], bins, right=right)
     return pdf
 
 
-def means_and_comparisons(col_name, gpdf, weighted_total):
-    """
-    Return new Pandas DataFrame based on grouped values of specified
-    col_name in specified gpdf Pandas DataFrame.
-    col_name: the column name to calculate against
-    gpdf: grouped Pandas DataFrame
-    """
-    def weighted_share_of_total(pdf, col_name, total):
-        """
-        Nested function that returns the ratio of
-        weighted_sum(pdf, col_name) and the specified total.
- """ - return weighted_sum(pdf, col_name) / (float(total) + EPSILON) - # tabulate who has a tax cut and who has a tax increase - diffs = gpdf.apply(weighted_count_lt_zero, col_name) - diffs = pd.DataFrame(data=diffs, columns=['tax_cut']) - diffs['tax_inc'] = gpdf.apply(weighted_count_gt_zero, col_name) - diffs['count'] = gpdf.apply(weighted_count) - diffs['mean'] = gpdf.apply(weighted_mean, col_name) - diffs['tot_change'] = gpdf.apply(weighted_sum, col_name) - diffs['perc_inc'] = gpdf.apply(weighted_perc_inc, col_name) - diffs['perc_cut'] = gpdf.apply(weighted_perc_dec, col_name) - diffs['share_of_change'] = gpdf.apply(weighted_share_of_total, - col_name, weighted_total) - return diffs - - def weighted(pdf, col_names): """ Return Pandas DataFrame in which each pdf column variable has been @@ -211,9 +192,9 @@ def get_sums(pdf, not_available=False): return pd.Series(sums, name='sums') -def results(obj): +def results(obj, cols=None): """ - Get results from object and organize them into a table. + Get cols results from object and organize them into a table. 
Parameters
     ----------
@@ -221,12 +202,20 @@
         Examples include a Tax-Calculator Records object and a
         Pandas DataFrame object
 
+    cols : list of object results columns to put into table
+        if None, then use STATS_COLUMNS as cols list
+
     Returns
     -------
-    Pandas DataFrame object
+    table : Pandas DataFrame object
     """
-    arrays = [getattr(obj, name) for name in STATS_COLUMNS]
-    return pd.DataFrame(data=np.column_stack(arrays), columns=STATS_COLUMNS)
+    if cols is None:
+        columns = STATS_COLUMNS
+    else:
+        columns = cols
+    arrays = [getattr(obj, name) for name in columns]
+    tbl = pd.DataFrame(data=np.column_stack(arrays), columns=columns)
+    return tbl
 
 
 def weighted_avg_allcols(pdf, col_list, income_measure='expanded_income'):
@@ -250,37 +239,29 @@
     return wadf
 
 
-def create_distribution_table(obj, groupby, result_type,
-                              income_measure='expanded_income',
-                              baseline_obj=None, diffs=False):
+def create_distribution_table(obj, groupby, income_measure, result_type):
     """
-    Get results from object, sort them based on groupby, manipulate them
-    based on result_type, and return them as a table.
+    Get results from object, sort them based on groupby using income_measure,
+    manipulate them based on result_type, and return them as a table.
 
     Parameters
     ----------
     obj : any object with array-like attributes named as in STATS_COLUMNS
         list
         Examples include a Tax-Calculator Records object and a
-        Pandas DataFrame object, but if baseline_obj is specified, both obj
-        and baseline_obj must have a current_year attribute
+        Pandas DataFrame object.
 
     
groupby : String object - options for input: 'weighted_deciles', 'small_income_bins', - 'large_income_bins', 'webapp_income_bins'; + options for input: 'weighted_deciles', 'webapp_income_bins', + 'large_income_bins', 'small_income_bins'; determines how the columns in the resulting Pandas DataFrame are sorted result_type : String object options for input: 'weighted_sum' or 'weighted_avg'; determines how the data should be manipulated - baseline_obj : any object with array-like attributes named as in - the STATS_COLUMNS list and having a current_year attribute - Examples include a Tax-Calculator Records object - - diffs : boolean - indicates showing the results from reform or the difference between - the baseline and reform. Turn this switch to True if you want to see - the difference + income_measure : String object + options for input: 'expanded_income', 'c00100'(AGI), + 'expanded_income_baseline', 'c00100_baseline' Notes ----- @@ -295,9 +276,9 @@ def create_distribution_table(obj, groupby, result_type, Returns ------- - Pandas DataFrame object + distribution table as a Pandas DataFrame """ - # pylint: disable=too-many-arguments + # nested function that specifies calculated columns def add_columns(pdf): """ Nested function that adds several columns to @@ -316,41 +297,32 @@ def add_columns(pdf): # weight of returns with positive Alternative Minimum Tax (AMT) pdf['num_returns_AMT'] = pdf['s006'].where(pdf['c09600'] > 0., 0.) 
return pdf - # create distribution table - res = results(obj) + # main logic of create_distribution_table + assert (income_measure == 'expanded_income' or + income_measure == 'c00100' or + income_measure == 'expanded_income_baseline' or + income_measure == 'c00100_baseline') + if income_measure not in STATS_COLUMNS: + columns = STATS_COLUMNS + [income_measure] + else: + columns = None + res = results(obj, cols=columns) res = add_columns(res) - if baseline_obj is not None: - res_base = results(baseline_obj) - if obj.current_year != baseline_obj.current_year: - msg = 'current_year differs in baseline obj and reform obj' - raise ValueError(msg) - baseline_income_measure = income_measure + '_baseline' - res[baseline_income_measure] = res_base[income_measure] - income_measure = baseline_income_measure - if diffs: - res_base = add_columns(res_base) - res = res.subtract(res_base) - res['s006'] = res_base['s006'] - # sort the data + # sort the data given specified groupby and income_measure if groupby == 'weighted_deciles': - pdf = add_weighted_income_bins(res, num_bins=10, - income_measure=income_measure) - elif groupby == 'small_income_bins': - pdf = add_income_bins(res, compare_with='soi', - income_measure=income_measure) - elif groupby == 'large_income_bins': - pdf = add_income_bins(res, compare_with='tpc', - income_measure=income_measure) + pdf = add_quantile_bins(res, income_measure, 10) elif groupby == 'webapp_income_bins': - pdf = add_income_bins(res, compare_with='webapp', - income_measure=income_measure) + pdf = add_income_bins(res, income_measure, bin_type='webapp') + elif groupby == 'large_income_bins': + pdf = add_income_bins(res, income_measure, bin_type='tpc') + elif groupby == 'small_income_bins': + pdf = add_income_bins(res, income_measure, bin_type='soi') else: msg = ("groupby must be either 'weighted_deciles' or " - "'small_income_bins' or 'large_income_bins' or " - "'webapp_income_bins'") + "'webapp_income_bins' or 'large_income_bins' or " + 
"'small_income_bins'") raise ValueError(msg) - # manipulate the data - pd.options.display.float_format = '{:8,.0f}'.format + # manipulate the data given specified result_type if result_type == 'weighted_sum': pdf = weighted(pdf, STATS_COLUMNS) gpdf_mean = pdf.groupby('bins', as_index=False)[TABLE_COLUMNS].sum() @@ -363,96 +335,117 @@ def add_columns(pdf): else: msg = "result_type must be either 'weighted_sum' or 'weighted_avg'" raise ValueError(msg) - return gpdf_mean.append(sum_row) + dist_table = gpdf_mean.append(sum_row) + # set print display format for float table elements + pd.options.display.float_format = '{:8,.0f}'.format + return dist_table -def create_difference_table(recs1, recs2, groupby, - income_measure='expanded_income', - tax_to_present='iitax'): +def create_difference_table(res1, res2, groupby, income_measure, tax_to_diff): """ - Get results from two different Records objects for the same year, compare - the two results, and return the differences as a Pandas DataFrame that is - sorted according to the variable specified by the groupby argument. + Get results from two different res, compare the two tax-diff results, + and return the difference statistics as a Pandas DataFrame that is sorted + according to the variable specified by the groupby argument. 
Parameters ---------- - recs1 : a Tax-Calculator Records object that refers to the baseline + res1 : baseline object is either a Tax-Calculator Records object or + a Pandas DataFrame including columns in STATS_COLUMNS list - recs2 : a Tax-Calculator Records object that refers to the reform + res2 : reform object is either a Tax-Calculator Records object or + a Pandas DataFrame including columns in STATS_COLUMNS list groupby : String object - options for input: 'weighted_deciles', 'small_income_bins', - 'large_income_bins', 'webapp_income_bins' - determines how the columns in the resulting Pandas DataFrame are sorted + options for input: 'weighted_deciles', 'webapp_income_bins', + 'large_income_bins', 'small_income_bins' + specifies kind of bins used to group filing units income_measure : String object - options for input: 'expanded_income', 'iitax' - classifier of income bins/deciles + options for input: 'expanded_income', 'c00100'(AGI) + specifies statistic to place filing units in bins - tax_to_present : String object + tax_to_diff : String object options for input: 'iitax', 'payrolltax', 'combined' + specifies which tax to difference Returns ------- - Pandas DataFrame object + difference table as a Pandas DataFrame """ - # pylint: disable=too-many-locals - if recs1.current_year != recs2.current_year: - msg = 'recs1.current_year not equal to recs2.current_year' - raise ValueError(msg) - res1 = results(recs1) - res2 = results(recs2) + # nested function that actually creates the difference table + def diff_table_stats(res2, groupby, income_measure): + """ + Return new Pandas DataFrame containing difference table statistics + based on grouped values of specified col_name in the specified res2. 
+ + res2: reform difference results Pandas DataFrame + groupby: string naming type of bins + income_measure: string naming column used to create res2 bins + """ + # pylint: disable=too-many-locals + def weighted_share_of_total(gpdf, colname, total): + """ + Nested function that returns the ratio of the + weighted_sum(pdf, colname) and specified total. + """ + return weighted_sum(gpdf, colname) / (total + EPSILON) + # add bin column to res2 given specified groupby and income_measure + if groupby == 'weighted_deciles': + pdf = add_quantile_bins(res2, income_measure, 10) + elif groupby == 'webapp_income_bins': + pdf = add_income_bins(res2, income_measure, bin_type='webapp') + elif groupby == 'large_income_bins': + pdf = add_income_bins(res2, income_measure, bin_type='tpc') + elif groupby == 'small_income_bins': + pdf = add_income_bins(res2, income_measure, bin_type='soi') + else: + msg = ("groupby must be either " + "'weighted_deciles' or 'webapp_income_bins' " + "or 'large_income_bins' or 'small_income_bins'") + raise ValueError(msg) + # create grouped Pandas DataFrame + gpdf = pdf.groupby('bins', as_index=False) + # create difference table statistics from gpdf in a new DataFrame + diffs = pd.DataFrame() + diffs['tax_cut'] = gpdf.apply(weighted_count_lt_zero, 'tax_diff') + diffs['tax_inc'] = gpdf.apply(weighted_count_gt_zero, 'tax_diff') + diffs['count'] = gpdf.apply(weighted_count) + diffs['mean'] = gpdf.apply(weighted_mean, 'tax_diff') + diffs['tot_change'] = gpdf.apply(weighted_sum, 'tax_diff') + diffs['perc_inc'] = gpdf.apply(weighted_perc_inc, 'tax_diff') + diffs['perc_cut'] = gpdf.apply(weighted_perc_cut, 'tax_diff') + wtotal = (res2['tax_diff'] * res2['s006']).sum() + diffs['share_of_change'] = gpdf.apply(weighted_share_of_total, + 'tax_diff', wtotal) + diffs['perc_aftertax'] = gpdf.apply(weighted_mean, 'perc_aftertax') + # add sum row at bottom and convert some cols to percentages + sum_row = get_sums(diffs)[diffs.columns] + difs = diffs.append(sum_row) + 
pct_cols = ['perc_inc', 'perc_cut', 'share_of_change', 'perc_aftertax'] + for col in pct_cols: + newvals = ['{:.2f}%'.format(val * 100) for val in difs[col]] + difs[col] = pd.Series(newvals, index=difs.index) + # specify some column sum elements to be 'n/a' + non_sum_cols = [c for c in difs.columns if 'mean' in c or 'perc' in c] + for col in non_sum_cols: + difs.loc['sums', col] = 'n/a' + # set print display format for float table elements + pd.options.display.float_format = '{:8,.0f}'.format + return difs + # main logic of create_difference_table + isdf1 = isinstance(res1, pd.DataFrame) + isdf2 = isinstance(res2, pd.DataFrame) + assert isdf1 == isdf2 + if not isdf1: + assert res1.current_year == res2.current_year + res1 = results(res1) + res2 = results(res2) + assert income_measure == 'expanded_income' or income_measure == 'c00100' baseline_income_measure = income_measure + '_baseline' res2[baseline_income_measure] = res1[income_measure] - res2['aftertax_baseline'] = res1['aftertax_income'] - income_measure = baseline_income_measure - if groupby == 'weighted_deciles': - pdf = add_weighted_income_bins(res2, num_bins=10, - income_measure=income_measure) - elif groupby == 'small_income_bins': - pdf = add_income_bins(res2, compare_with='soi', - income_measure=income_measure) - elif groupby == 'large_income_bins': - pdf = add_income_bins(res2, compare_with='tpc', - income_measure=income_measure) - elif groupby == 'webapp_income_bins': - pdf = add_income_bins(res2, compare_with='webapp', - income_measure=income_measure) - else: - msg = ("groupby must be either " - "'weighted_deciles' or 'small_income_bins' " - "or 'large_income_bins' or 'webapp_income_bins'") - raise ValueError(msg) - # compute difference in results - # Positive values are the magnitude of the tax increase - # Negative values are the magnitude of the tax decrease - res2['tax_diff'] = res2[tax_to_present] - res1[tax_to_present] - res2['aftertax_perc'] = res2['tax_diff'] / res2['aftertax_baseline'] - 
diffs = means_and_comparisons('tax_diff', - pdf.groupby('bins', as_index=False), - (res2['tax_diff'] * res2['s006']).sum()) - aftertax_perc = pdf.groupby('bins', as_index=False).apply(weighted_mean, - 'aftertax_perc') - diffs['aftertax_perc'] = aftertax_perc - sum_row = get_sums(diffs)[diffs.columns.values.tolist()] - diffs = diffs.append(sum_row) # pylint: disable=redefined-variable-type - pd.options.display.float_format = '{:8,.0f}'.format - srs_inc = ['{0:.2f}%'.format(val * 100) for val in diffs['perc_inc']] - diffs['perc_inc'] = pd.Series(srs_inc, index=diffs.index) - - srs_cut = ['{0:.2f}%'.format(val * 100) for val in diffs['perc_cut']] - diffs['perc_cut'] = pd.Series(srs_cut, index=diffs.index) - srs_change = ['{0:.2f}%'.format(val * 100) - for val in diffs['share_of_change']] - diffs['share_of_change'] = pd.Series(srs_change, index=diffs.index) - srs_aftertax_perc = ['{0:.2f}%'.format(val * 100) - for val in diffs['aftertax_perc']] - diffs['aftertax_perc'] = pd.Series(srs_aftertax_perc, index=diffs.index) - # columns containing weighted values relative to the binning mechanism - non_sum_cols = [col for col in diffs.columns - if 'mean' in col or 'perc' in col] - for col in non_sum_cols: - diffs.loc['sums', col] = 'n/a' + res2['tax_diff'] = res2[tax_to_diff] - res1[tax_to_diff] + res2['perc_aftertax'] = res2['tax_diff'] / res1['aftertax_income'] + diffs = diff_table_stats(res2, groupby, baseline_income_measure) return diffs @@ -675,11 +668,8 @@ def mtr_graph_data(calc1, calc2, # pylint: disable=too-many-arguments,too-many-statements, # pylint: disable=too-many-locals,too-many-branches # check that two calculator objects have the same current_year - if calc1.current_year == calc2.current_year: - year = calc1.current_year - else: - msg = 'calc1.current_year={} != calc2.current_year={}' - raise ValueError(msg.format(calc1.current_year, calc2.current_year)) + assert calc1.current_year == calc2.current_year + year = calc1.current_year # check validity of function 
arguments # . . check income_measure value weighting_function = weighted_mean @@ -757,9 +747,8 @@ def mtr_graph_data(calc1, calc2, if mars != 'ALL': dfx = dfx[dfx['MARS'] == mars] # create 'bins' column given specified income_var and dollar_weighting - dfx = add_weighted_income_bins(dfx, num_bins=100, - income_measure=income_var, - weight_by_income_measure=dollar_weighting) + dfx = add_quantile_bins(dfx, income_var, 100, + weight_by_income_measure=dollar_weighting) # split dfx into groups specified by 'bins' column gdfx = dfx.groupby('bins', as_index=False) # apply the weighting_function to percentile-grouped mtr values @@ -894,8 +883,7 @@ def atr_graph_data(calc1, calc2, if mars != 'ALL': dfx = dfx[dfx['MARS'] == mars] # create 'bins' column - dfx = add_weighted_income_bins(dfx, num_bins=100, - income_measure='expanded_income') + dfx = add_quantile_bins(dfx, 'expanded_income', 100) # split dfx into groups specified by 'bins' column gdfx = dfx.groupby('bins', as_index=False) # apply weighted_mean function to percentile-grouped income/tax values diff --git a/taxcalc/utilsprvt.py b/taxcalc/utilsprvt.py index e3b9d71b2..2032e87f9 100644 --- a/taxcalc/utilsprvt.py +++ b/taxcalc/utilsprvt.py @@ -34,8 +34,8 @@ def weighted_mean(pdf, col_name): """ Return weighted mean of Pandas DataFrame col_name items. 
""" - return (float((pdf[col_name] * pdf['s006']).sum()) / - float(pdf['s006'].sum() + EPSILON)) + return ((pdf[col_name] * pdf['s006']).sum() / + (pdf['s006'].sum() + EPSILON)) def wage_weighted(pdf, col_name): @@ -44,8 +44,8 @@ def wage_weighted(pdf, col_name): """ swght = 's006' wage = 'e00200' - return (float((pdf[col_name] * pdf[swght] * pdf[wage]).sum()) / - float((pdf[swght] * pdf[wage]).sum() + EPSILON)) + return (((pdf[col_name] * pdf[swght] * pdf[wage]).sum()) / + ((pdf[swght] * pdf[wage]).sum() + EPSILON)) def agi_weighted(pdf, col_name): @@ -54,8 +54,8 @@ def agi_weighted(pdf, col_name): """ swght = 's006' agi = 'c00100' - return (float((pdf[col_name] * pdf[swght] * pdf[agi]).sum()) / - float((pdf[swght] * pdf[agi]).sum() + EPSILON)) + return ((pdf[col_name] * pdf[swght] * pdf[agi]).sum() / + ((pdf[swght] * pdf[agi]).sum() + EPSILON)) def expanded_income_weighted(pdf, col_name): @@ -64,8 +64,8 @@ def expanded_income_weighted(pdf, col_name): """ swght = 's006' expinc = 'expanded_income' - return (float((pdf[col_name] * pdf[swght] * pdf[expinc]).sum()) / - float((pdf[swght] * pdf[expinc]).sum() + EPSILON)) + return ((pdf[col_name] * pdf[swght] * pdf[expinc]).sum() / + ((pdf[swght] * pdf[expinc]).sum() + EPSILON)) def weighted_perc_inc(pdf, col_name): @@ -73,14 +73,14 @@ def weighted_perc_inc(pdf, col_name): Return weighted fraction (not percent) of positive values for the variable with col_name in the specified Pandas DataFrame. """ - return (float(weighted_count_gt_zero(pdf, col_name)) / - float(weighted_count(pdf) + EPSILON)) + return (weighted_count_gt_zero(pdf, col_name) / + (weighted_count(pdf) + EPSILON)) -def weighted_perc_dec(pdf, col_name): +def weighted_perc_cut(pdf, col_name): """ Return weighted fraction (not percent) of negative values for the variable with col_name in the specified Pandas DataFrame. 
""" - return (float(weighted_count_lt_zero(pdf, col_name)) / - float(weighted_count(pdf) + EPSILON)) + return (weighted_count_lt_zero(pdf, col_name) / + (weighted_count(pdf) + EPSILON))