From 0f1c1c7baf1f89c4923c88fcc64a79ba6ec36916 Mon Sep 17 00:00:00 2001 From: martinholmer Date: Tue, 13 Mar 2018 20:30:35 -0400 Subject: [PATCH] Reorganize taxcalc validation code --- TESTING.md | 2 +- .../validation/csv_input.py => puf_fuzz.py | 25 ++-- .../validation/taxsim/simtax.py => simtax.py | 8 +- taxcalc/__init__.py | 1 + taxcalc/csv_show.sh | 26 ++++ taxcalc/csv_vars.sh | 13 ++ .../{validation/taxsim => }/simpletaxio.py | 7 +- .../taxsim => tests}/test_simpletaxio.py | 2 +- taxcalc/validation/csv_extract.py | 125 ------------------ taxcalc/validation/taxsim/test.sh | 4 +- 10 files changed, 66 insertions(+), 147 deletions(-) rename taxcalc/validation/csv_input.py => puf_fuzz.py (91%) rename taxcalc/validation/taxsim/simtax.py => simtax.py (97%) create mode 100755 taxcalc/csv_show.sh create mode 100755 taxcalc/csv_vars.sh rename taxcalc/{validation/taxsim => }/simpletaxio.py (99%) rename taxcalc/{validation/taxsim => tests}/test_simpletaxio.py (99%) delete mode 100644 taxcalc/validation/csv_extract.py diff --git a/TESTING.md b/TESTING.md index 0117d8a75..65b3c114d 100644 --- a/TESTING.md +++ b/TESTING.md @@ -26,7 +26,7 @@ against the PEP8 standard. Do the checks this way: ``` cd taxcalc -pep8 --ignore=E402 . +pep8 . ``` No messages indicate the PEP8 tests pass. Once you get that result, diff --git a/taxcalc/validation/csv_input.py b/puf_fuzz.py similarity index 91% rename from taxcalc/validation/csv_input.py rename to puf_fuzz.py index 8904aad7f..930108abe 100644 --- a/taxcalc/validation/csv_input.py +++ b/puf_fuzz.py @@ -1,5 +1,5 @@ """ -Tax-Calculator validation script that adds random amounts to most +Tax-Calculator Python script that adds random amounts to most variables in the puf.csv input file, which must be located in the top-level directory of the Tax-Calculator source code tree. The resulting input file is xYY.csv, where YY denotes the tax year. @@ -9,19 +9,17 @@ generated by the standard puf.csv input file. """ # CODING-STYLE CHECKS: -# pep8 --ignore=E402 csv_input.py -# pylint --disable=locally-disabled csv_input.py +# pep8 --ignore=E402 puf_fuzz.py +# pylint --disable=locally-disabled puf_fuzz.py import argparse import sys import os import numpy as np import pandas as pd -CUR_PATH = os.path.abspath(os.path.dirname(__file__)) -sys.path.append(os.path.join(CUR_PATH, '..', '..')) -# pylint: disable=import-error,wrong-import-position from taxcalc import Records + # specify maximum allowed values for command-line parameters MAX_YEAR = 2023 # maximum tax year allowed for tax calculations MAX_SEED = 999999999 # maximum allowed seed for random-number generator @@ -40,11 +38,13 @@ 'e09700', 'e09800', 'e09900', 'e11200']) # specify set of variables whose values are not to be randomized +Records.read_var_info() if DEBUG: SKIP_VARS = Records.USABLE_READ_VARS else: SKIP_VARS = set(['RECID', 'MARS', 'DSI', 'MIDR', 'FLPDYR', 'age_head', 'age_spouse', + 'nu18', 'n1820', 'n21', 'XTOT', 'EIC', 'n24', 'f2441', 'f6251']) @@ -131,7 +131,8 @@ def main(taxyear, rnseed, ssize): Contains high-level logic of the script. """ # read puf.csv file into a Pandas DataFrame - pufcsv_filename = os.path.join(CUR_PATH, '..', '..', 'puf.csv') + current_path = os.path.abspath(os.path.dirname(__file__)) + pufcsv_filename = os.path.join(current_path, '..', '..', 'puf.csv') if not os.path.isfile(pufcsv_filename): msg = 'ERROR: puf.csv file not found in top-level directory' sys.stderr.write(msg + '\n') @@ -182,17 +183,17 @@ def main(taxyear, rnseed, ssize): if __name__ == '__main__': # parse command-line arguments: PARSER = argparse.ArgumentParser( - prog='python csv_input.py', + prog='python puf_fuzz.py', description=('Adds random amounts to certain variables in ' 'puf.csv input file and writes the randomized ' 'CSV-formatted input file to xYY.csv file.')) - PARSER.add_argument('YEAR', nargs='?', type=int, default=0, + PARSER.add_argument('YEAR', type=int, default=0, help=('YEAR is tax year; ' 'must be in [2013,{}] range.'.format(MAX_YEAR))) - PARSER.add_argument('SEED', nargs='?', type=int, default=0, + PARSER.add_argument('SEED', type=int, default=0, help=('SEED is random-number seed; ' 'must be in [1,{}] range.'.format(MAX_SEED))) - PARSER.add_argument('SIZE', nargs='?', type=int, default=0, + PARSER.add_argument('SIZE', type=int, default=0, help=('SIZE is sample size; ' 'must be in [1,{}] range.'.format(MAX_SIZE))) ARGS = PARSER.parse_args() @@ -211,7 +212,7 @@ def main(taxyear, rnseed, ssize): sys.stderr.write('ERROR: SIZE {} not in {}\n'.format(ARGS.SIZE, RSTR)) ARGS_ERROR = True if ARGS_ERROR: - sys.stderr.write('USAGE: python csv_input.py --help\n') + sys.stderr.write('USAGE: python puf_fuzz.py --help\n') RCODE = 1 else: RCODE = main(ARGS.YEAR, ARGS.SEED, ARGS.SIZE) diff --git a/taxcalc/validation/taxsim/simtax.py b/simtax.py similarity index 97% rename from taxcalc/validation/taxsim/simtax.py rename to simtax.py index 1fef4141d..a8b40312e 100644 --- a/taxcalc/validation/taxsim/simtax.py +++ b/simtax.py @@ -1,5 +1,5 @@ """ -SIMple input-output capabilities for TAX-calculator. +SIMple input-output capabilities for TAX-calculator used in validation work """ # CODING-STYLE CHECKS: # pep8 --ignore=E402 simtax.py @@ -7,7 +7,11 @@ import argparse import sys -from simpletaxio import SimpleTaxIO +import os +import re +import six +import pandas as pd +from taxcalc import SimpleTaxIO def main(): diff --git a/taxcalc/__init__.py b/taxcalc/__init__.py index b824cf0c6..58cdaf03d 100755 --- a/taxcalc/__init__.py +++ b/taxcalc/__init__.py @@ -9,6 +9,7 @@ from taxcalc.growfactors import * from taxcalc.growdiff import * from taxcalc.records import * +from taxcalc.simpletaxio import * from taxcalc.taxcalcio import * from taxcalc.utils import * from taxcalc.macro_elasticity import * diff --git a/taxcalc/csv_show.sh b/taxcalc/csv_show.sh new file mode 100755 index 000000000..0892b43df --- /dev/null +++ b/taxcalc/csv_show.sh @@ -0,0 +1,26 @@ +#!/bin/bash +if [[ "$#" -ne 2 ]]; then + echo "csvshow prints all non-zero CSV file column values for RECID" + echo "ERROR: must specify exactly two command-line arguments" + echo "USAGE: csvshow FILENAME RECID" + exit 1 +fi +awk -F, ' +BEGIN { + recid_varnum = 0 +} +NR == 1 { + for ( i = 1; i <= NF; i++ ) { + varname[i] = $i + if ( $i == "RECID" ) recid_varnum = i + } +} +$recid_varnum == id { + for ( i = 1; i <= NF; i++ ) { + if ( $i != 0 ) { + print i, varname[i], $i + } + } + exit +} +' id=$2 $1 diff --git a/taxcalc/csv_vars.sh b/taxcalc/csv_vars.sh new file mode 100755 index 000000000..633baa56a --- /dev/null +++ b/taxcalc/csv_vars.sh @@ -0,0 +1,13 @@ +#!/bin/bash +if [[ "$#" -ne 1 ]]; then + echo "csvvars prints all CSV file column numbers and names" + echo "ERROR: number of command-line arguments not equal to one" + echo "USAGE: csvvars FILENAME" + exit 1 +fi +awk -F, ' +NR == 1 { + for( i = 1; i <= NF; i++ ) { + print i, $i +} +' $1 diff --git a/taxcalc/validation/taxsim/simpletaxio.py b/taxcalc/simpletaxio.py similarity index 99% rename from taxcalc/validation/taxsim/simpletaxio.py rename to taxcalc/simpletaxio.py index a006ca75b..b4cef4f50 100644 --- a/taxcalc/validation/taxsim/simpletaxio.py +++ b/taxcalc/simpletaxio.py @@ -10,10 +10,9 @@ import re import six import pandas as pd -CUR_PATH = os.path.abspath(os.path.dirname(__file__)) -sys.path.append(os.path.join(CUR_PATH, '..', '..', '..')) -# pylint: disable=import-error,wrong-import-position -from taxcalc import Policy, Records, Calculator +from taxcalc.policy import Policy +from taxcalc.records import Records +from taxcalc.calculate import Calculator class SimpleTaxIO(object): diff --git a/taxcalc/validation/taxsim/test_simpletaxio.py b/taxcalc/tests/test_simpletaxio.py similarity index 99% rename from taxcalc/validation/taxsim/test_simpletaxio.py rename to taxcalc/tests/test_simpletaxio.py index 645a832e5..73166a0dc 100644 --- a/taxcalc/validation/taxsim/test_simpletaxio.py +++ b/taxcalc/tests/test_simpletaxio.py @@ -8,7 +8,7 @@ import os import tempfile import pytest -from simpletaxio import SimpleTaxIO +from taxcalc import SimpleTaxIO # pylint: disable=import-error NUM_INPUT_LINES = 4 diff --git a/taxcalc/validation/csv_extract.py b/taxcalc/validation/csv_extract.py deleted file mode 100644 index 628b561cb..000000000 --- a/taxcalc/validation/csv_extract.py +++ /dev/null @@ -1,125 +0,0 @@ -""" -Tax-Calculator validation script that extracts non-zero input variables for -the filing unit with specified RECID in specified CSV-formated file. -""" -# CODING-STYLE CHECKS: -# pep8 --ignore=E402 csv_extract.py -# pylint --disable=locally-disabled csv_extract.py - -import argparse -import sys -import os -import pandas as pd -CUR_PATH = os.path.abspath(os.path.dirname(__file__)) -sys.path.append(os.path.join(CUR_PATH, '..', '..')) -# pylint: disable=import-error,wrong-import-position -from taxcalc import Records - - -def main(filename, recid, input_vars_only, transpose): - """ - Contains high-level logic of the script. - """ - # read all file content into Pandas DataFrame - adf = pd.read_csv(filename) - adf_vars = set(adf.columns) # pylint: disable=no-member - - # check that both files contain required tax variables - required_input_vars = set(['RECID', 'MARS']) - required_input_vars_str = 'RECID, MARS' - if not required_input_vars.issubset(adf_vars): - msg = 'ERROR: FILE does not include required input variables: {}\n' - sys.stderr.write(msg.format(required_input_vars_str)) - return 1 - - # check that RECID actually identifies a filing unit in FILE - if recid not in adf['RECID'].values: - msg = 'ERROR: RECID={} not in FILE\n' - sys.stderr.write(msg.format(recid)) - return 1 - - # extract the adf row with specified recid - edf = adf[adf['RECID'] == recid] - edf.is_copy = False - - # optionally remove all but Tax-Calculator usable input variables from edf - if input_vars_only: - Records.read_var_info() - for colname in edf.columns: - if colname not in Records.USABLE_READ_VARS: - edf.drop(colname, axis=1, inplace=True) - - # remove all zero-valued variables from edf - for colname in edf.columns: - if edf[colname].iloc[0] == 0: - edf.drop(colname, axis=1, inplace=True) - - # write edf to CSV-formatted output file - if transpose: - ofilename = '{}-{}T.csv'.format(filename[:-4], recid) - tstr = transposed(edf) - with open(ofilename, 'w') as ofile: - ofile.write(tstr) - else: - ofilename = '{}-{}.csv'.format(filename[:-4], recid) - edf.to_csv(path_or_buf=ofilename, columns=sorted(edf.columns), - index=False, float_format='%.2f') - sys.stdout.write('EXTRACT IN {}\n'.format(ofilename)) - - # normal return code - return 0 -# end of main function code - - -def transposed(dframe): - """ - Returns transpose of dframe that contains only one row as a string. - """ - # confirm that dframe has only one row - assert dframe.shape[0] == 1 - # construct alphabetical list of variable,value rows - tstr = 'variable,value\n' - for col in sorted(dframe.columns): - tstr += '{},{}\n'.format(col, dframe[col].iloc[0]) - return tstr - - -if __name__ == '__main__': - # parse command-line arguments: - PARSER = argparse.ArgumentParser( - prog='python csv_extract.py', - description=('Writes CSV-formatted file that contains all non-zero ' - 'variables from CSV-formatted FILE for row with RECID.')) - PARSER.add_argument('FILE', type=str, default='', - help=('Name of file, which must end with ".csv".')) - PARSER.add_argument('RECID', type=int, default=0, - help=('RECID value of filing unit row to extract.')) - PARSER.add_argument('--inputonly', default=False, action='store_true', - help=('optional flag that includes only variables ' - 'that are Tax-Calculator usable input.')) - PARSER.add_argument('--transpose', default=False, action='store_true', - help=('optional flag that transposes extract.')) - ARGS = PARSER.parse_args() - # check for invalid command-line argument values - ARGS_ERROR = False - if ARGS.FILE == '': - sys.stderr.write('ERROR: FILE must be specified\n') - ARGS_ERROR = True - if not os.path.isfile(ARGS.FILE): - MSG = 'ERROR: FILE [{}] does not exist\n' - sys.stderr.write(MSG.format(ARGS.FILE)) - ARGS_ERROR = True - if not ARGS.FILE.endswith('.csv'): - MSG = 'ERROR: FILE [{}] does not end with ".csv"\n' - sys.stderr.write(MSG.format(ARGS.FILE)) - ARGS_ERROR = True - if ARGS.RECID <= 0: - MSG = 'ERROR: RECID [{}] must be positive\n' - sys.stderr.write(MSG.format(ARGS.RECID)) - ARGS_ERROR = True - if ARGS_ERROR: - sys.stderr.write('USAGE: python csv_extract.py --help\n') - RCODE = 1 - else: - RCODE = main(ARGS.FILE, ARGS.RECID, ARGS.inputonly, ARGS.transpose) - sys.exit(RCODE) diff --git a/taxcalc/validation/taxsim/test.sh b/taxcalc/validation/taxsim/test.sh index f326968f2..255b6cd16 100755 --- a/taxcalc/validation/taxsim/test.sh +++ b/taxcalc/validation/taxsim/test.sh @@ -35,12 +35,12 @@ YY=${LYY:1:2} tclsh taxsim_in.tcl 20$YY $L > $LYY.in # Generate simtax.py OUTPUT for specified INPUT and REFORM if [[ "$REFORM" == "." ]] ; then - python simtax.py --taxsim2441 $LYY.in + python ../../../simtax.py --taxsim2441 $LYY.in SUFFIX="" OVAR4="" else RJSON="reform-$REFORM.json" - python simtax.py --taxsim2441 --reform $RJSON $LYY.in + python ../../../simtax.py --taxsim2441 --reform $RJSON $LYY.in SUFFIX="-reform-$REFORM" OVAR4="--ovar4" fi