forked from PSLmodels/Tax-Calculator
-
Notifications
You must be signed in to change notification settings - Fork 0
/
puf_fuzz.py
219 lines (192 loc) · 7.62 KB
/
puf_fuzz.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
"""
Tax-Calculator Python script that adds random amounts to most
variables in the puf.csv input file, which must be located in the
top-level directory of the Tax-Calculator source code tree.
The resulting input file is xYY.csv, where YY denotes the tax year.
When DEBUG is set to True, no variables are randomized and no records are
sampled, so the aggregate weighted income and payroll tax revenues generated
by the xYY.csv input file are exactly the same as those generated by the
standard puf.csv input file.
"""
# CODING-STYLE CHECKS:
# pycodestyle --ignore=E402 puf_fuzz.py
# pylint --disable=locally-disabled puf_fuzz.py
import argparse
import sys
import os
import numpy as np
import pandas as pd
from taxcalc import Records
# upper bounds accepted for the command-line parameters
MAX_YEAR = 2023  # maximum tax year allowed for tax calculations
MAX_SEED = 999999999  # maximum allowed seed for random-number generator
MAX_SIZE = 100000  # maximum size of sample to draw from puf.csv

# switches controlling script behavior
DEBUG = False  # True implies no variable randomization or record sampling
TRACE = False  # True implies tracing output written to stdout

# variables that are left out of the xYY.csv file:
# 'filer' is always left out; the others only when not debugging
DROP_VARS = {'filer'}
if not DEBUG:
    DROP_VARS |= {'s006', 'cmbtp',
                  'nu05', 'nu13', 'elderly_dependent',
                  'e09700', 'e09800', 'e09900', 'e11200'}

# variables whose values are not to be randomized:
# when debugging, every variable Records can read is skipped
Records.read_var_info()
SKIP_VARS = (
    Records.USABLE_READ_VARS if DEBUG else
    {'RECID', 'MARS', 'DSI', 'MIDR', 'FLPDYR',
     'age_head', 'age_spouse',
     'nu18', 'n1820', 'n21',
     'XTOT', 'EIC', 'n24', 'f2441',
     'f6251'}
)

# parameters of the normal distribution used to draw random factors
ANNUAL_DRIFT = 0.03
NORM_STD_DEV = 0.25
def randomize_data(xdf, taxyear, rnseed):
    """
    Randomizes data variables in place.

    Sets FLPDYR to taxyear and then, for every column of xdf whose name is
    not in SKIP_VARS, rounds the column to whole numbers and adds to each
    value that value multiplied by a normally distributed random factor
    with mean 1.0 + ANNUAL_DRIFT * (taxyear - 2009) and standard deviation
    NORM_STD_DEV.  Columns that contain no negative rounded values are
    clipped at zero after randomization.

    Parameters
    ----------
    xdf: Pandas DataFrame
        contains data to be randomized; its columns are overwritten.

    taxyear: integer
        specifies year for which data is to be randomized.

    rnseed: integer
        specifies random-number seed to use in the randomization.

    Returns
    -------
    nothing: xdf is modified in place.
    """
    # pylint: disable=no-member
    # (above pylint comment eliminates several bogus np warnings)
    xdf['FLPDYR'] = taxyear
    num = xdf['FLPDYR'].size
    nmean = 1.0 + ANNUAL_DRIFT * (taxyear - 2009)
    nsdev = NORM_STD_DEV
    # Use a private legacy generator instead of np.random.seed(rnseed):
    # it produces the same sequence of draws for the same seed, but does
    # not reseed the process-wide np.random state as a side effect.
    rng = np.random.RandomState(rnseed)
    num_skips = 0
    # iterate over a snapshot of the column names because columns are
    # reassigned inside the loop
    for varname in list(xdf):
        if varname in SKIP_VARS:
            num_skips += 1
            continue
        # randomize nonzero variable amounts
        old = xdf[varname]
        # NOTE(review): the int32 casts assume every rounded and randomized
        # amount fits in 32 bits and that the column has no missing values
        # -- confirm against the contents of puf.csv
        oldint = old.round(decimals=0).astype(dtype=np.int32)
        rfactor = rng.normal(loc=nmean, scale=nsdev, size=num)
        addon = oldint * rfactor  # addon is zero if oldint is zero
        raw = (oldint + addon.round(decimals=0)).astype(dtype=np.int32)
        if oldint.min() < 0:
            # variable can legitimately be negative, so keep raw values
            new = raw
        else:
            # variable was never negative, so do not let it become negative
            new = raw.clip(lower=0)
        if TRACE:
            info = '{} {} {} {} {}'.format(varname, old.dtype, old.min(),
                                           new.dtype, new.min())
            sys.stdout.write(info + '\n')
        xdf[varname] = new
    if TRACE:
        info = 'number_variable_randomization_skips={}'.format(num_skips)
        sys.stdout.write(info + '\n')
def constrain_data(xdf):
    """
    Enforces the relationships among variables required by Records class.

    Does nothing when DEBUG is True.

    Parameters
    ----------
    xdf: Pandas DataFrame
        contains randomized data to be constrained; modified in place.
    """
    # pylint: disable=no-member
    # (above pylint comment eliminates several bogus np warnings)
    if not DEBUG:
        # constraint: total = taxpayer part + spouse part
        sum_rules = (
            ('e00200', 'e00200p', 'e00200s'),
            ('e00900', 'e00900p', 'e00900s'),
            ('e02100', 'e02100p', 'e02100s'),
        )
        for total, head_part, spouse_part in sum_rules:
            xdf[total] = xdf[head_part] + xdf[spouse_part]
        # constraint: first variable >= second variable
        floor_rules = (
            ('e00600', 'e00650'),
            ('e01500', 'e01700'),
        )
        for bigger, smaller in floor_rules:
            xdf[bigger] = np.maximum(xdf[bigger], xdf[smaller])
def main(taxyear, rnseed, ssize):
    """
    Runs the whole script: read puf.csv, drop, randomize, constrain,
    sample, and write the xYY.csv file.

    Returns 0 on success and 1 after writing an error message to stderr.
    """
    # pylint: disable=no-member

    def trace_shape(template, frame):
        """Write frame's shape to stdout using template if TRACE is on."""
        if TRACE:
            sys.stdout.write(template.format(frame.shape) + '\n')

    # locate puf.csv two directories above this script and read it
    script_dir = os.path.abspath(os.path.dirname(__file__))
    puf_path = os.path.join(script_dir, '..', '..', 'puf.csv')
    if not os.path.isfile(puf_path):
        sys.stderr.write(
            'ERROR: puf.csv file not found in top-level directory' + '\n'
        )
        return 1
    xdf = pd.read_csv(puf_path)

    # remove xdf variables not needed in xYY.csv file
    trace_shape('df.shape before dropping = {}', xdf)
    for var in DROP_VARS:
        if var in Records.USABLE_READ_VARS:
            xdf.drop(var, axis=1, inplace=True)
            continue
        sys.stderr.write(
            'ERROR: variable {} already dropped'.format(var) + '\n'
        )
        return 1
    trace_shape('df.shape after dropping = {}', xdf)

    # add random amounts to xdf variables
    randomize_data(xdf, taxyear, rnseed)

    # constrain values of certain variables as required by Records class
    constrain_data(xdf)

    # sample xdf without replacement to get ssize observations
    # (when debugging, every record is kept)
    if DEBUG:
        sample_size = xdf.shape[0]
        xxdf = xdf
    else:
        sample_size = ssize
        xxdf = xdf.sample(n=sample_size, random_state=rnseed)
    # renumber the records consecutively starting at one
    xxdf['RECID'] = list(range(1, sample_size + 1))
    trace_shape('df.shape after sampling = {}', xxdf)

    # write randomized and sampled xxdf to xYY.csv file
    out_name = 'x{}.csv'.format(taxyear % 100)
    xxdf.to_csv(out_name, index=False)

    # normal return code
    return 0
# end of main function code
if __name__ == '__main__':
    # describe the three required positional command-line arguments
    YEAR_HELP = ('YEAR is tax year; '
                 'must be in [2013,{}] range.'.format(MAX_YEAR))
    SEED_HELP = ('SEED is random-number seed; '
                 'must be in [1,{}] range.'.format(MAX_SEED))
    SIZE_HELP = ('SIZE is sample size; '
                 'must be in [1,{}] range.'.format(MAX_SIZE))
    PARSER = argparse.ArgumentParser(
        prog='python puf_fuzz.py',
        description=('Adds random amounts to certain variables in '
                     'puf.csv input file and writes the randomized '
                     'CSV-formatted input file to xYY.csv file.'))
    PARSER.add_argument('YEAR', type=int, default=0, help=YEAR_HELP)
    PARSER.add_argument('SEED', type=int, default=0, help=SEED_HELP)
    PARSER.add_argument('SIZE', type=int, default=0, help=SIZE_HELP)
    ARGS = PARSER.parse_args()
    # collect a message for each out-of-range argument value, in
    # YEAR, SEED, SIZE order
    MESSAGES = []
    if not 2013 <= ARGS.YEAR <= MAX_YEAR:
        RSTR = '[2013,{}] range'.format(MAX_YEAR)
        MESSAGES.append('ERROR: YEAR {} not in {}\n'.format(ARGS.YEAR, RSTR))
    if not 1 <= ARGS.SEED <= MAX_SEED:
        RSTR = '[1,{}] range'.format(MAX_SEED)
        MESSAGES.append('ERROR: SEED {} not in {}\n'.format(ARGS.SEED, RSTR))
    if not 1 <= ARGS.SIZE <= MAX_SIZE:
        RSTR = '[1,{}] range'.format(MAX_SIZE)
        MESSAGES.append('ERROR: SIZE {} not in {}\n'.format(ARGS.SIZE, RSTR))
    ARGS_ERROR = bool(MESSAGES)
    if ARGS_ERROR:
        # report every problem found, then point the user at the help text
        MESSAGES.append('USAGE: python puf_fuzz.py --help\n')
        for MESSAGE in MESSAGES:
            sys.stderr.write(MESSAGE)
        RCODE = 1
    else:
        RCODE = main(ARGS.YEAR, ARGS.SEED, ARGS.SIZE)
    sys.exit(RCODE)