Skip to content

Commit

Permalink
Merge pull request #114 from jdebacker/flex_data
Browse files Browse the repository at this point in the history
Merging
  • Loading branch information
rickecon authored Jun 7, 2024
2 parents 29acbee + 60a8920 commit e994433
Show file tree
Hide file tree
Showing 11 changed files with 267 additions and 69 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_and_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ jobs:
shell: bash -l {0}
working-directory: ./
run: |
python -m pytest -m "not local" --cov=./ --cov-report=xml
python -m pytest -m "not local and not needs_puf and not needs_tmd" --cov=./ --cov-report=xml
- name: Upload coverage to Codecov
if: matrix.os == 'ubuntu-latest' && contains(github.repository, 'PSLmodels/OG-USA')
uses: codecov/codecov-action@v4
Expand Down
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.1.9] - 2024-06-07 12:00:00

### Added

- Updates the `get_micro_data.py` and `calibrate.py` modules to allow the user to use the CPS, PUF, or TMD files with Tax-Calculator, or to provide their own custom data file with associated growth factors and weights.


## [0.1.8] - 2024-05-20 12:00:00

Expand Down Expand Up @@ -105,6 +111,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0



[0.1.9]: https://github.com/PSLmodels/OG-USA/compare/v0.1.8...v0.1.9
[0.1.8]: https://github.com/PSLmodels/OG-USA/compare/v0.1.7...v0.1.8
[0.1.7]: https://github.com/PSLmodels/OG-USA/compare/v0.1.6...v0.1.7
[0.1.6]: https://github.com/PSLmodels/OG-USA/compare/v0.1.5...v0.1.6
Expand Down
50 changes: 43 additions & 7 deletions cs-config/cs_config/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,22 @@
import pickle
import json
import inspect
import pandas as pd
import paramtools
from distributed import Client
from taxcalc import Policy
from taxcalc import Policy, Records, GrowFactors
from collections import OrderedDict
from .helpers import retrieve_puf
from .helpers import retrieve_puf, retrieve_tmd
from cs2tc import convert_policy_adjustment

# AWS credentials are read from the environment; both fall back to an
# empty string (not None) when the variable is unset.
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
# S3 location of the PUF file; override with the PUF_S3_LOCATION
# environment variable.
PUF_S3_FILE_LOCATION = os.environ.get(
"PUF_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
)
# S3 location of the TMD file; override with the TMD_S3_LOCATION
# environment variable.
# NOTE(review): the default below is identical to the PUF default above,
# so an unset TMD_S3_LOCATION reads the PUF file -- confirm the intended
# TMD path.
TMD_S3_FILE_LOCATION = os.environ.get(
"TMD_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
)
# Absolute path of the directory containing this file
CUR_DIR = os.path.dirname(os.path.realpath(__file__))

# Get Tax-Calculator default parameters
Expand Down Expand Up @@ -78,7 +82,7 @@ class MetaParams(paramtools.Parameters):


def get_version():
    """
    Return the version string reported by this module.

    Returns:
        version (str): version number, kept in step with the latest
            CHANGELOG entry
    """
    return "0.1.9"


def get_inputs(meta_param_dict):
Expand Down Expand Up @@ -188,16 +192,46 @@ def run_model(meta_param_dict, adjustment):

meta_params = MetaParams()
meta_params.adjust(meta_param_dict)
# Get data chosen by user
if meta_params.data_source == "PUF":
data = retrieve_puf(
PUF_S3_FILE_LOCATION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
)
weights = Records.PUF_WEIGHTS_FILENAME
records_start_year = Records.PUFCSV_YEAR
# set name of cached baseline file in case use below
cached_pickle = "TxFuncEst_baseline_PUF.pkl"
else:
if data is not None:
if not isinstance(data, pd.DataFrame):
raise TypeError("'data' must be a Pandas DataFrame.")
else:
# Access keys are not available. Default to the CPS.
print("Defaulting to the CPS")
meta_params.adjust({"data_source": "CPS"})
elif meta_params.data_source == "TMD":
data = retrieve_tmd(
TMD_S3_FILE_LOCATION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY
)
weights = Records.TMD_WEIGHTS_FILENAME
records_start_year = Records.TMDCSV_YEAR
if data is not None:
if not isinstance(data, pd.DataFrame):
raise TypeError("'data' must be a Pandas DataFrame.")
else:
# Access keys are not available. Default to the CPS.
print("Defaulting to the CPS")
meta_params.adjust({"data_source": "CPS"})
elif meta_params.data_source == "CPS":
data = "cps"
weights = Records.PUF_WEIGHTS_FILENAME
records_start_year = Records.CPSCSV_YEAR
# set name of cached baseline file in case use below
cached_pickle = "TxFuncEst_baseline_CPS.pkl"
else:
raise ValueError(
f"Data source '{meta_params.data_source}' is not supported."
)

# Get TC params adjustments
iit_mods = convert_policy_adjustment(
adjustment["Tax-Calculator Parameters"]
Expand All @@ -211,7 +245,7 @@ def run_model(meta_param_dict, adjustment):

# Dask parameters
num_workers = 2
memory_limit = "10GiB"
memory_per_worker = "10GiB"
client = Client(
n_workers=num_workers,
threads_per_worker=1,
Expand All @@ -222,8 +256,7 @@ def run_model(meta_param_dict, adjustment):
# num_workers_txf = 5
# num_workers_mod = 6

# whether to estimate tax functions from microdata
run_micro = True
# Read in whether user chose to solve for transition path
time_path = meta_param_dict["time_path"][0]["value"]

# filter out OG-USA params that will not change between baseline and
Expand Down Expand Up @@ -363,6 +396,9 @@ def run_model(meta_param_dict, adjustment):
iit_reform=iit_mods,
estimate_tax_functions=True,
data=data,
gfactors=GrowFactors.FILE_NAME,
weights=weights,
records_start_year=records_start_year,
client=client,
)
# update tax function parameters in Specifications Object
Expand Down
40 changes: 40 additions & 0 deletions cs-config/cs_config/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@
# S3 location of the PUF file; override with the PUF_S3_LOCATION
# environment variable.
PUF_S3_FILE_LOCATION = os.environ.get(
"PUF_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
)
# S3 location of the TMD file; override with the TMD_S3_LOCATION
# environment variable.
# NOTE(review): the default below is identical to the PUF default above,
# so an unset TMD_S3_LOCATION reads the PUF file -- confirm the intended
# TMD path.
TMD_S3_FILE_LOCATION = os.environ.get(
"TMD_S3_LOCATION", "s3://ospc-data-files/puf.20210720.csv.gz"
)
# Taken from Policy.LAST_BUDGET_YEAR
TC_LAST_YEAR = Policy.LAST_BUDGET_YEAR

POLICY_SCHEMA = {
Expand Down Expand Up @@ -120,3 +123,40 @@ def retrieve_puf(
f"s3_reader_installed={s3_reader_installed})"
)
return None


def retrieve_tmd(
    tmd_s3_file_location=TMD_S3_FILE_LOCATION,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
):
    """
    Function for retrieving the TMD file as a Pandas DataFrame.

    The file is read from the S3 bucket when a location, credentials,
    and an S3 reader are all available. Otherwise a local ``tmd.csv.gz``
    and then a local ``tmd.csv`` (relative to the current working
    directory) are tried.

    Args:
        tmd_s3_file_location (str): S3 location of the TMD file
        aws_access_key_id (str): AWS access key id used to read from S3
        aws_secret_access_key (str): AWS secret access key used to read
            from S3

    Returns:
        tmd_df (Pandas DataFrame or None): the TMD data, or None (after
            issuing a warning) if no source is available
    """
    s3_reader_installed = S3FileSystem is not None
    has_credentials = (
        aws_access_key_id is not None and aws_secret_access_key is not None
    )
    if tmd_s3_file_location and has_credentials and s3_reader_installed:
        print("Reading tmd from S3 bucket.", tmd_s3_file_location)
        # Use the credentials passed by the caller (the ones validated
        # above) rather than the module-level defaults.
        fs = S3FileSystem(
            key=aws_access_key_id,
            secret=aws_secret_access_key,
        )
        # pandas cannot infer compression from an open file handle, so
        # derive it from the file name to handle gzipped files.
        if str(tmd_s3_file_location).endswith(".gz"):
            compression = "gzip"
        else:
            compression = "infer"
        with fs.open(tmd_s3_file_location) as f:
            tmd_df = pd.read_csv(f, compression=compression)
        return tmd_df
    elif Path("tmd.csv.gz").exists():
        print("Reading tmd from tmd.csv.gz.")
        return pd.read_csv("tmd.csv.gz", compression="gzip")
    elif Path("tmd.csv").exists():
        print("Reading tmd from tmd.csv.")
        return pd.read_csv("tmd.csv")
    else:
        warnings.warn(
            f"TMD file not available (tmd_location={tmd_s3_file_location}, "
            f"has_credentials={has_credentials}, "
            f"s3_reader_installed={s3_reader_installed})"
        )
        return None
2 changes: 1 addition & 1 deletion docs/book/content/api/get_micro_data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ ogusa.get_micro_data
------------------------------------------

.. automodule:: ogusa.get_micro_data
:members: get_calculator, get_data, taxcalc_advance, cap_inc_mtr
:members: get_calculator, get_data, taxcalc_advance, cap_inc_mtr, update_policy, is_paramtools_format
28 changes: 25 additions & 3 deletions ogusa/calibrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from ogusa import get_micro_data
import os
import numpy as np
from taxcalc import Records
from ogcore import txfunc, demographics
from ogcore.utils import safe_read_pickle, mkdirs
import pkg_resources
Expand All @@ -23,6 +24,9 @@ def __init__(
iit_reform={},
guid="",
data="cps",
gfactors=None,
weights=None,
records_start_year=Records.CPSCSV_YEAR,
client=None,
num_workers=1,
demographic_data_path=None,
Expand All @@ -33,7 +37,7 @@ def __init__(
parameter values for the OG-USA model.
Args:
p (OGUSA Parameters object): parameters object
p (OG-USA Parameters object): parameters object
estimate_tax_functions (bool): whether to estimate tax functions
estimate_beta (bool): whether to estimate beta
estimate_chi_n (bool): whether to estimate chi_n
Expand All @@ -42,7 +46,13 @@ def __init__(
iit_baseline (dict): baseline policy to use
iit_reform (dict): reform tax parameters
guid (str): id for tax function parameters
data (str): data source for microsimulation model
data (str or Pandas DataFrame): path or DataFrame with
data for Tax-Calculator model
gfactors (str or Pandas DataFrame): path or DataFrame with
growth factors for Tax-Calculator model
weights (str or Pandas DataFrame): path or DataFrame with
weights for Tax-Calculator model
records_start_year (int): year micro data begins
client (Dask client object): client
num_workers (int): number of workers for Dask client
output_path (str): path to save output to
Expand All @@ -69,6 +79,9 @@ def __init__(
iit_reform,
guid,
data,
gfactors,
weights,
records_start_year,
client,
num_workers,
run_micro=run_micro,
Expand Down Expand Up @@ -143,6 +156,9 @@ def get_tax_function_parameters(
iit_reform={},
guid="",
data="",
gfactors=None,
weights=None,
records_start_year=Records.CPSCSV_YEAR,
client=None,
num_workers=1,
run_micro=False,
Expand All @@ -157,7 +173,13 @@ def get_tax_function_parameters(
iit_baseline (dict): baseline policy to use
iit_reform (dict): reform tax parameters
guid (string): id for tax function parameters
data (string): data source for microsimulation model
data (str or Pandas DataFrame): path or DataFrame with
data for Tax-Calculator model
gfactors (str or Pandas DataFrame): path or DataFrame with
growth factors for Tax-Calculator model
weights (str or Pandas DataFrame): path or DataFrame with
weights for Tax-Calculator model
records_start_year (int): year micro data begins
client (Dask client object): client
num_workers (int): number of workers for Dask client
run_micro (bool): whether to estimate parameters from
Expand Down
3 changes: 0 additions & 3 deletions ogusa/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,6 @@
DEFAULT_START_YEAR = 2021
# Tax-Calculator start year
TC_LAST_YEAR = taxcalc.Policy.LAST_BUDGET_YEAR
# Years of the PUF and CPS files
PUF_START_YEAR = taxcalc.Records.PUFCSV_YEAR
CPS_START_YEAR = taxcalc.Records.CPSCSV_YEAR

VAR_LABELS = {
"Y": "GDP ($Y_t$)",
Expand Down
Loading

0 comments on commit e994433

Please sign in to comment.