diff --git a/scripts/us_hud/income/__init__.py b/scripts/us_hud/income/__init__.py
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/scripts/us_hud/income/manifest.json b/scripts/us_hud/income/manifest.json
new file mode 100644
index 0000000000..51d4e92e36
--- /dev/null
+++ b/scripts/us_hud/income/manifest.json
@@ -0,0 +1,18 @@
+{
+    "import_specifications": [
+        {
+            "import_name": "HUD_IncomeLimits",
+            "curator_emails": ["rbhande@google.com"],
+            "provenance_url": "https://www.huduser.gov/portal/datasets/il.html",
+            "provenance_description": "HUD sets income limits for eligibility in assisted housing programs based on Median Family Income for metropolitan and non-metropolitan areas.",
+            "scripts": ["process.py"],
+            "import_inputs": [
+                {
+                    "template_mcf": "hud.tmcf",
+                    "cleaned_csv": "csv/output_all_years.csv"
+                }
+            ],
+            "cron_schedule": "15 22 * * 4"
+        }
+    ]
+}
diff --git a/scripts/us_hud/income/process.py b/scripts/us_hud/income/process.py
index fb9fc767b9..6559f69503 100644
--- a/scripts/us_hud/income/process.py
+++ b/scripts/us_hud/income/process.py
@@ -1,30 +1,32 @@
 # Copyright 2023 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 '''Generates cleaned CSVs for HUD Income Limits data.
 
 Produces:
-* csv/output_[YEAR].csv
+* csv/output_all_years.csv
 
 Usage:
 python3 process.py
 '''
 import csv
 import datetime
 import os
 import pandas as pd
 from absl import app
 from absl import flags
+from typing import IO, Iterator
+import python_calamine
 
 FLAGS = flags.FLAGS
 flags.DEFINE_string('income_output_dir', 'csv', 'Path to write cleaned CSVs.')
@@ -33,17 +35,14 @@
 
 
 def get_url(year):
-    '''Return xls url for year.
-
-    Args:
-      year: Input year.
-
-    Returns:
-      xls url for given year.
-    '''
+    '''Return xls url for year.'''
     if year < 2006:
         return ''
     suffix = str(year)[-2:]
+    if year == 2023:
+        return 'Section8-FY23.xlsx'  # Local copy of the FY23 workbook; read in process()
+    elif year == 2024:
+        return 'Section8-FY24.xlsx'  # Local copy of the FY24 workbook; read in process()
     if year >= 2016:
         return f'{URL_PREFIX}{suffix}/Section8-FY{suffix}.xlsx'
     elif year == 2015:
@@ -64,68 +63,107 @@ def get_url(year):
     return ''
 
 
-def compute_150(df, person):
-    '''Compute 150th percentile income in-place.
+def iter_excel_calamine(file: IO[bytes]) -> Iterator[dict[str, object]]:
+    '''Yields rows of the first sheet as header-keyed dicts, via python_calamine.'''
+    workbook = python_calamine.CalamineWorkbook.from_filelike(
+        file)  # type: ignore[arg-type]
+    rows = iter(workbook.get_sheet_by_index(0).to_python())
+    headers = list(map(str, next(rows)))  # Get headers from the first row
+    for row in rows:
+        yield dict(zip(headers, row))
+
 
-    Args:
-      df: Input dataframe (will be modified).
-      person: Number of people in household.
-    '''
+def compute_150(df, person):
+    '''Compute the 150% income limit from the 80% limit, in-place.'''
     df[f'l150_{person}'] = df.apply(
         lambda x: round(x[f'l80_{person}'] / 80 * 150), axis=1)
 
 
-def process(year, matches, output_dir):
-    '''Generate cleaned CSV.
-
-    Args:
-      year: Input year.
-      matches: Map of fips dcid -> city dcid.
-      output_dir: Directory to write cleaned CSV.
-    '''
+def process(year, matches, output_data):
+    '''Generate cleaned data and accumulate it in output_data.'''
     url = get_url(year)
-    try:
-        df = pd.read_excel(url)
-    except:
-        print(f'No file found for {url}.')
-        return
+
+    # 2023 and 2024 are read from local .xlsx files via python_calamine
+    if year in (2023, 2024):
+        try:
+            with open(url, 'rb') as f:
+                rows = iter_excel_calamine(f)
+                data = list(rows)  # Collect all rows as a list of dicts
+                df = pd.DataFrame(data)
+        except FileNotFoundError:
+            print(f'No file found for {year}: {url}.')
+            return
+    else:
+        # For other years, download via URL
+        try:
+            df = pd.read_excel(url)
+        except Exception:
+            print(f'No file found for {url}.')
+            return
+
+    # Process the DataFrame (common code for all years)
     if 'fips2010' in df:
         df = df.rename(columns={'fips2010': 'fips'})
 
-    # Filter to 80th percentile income stats for each household size.
+    # Filter to 80th percentile income stats for each household size
     df = df.loc[:, [
         'fips', 'l80_1', 'l80_2', 'l80_3', 'l80_4', 'l80_5', 'l80_6', 'l80_7',
         'l80_8'
     ]]
+
+    # Format FIPS codes as geoId dcids, trimming the '99999' placeholder suffix
     df['fips'] = df.apply(lambda x: 'dcs:geoId/' + str(x['fips']).zfill(10),
                           axis=1)
     df['fips'] = df.apply(lambda x: x['fips'][:-5]
                           if x['fips'][-5:] == '99999' else x['fips'],
                           axis=1)
+
+    # Compute 150% income limits for each household size
     for i in range(1, 9):
         compute_150(df, i)
-    df['year'] = [year for i in range(len(df))]
 
-    # Add stats for matching dcids.
+    # Add year column
+    df['year'] = year
+
+    # Add stats for matching dcids
     df_match = df.copy().loc[df['fips'].isin(matches)]
     if not df_match.empty:
         df_match['fips'] = df_match.apply(lambda x: matches[x['fips']], axis=1)
         df = pd.concat([df, df_match])
-    df.to_csv(os.path.join(output_dir, f'output_{year}.csv'), index=False)
+    # Append this year's data to the output_data list
+    output_data.append(df)
 
 
 def main(argv):
+    '''Processes data for all years and merges it into a single CSV.'''
     with open('match_bq.csv') as f:
         reader = csv.DictReader(f)
         matches = {'dcs:' + row['fips']: 'dcs:' + row['city'] for row in reader}
+
+    # Ensure the output directory exists
     if not os.path.exists(FLAGS.income_output_dir):
         os.makedirs(FLAGS.income_output_dir)
+
     today = datetime.date.today()
+
+    # List to accumulate all data
+    output_data = []
+
+    # Process data for years 2006 through the current year
+    for year in range(2006, today.year + 1):
         print(year)
-        process(year, matches, FLAGS.income_output_dir)
+        process(year, matches, output_data)
+
+    # Concatenate all DataFrames in output_data into a single DataFrame
+    final_df = pd.concat(output_data, ignore_index=True)
+
+    # Save the merged data to a single CSV
+    final_df.to_csv(os.path.join(FLAGS.income_output_dir,
+                                 'output_all_years.csv'),
+                    index=False)
+    print(
+        f'Merged data saved to {FLAGS.income_output_dir}/output_all_years.csv')
 
 
 if __name__ == '__main__':
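
Note on the new Excel path: `iter_excel_calamine` streams the first sheet as header-keyed dicts, which `process()` then wraps in a DataFrame. A minimal standalone sketch of that flow for spot-checking a workbook locally, assuming a local copy of `Section8-FY23.xlsx` (the filename `get_url` returns for 2023); `read_first_sheet` is a hypothetical helper, not part of this patch:

```python
import pandas as pd
import python_calamine


def read_first_sheet(path: str) -> pd.DataFrame:
    '''Read the first sheet of an .xlsx file into a DataFrame via calamine.'''
    with open(path, 'rb') as f:
        workbook = python_calamine.CalamineWorkbook.from_filelike(f)
        # to_python() materializes the sheet as a list of row lists
        raw_rows = workbook.get_sheet_by_index(0).to_python()
    rows = iter(raw_rows)
    headers = list(map(str, next(rows)))  # First row holds the column names
    return pd.DataFrame([dict(zip(headers, row)) for row in rows])


df = read_first_sheet('Section8-FY23.xlsx')
print(df[['fips', 'l80_1']].head())  # Columns process() expects in HUD data
```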
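For reviewers checking the `l150_*` math: `compute_150` extrapolates linearly from the published limits, i.e. `l150_n = round(l80_n / 80 * 150)`, which treats `l80_n` as 80% of the area median income. A quick worked check (the input value is illustrative, not real HUD data):

```python
l80_1 = 48000  # Illustrative 80% limit for a 1-person household
l150_1 = round(l80_1 / 80 * 150)  # 48000 / 80 = 600 per percentage point
assert l150_1 == 90000
```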