Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pipeline: Fix CAWP error with terms ending in early January #3926

Merged
merged 2 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 26 additions & 6 deletions python/datasources/cawp.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,8 @@ def get_us_congress_totals_df():
for legislator in raw_legislators_json:
# and each term they served
for term in legislator[TERMS]:
term_years = list(range(int(term[START][:4]), int(term[END][:4]) + 1))

term_years = extract_term_years(term)

# and each year of each term
for year in term_years:
Expand Down Expand Up @@ -651,12 +652,12 @@ def get_women_dfs():
columns "time_period" by year and "state_postal", "race_ethnicity"
with specific CAWP race strings"""

df = gcs_to_bq_util.load_csv_as_df_from_data_dir("cawp", CAWP_LINE_ITEMS_FILE)

# keep only needed cols
df = df[[ID, YEAR, STATE, FIRST_NAME, LAST_NAME, POSITION, RACE_ETH]]
df = gcs_to_bq_util.load_csv_as_df_from_data_dir(
"cawp", CAWP_LINE_ITEMS_FILE, usecols=[ID, YEAR, STATE, FIRST_NAME, LAST_NAME, POSITION, RACE_ETH]
)

df = df.dropna(subset=[STATE])
# keep only valid rows
df = df.dropna(subset=[STATE, RACE_ETH])

# standardize postal codes (can't just swap codes because Michigan is also MI)
df[STATE] = df[STATE].replace(
Expand Down Expand Up @@ -1060,3 +1061,22 @@ def handle_other_and_multi_races(df):
df = df.explode(RACE_ETH)

return df


def extract_term_years(term):
    """
    Extract years from a term, with special handling for those finishing their term in the first week of January.

    The start and end values are sliced by position, so they are expected to be
    date strings laid out as "YYYY-MM-DD".

    Args:
        term (dict): Dictionary containing start and end date keys

    Returns:
        list: Years of the term, from the start year through the end year inclusive;
            the end year is left out when the term ended on January 1st through 7th
    """
    end_date = term[END]
    term_years = list(range(int(term[START][:4]), int(end_date[:4]) + 1))

    # If the term ended the first week of January, don't count that year (to align with CAWP).
    # The day of month is the two characters at [8:10]; reading only the final character
    # would misinterpret e.g. January 20th as day 0 and wrongly drop the year.
    if end_date[5:7] == "01" and int(end_date[8:10]) <= 7:
        term_years = term_years[:-1]

    return term_years
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,4 @@ time_period,state_fips,total_state_leg_count
"2022","60",39
"2023","60",39
"2024","60",39
"2025","60",39
44 changes: 44 additions & 0 deletions python/tests/datasources/test_cawp.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
US_CONGRESS_HISTORICAL_URL,
US_CONGRESS_CURRENT_URL,
get_consecutive_time_periods,
extract_term_years,
FIPS_TO_STATE_TABLE_MAP,
)

Expand All @@ -17,6 +18,46 @@
# UNIT TESTS


def test_extract_term_years():
    """Check the years returned for a term ending on January 3rd and for one ending in November."""

    # Term whose end date is January 3rd: the end year is expected to be left out
    early_january_term = {
        "type": "rep",
        "start": "2017-01-03",
        "end": "2019-01-03",
        "state": "MI",
        "district": 5,
        "party": "Democrat",
        "phone": "202-225-3611",
        "url": "https://dankildee.house.gov",
        "rss_url": "http://dankildee.house.gov/rss.xml",
        "address": "227 Cannon House Office Building; Washington DC 20515-2205",
        "office": "227 Cannon House Office Building",
        "fax": "202-225-6393",
    }
    assert extract_term_years(early_january_term) == [2017, 2018]

    # Term whose end date is in November: both the start and end years are expected
    november_end_term = {
        "type": "sen",
        "start": "2023-01-23",
        "end": "2024-11-05",
        "how": "appointment",
        "end-type": "special-election",
        "state": "NE",
        "class": 2,
        "state_rank": "junior",
        "party": "Republican",
        "url": "https://www.ricketts.senate.gov",
        "address": "139 Russell Senate Office Building Washington DC 20510",
        "office": "139 Russell Senate Office Building",
        "phone": "202-224-4224",
    }
    assert extract_term_years(november_end_term) == [2023, 2024]


def test_get_consecutive_time_periods():
assert get_consecutive_time_periods(2020, 2022) == ["2020", "2021", "2022"]
default_time_periods = get_consecutive_time_periods()
Expand Down Expand Up @@ -57,6 +98,8 @@ def _load_csv_as_df_from_data_dir(*args, **kwargs):

print("MOCK READ FROM /data:", filename, kwargs)

usecols = kwargs.get("usecols", None)

if filename == "cawp-by_race_and_ethnicity_time_series.csv":
# READ IN CAWP DB (numerators)
test_input_data_types = {
Expand All @@ -76,6 +119,7 @@ def _load_csv_as_df_from_data_dir(*args, **kwargs):
os.path.join(TEST_DIR, f"test_input_{filename}"),
dtype=test_input_data_types,
index_col=False,
usecols=usecols,
)
else:
# READ IN MANUAL TERRITORY STATELEG TOTAL TABLES
Expand Down
Loading