SatcherInstitute · benhammondmusic · Jan 23, 2025 · Jan 23, 2025 · Jan 23, 2025
diff --git a/python/datasources/cawp.py b/python/datasources/cawp.py
@@ -566,7 +566,8 @@ def get_us_congress_totals_df():
     for legislator in raw_legislators_json:
         # and each term they served
         for term in legislator[TERMS]:
-            term_years = list(range(int(term[START][:4]), int(term[END][:4]) + 1))
+
+            term_years = extract_term_years(term)
 
             # and each year of each term
             for year in term_years:
@@ -651,12 +652,12 @@ def get_women_dfs():
             columns "time_period" by year and "state_postal", "race_ethnicity"
             with specific CAWP race strings"""
 
-    df = gcs_to_bq_util.load_csv_as_df_from_data_dir("cawp", CAWP_LINE_ITEMS_FILE)
-
-    # keep only needed cols
-    df = df[[ID, YEAR, STATE, FIRST_NAME, LAST_NAME, POSITION, RACE_ETH]]
+    df = gcs_to_bq_util.load_csv_as_df_from_data_dir(
+        "cawp", CAWP_LINE_ITEMS_FILE, usecols=[ID, YEAR, STATE, FIRST_NAME, LAST_NAME, POSITION, RACE_ETH]
+    )
 
-    df = df.dropna(subset=[STATE])
+    # keep only valid rows
+    df = df.dropna(subset=[STATE, RACE_ETH])
 
     # standardize postal codes (can't just swap codes because Michigan is also MI)
     df[STATE] = df[STATE].replace(
@@ -1060,3 +1061,22 @@ def handle_other_and_multi_races(df):
     df = df.explode(RACE_ETH)
 
     return df
+
+
+def extract_term_years(term):
+    """
+    List the calendar years covered by a term, omitting the end year when the term ended on or before January 7.
+
+    Args:
+        term (dict): Term entry whose START and END values are "YYYY-MM-DD" date strings
+
+    Returns:
+        list: Integer years from the start year through the end year (or the year before, for early-January ends)
+    """
+    term_years = list(range(int(term[START][:4]), int(term[END][:4]) + 1))
+
+    # Ended in the first week of January: don't count that year (to align with CAWP); [8:10] is the two-digit day
+    if term[END][5:7] == "01" and int(term[END][8:10]) <= 7:
+        term_years = term_years[:-1]
+
+    return term_years
diff --git a/python/tests/data/cawp/mock_territory_leg_tables/cawp_state_leg_60.csv b/python/tests/data/cawp/mock_territory_leg_tables/cawp_state_leg_60.csv
@@ -57,3 +57,4 @@ time_period,state_fips,total_state_leg_count
 "2022","60",39
 "2023","60",39
 "2024","60",39
+"2025","60",39
diff --git a/python/tests/datasources/test_cawp.py b/python/tests/datasources/test_cawp.py
@@ -9,6 +9,7 @@
     US_CONGRESS_HISTORICAL_URL,
     US_CONGRESS_CURRENT_URL,
     get_consecutive_time_periods,
+    extract_term_years,
     FIPS_TO_STATE_TABLE_MAP,
 )
 
@@ -17,6 +18,46 @@
 # UNIT TESTS
 
 
+def test_extract_term_years():
+    """A term ending in the first days of January drops its final year; a term ending later keeps it."""
+
+    # "rep" term that ends on 2019-01-03, so 2019 is not counted
+    rep_term_ending_early_jan = {
+        "type": "rep",
+        "start": "2017-01-03",
+        "end": "2019-01-03",
+        "state": "MI",
+        "district": 5,
+        "party": "Democrat",
+        "phone": "202-225-3611",
+        "url": "https://dankildee.house.gov",
+        "rss_url": "http://dankildee.house.gov/rss.xml",
+        "address": "227 Cannon House Office Building; Washington DC 20515-2205",
+        "office": "227 Cannon House Office Building",
+        "fax": "202-225-6393",
+    }
+    years_without_final_jan = extract_term_years(rep_term_ending_early_jan)
+    assert years_without_final_jan == [2017, 2018]
+
+    # "sen" term that ends on 2024-11-05, so the end year is still counted
+    sen_term_ending_in_nov = {
+        "type": "sen",
+        "start": "2023-01-23",
+        "end": "2024-11-05",
+        "how": "appointment",
+        "end-type": "special-election",
+        "state": "NE",
+        "class": 2,
+        "state_rank": "junior",
+        "party": "Republican",
+        "url": "https://www.ricketts.senate.gov",
+        "address": "139 Russell Senate Office Building Washington DC 20510",
+        "office": "139 Russell Senate Office Building",
+        "phone": "202-224-4224",
+    }
+    assert extract_term_years(sen_term_ending_in_nov) == [2023, 2024]
+
+
 def test_get_consecutive_time_periods():
     assert get_consecutive_time_periods(2020, 2022) == ["2020", "2021", "2022"]
     default_time_periods = get_consecutive_time_periods()
@@ -57,6 +98,8 @@ def _load_csv_as_df_from_data_dir(*args, **kwargs):
 
     print("MOCK READ FROM /data:", filename, kwargs)
 
+    usecols = kwargs.get("usecols", None)
+
     if filename == "cawp-by_race_and_ethnicity_time_series.csv":
         # READ IN CAWP DB (numerators)
         test_input_data_types = {
@@ -76,6 +119,7 @@ def _load_csv_as_df_from_data_dir(*args, **kwargs):
             os.path.join(TEST_DIR, f"test_input_{filename}"),
             dtype=test_input_data_types,
             index_col=False,
+            usecols=usecols,
         )
     else:
         # READ IN MANUAL TERRITORY STATELEG TOTAL TABLES