Remove COLUMN_DTYPES and switch to field metadata dictionary #1408

Merged: 11 commits, Jan 21, 2022
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -24,7 +24,7 @@ repos:

# Attempt to enforce standardized PEP8 style formatting:
- repo: https://github.com/pre-commit/mirrors-autopep8
- rev: v1.5.7
+ rev: v1.6.0
hooks:
- id: autopep8
exclude: ^docs/conf.py$
@@ -35,7 +35,7 @@ repos:

# Make sure import statements are sorted uniformly.
- repo: https://github.com/pre-commit/mirrors-isort
- rev: v5.9.3
+ rev: v5.10.1
hooks:
- id: isort

469 changes: 2 additions & 467 deletions src/pudl/constants.py

Large diffs are not rendered by default.
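The 467 deleted lines are the nested COLUMN_DTYPES dictionary itself, which is not rendered above. A rough before/after sketch of the pattern being replaced, based only on the usages visible in the other hunks of this PR; the column names and dictionary contents below are illustrative placeholders, not the actual deleted code:

```python
import pandas as pd

from pudl.metadata.fields import get_pandas_dtypes

# Old pattern (removed here): a hand-maintained nested dict in pudl.constants,
# keyed by data source and then by column name. Entries are illustrative.
COLUMN_DTYPES = {
    "epacems": {
        "plant_id_eia": pd.Int64Dtype(),
        "unitid": pd.StringDtype(),
    },
}
old_dtype = COLUMN_DTYPES["epacems"]["plant_id_eia"]

# New pattern: the column -> dtype mapping is derived from the field metadata
# and looked up per group, as the epacems and helpers hunks below do.
new_dtype = get_pandas_dtypes(group="epacems")["plant_id_eia"]
```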

4 changes: 2 additions & 2 deletions src/pudl/extract/eia861.py
@@ -11,7 +11,6 @@

import pandas as pd

- from pudl import constants as pc
from pudl.extract import excel
from pudl.helpers import fix_leading_zero_gen_ids

@@ -59,4 +58,5 @@ def get_dtypes(page, **partition):
return {
"Plant ID": pd.Int64Dtype(),
"Plant Id": pd.Int64Dtype(),
"zip_code": pc.COLUMN_DTYPES['eia']['zip_code']}
"zip_code": pd.StringDtype(),
}
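
For context on why the ZIP code column gets pinned to a nullable string dtype rather than relying on inference: integer inference drops leading zeros, and that loss is unrecoverable afterward. A minimal, standalone demonstration of the behavior (using a CSV for simplicity; the extractor itself reads Excel workbooks):

```python
import io

import pandas as pd

data = io.StringIO("zip_code\n08401\n02139\n")

# Without an explicit dtype, pandas infers integers and the leading zeros
# are silently dropped.
inferred = pd.read_csv(data)
print(inferred["zip_code"].tolist())  # [8401, 2139]

# Pinning the column to the nullable string dtype, as get_dtypes() now does,
# keeps the five-digit codes intact.
data.seek(0)
pinned = pd.read_csv(data, dtype={"zip_code": pd.StringDtype()})
print(pinned["zip_code"].tolist())  # ['08401', '02139']
```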
10 changes: 5 additions & 5 deletions src/pudl/extract/epacems.py
@@ -10,7 +10,7 @@

import pandas as pd

- import pudl.constants as pc
+ from pudl.metadata.fields import get_pandas_dtypes
from pudl.workspace.datastore import Datastore

logger = logging.getLogger(__name__)
@@ -129,8 +129,8 @@ def _csv_to_dataframe(self, csv_file) -> pd.DataFrame:
csv (file-like object): data to be read

Returns:
- pandas.DataFrame: A DataFrame containing the contents of the
- CSV file.
+ A DataFrame containing the contents of the CSV file.

"""
df = pd.read_csv(
csv_file,
@@ -139,8 +139,8 @@ def _csv_to_dataframe(self, csv_file) -> pd.DataFrame:
)
df = df.rename(columns=RENAME_DICT)
df = df.astype({
col: pc.COLUMN_DTYPES["epacems"][col]
for col in pc.COLUMN_DTYPES["epacems"]
col: get_pandas_dtypes(group="epacems")[col]
for col in get_pandas_dtypes(group="epacems")
if col in df.columns
})
return df
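One possible refinement of the new `_csv_to_dataframe` body, not part of this PR: the dict comprehension above calls `get_pandas_dtypes(group="epacems")` twice per invocation, once for the keys and once for the lookup. A sketch that hoists the lookup into a local variable, assuming the function simply returns a column-to-dtype mapping; the helper name `coerce_epacems_dtypes` is hypothetical:

```python
import pandas as pd

from pudl.metadata.fields import get_pandas_dtypes


def coerce_epacems_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """Cast any recognized EPA CEMS columns to their canonical dtypes."""
    # Build the column -> dtype mapping once instead of twice per call.
    epacems_dtypes = get_pandas_dtypes(group="epacems")
    return df.astype({
        col: dtype
        for col, dtype in epacems_dtypes.items()
        if col in df.columns
    })
```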
39 changes: 24 additions & 15 deletions src/pudl/helpers.py
@@ -26,6 +26,7 @@

from pudl import constants as pc
from pudl.metadata.classes import Package
+ from pudl.metadata.fields import get_pandas_dtypes

logger = logging.getLogger(__name__)

@@ -421,7 +422,7 @@ def cleanup(df, on, by):

def get_pudl_dtype(col, data_source):
"""Look up a column's canonical data type based on its PUDL data source."""
- return pc.COLUMN_DTYPES[data_source][col]
+ return get_pandas_dtypes(group=data_source)[col]


def get_pudl_dtypes(col_source_dict):
@@ -943,22 +944,30 @@ def convert_cols_dtypes(df, data_source, name=None):

"""
# get me all of the columns for the table in the constants dtype dict
- col_dtypes = {col: col_dtype for col, col_dtype
- in pc.COLUMN_DTYPES[data_source].items()
- if col in list(df.columns)}
+ col_dtypes = {
+     col: dtype for col, dtype
+     in get_pandas_dtypes(group=data_source).items()
+     if col in list(df.columns)
+ }

# grab only the boolean columns (we only need their names)
- bool_cols = {col: col_dtype for col, col_dtype
- in col_dtypes.items()
- if col_dtype == pd.BooleanDtype()}
+ bool_cols = {
+     col: dtype for col, dtype
+     in col_dtypes.items()
+     if dtype == "boolean"
+ }
# grab all of the non boolean columns
- non_bool_cols = {col: col_dtype for col, col_dtype
- in col_dtypes.items()
- if col_dtype != pd.BooleanDtype()}
+ non_bool_cols = {
+     col: dtype for col, dtype
+     in col_dtypes.items()
+     if dtype != "boolean"
+ }
# Grab only the string columns...
- string_cols = {col: col_dtype for col, col_dtype
- in col_dtypes.items()
- if col_dtype == pd.StringDtype()}
+ string_cols = {
+     col: dtype for col, dtype
+     in col_dtypes.items()
+     if dtype == "string"
+ }

# If/when we have the columns exhaustively typed, we can do it like this,
# but right now we don't have the FERC columns done, so we can't:
Expand Down Expand Up @@ -1003,9 +1012,9 @@ def convert_cols_dtypes(df, data_source, name=None):
.replace(to_replace="<NA>", value={col: pd.NA for col in string_cols})
)

- # Zip codes are highly coorelated with datatype. If they datatype gets
+ # Zip codes are highly correlated with datatype. If the datatype gets
# converted at any point it may mess up the accuracy of the data. For
- # example: 08401.0 or 8401 are both incorrect versions of 080401 that a
+ # example: 08401.0 or 8401 are both incorrect versions of 08401 that a
# simple datatype conversion cannot fix. For this reason, we use the
# zero_pad_zips function.
if any('zip_code' for col in df.columns):
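The comparisons above switch from pandas dtype objects (`pd.BooleanDtype()`) to plain dtype names (`"boolean"`, `"string"`), which suggests the field metadata dictionary now stores dtype names that pandas resolves to its nullable extension dtypes at `astype()` time. A minimal sketch of that assumption, independent of the PUDL helpers:

```python
import pandas as pd

# Dtype *names* resolve to pandas' nullable extension dtypes when cast.
df = pd.DataFrame({
    "is_active": [True, False, None],
    "zip_code": ["08401", None, "02139"],
})
df = df.astype({"is_active": "boolean", "zip_code": "string"})

assert isinstance(df.dtypes["is_active"], pd.BooleanDtype)
assert isinstance(df.dtypes["zip_code"], pd.StringDtype)
assert df["zip_code"].isna().sum() == 1  # None becomes <NA>, not the string "None"
```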
2 changes: 1 addition & 1 deletion src/pudl/metadata/enums.py
@@ -200,7 +200,7 @@
'isone',
'miso',
'nyiso',
- 'other'
+ 'other',
'pjm',
'spp',
]
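
The added comma is more than style: without it, Python's implicit string-literal concatenation fuses the two adjacent entries into a single 'otherpjm' string, so the allowed-values list was one element short. A quick demonstration:

```python
# Adjacent string literals concatenate at compile time, so the pre-fix list
# silently merged 'other' and 'pjm' into one entry.
before = ['nyiso', 'other' 'pjm', 'spp']   # missing comma, as removed above
after = ['nyiso', 'other', 'pjm', 'spp']   # with the comma, as added above

assert before == ['nyiso', 'otherpjm', 'spp']
assert 'pjm' not in before and 'pjm' in after
```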