Remove COLUMN_DTYPES and switch to field metadata dictionary #1408

Merged: 11 commits, Jan 21, 2022
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -24,7 +24,7 @@ repos:

# Attempt to enforce standardized PEP8 style formatting:
- repo: https://github.com/pre-commit/mirrors-autopep8
- rev: v1.5.7
+ rev: v1.6.0
hooks:
- id: autopep8
exclude: ^docs/conf.py$
@@ -35,7 +35,7 @@ repos:

# Make sure import statements are sorted uniformly.
- repo: https://github.com/pre-commit/mirrors-isort
- rev: v5.9.3
+ rev: v5.10.1
hooks:
- id: isort

469 changes: 2 additions & 467 deletions src/pudl/constants.py

Large diffs are not rendered by default.
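The 467 deleted lines are the nested COLUMN_DTYPES dictionary itself, which is not rendered above. A rough before/after sketch of the pattern being replaced, based only on the usages visible in the other hunks of this PR; the column names and dictionary contents below are illustrative placeholders, not the actual deleted code:

```python
import pandas as pd

from pudl.metadata.fields import get_pandas_dtypes

# Old pattern (removed here): a hand-maintained nested dict in pudl.constants,
# keyed by data source and then by column name. Entries are illustrative.
COLUMN_DTYPES = {
    "epacems": {
        "plant_id_eia": pd.Int64Dtype(),
        "unitid": pd.StringDtype(),
    },
}
old_dtype = COLUMN_DTYPES["epacems"]["plant_id_eia"]

# New pattern: the column -> dtype mapping is derived from the field metadata
# and looked up per group, as the epacems and helpers hunks below do.
new_dtype = get_pandas_dtypes(group="epacems")["plant_id_eia"]
```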

4 changes: 2 additions & 2 deletions src/pudl/extract/eia861.py
@@ -11,7 +11,6 @@

import pandas as pd

- from pudl import constants as pc
from pudl.extract import excel
from pudl.helpers import fix_leading_zero_gen_ids

@@ -59,4 +58,5 @@ def get_dtypes(page, **partition):
return {
"Plant ID": pd.Int64Dtype(),
"Plant Id": pd.Int64Dtype(),
"zip_code": pc.COLUMN_DTYPES['eia']['zip_code']}
"zip_code": pd.StringDtype(),
}
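
For context on why the ZIP code column gets pinned to a nullable string dtype rather than relying on inference: integer inference drops leading zeros, and that loss is unrecoverable afterward. A minimal, standalone demonstration of the behavior (using a CSV for simplicity; the extractor itself reads Excel workbooks):

```python
import io

import pandas as pd

data = io.StringIO("zip_code\n08401\n02139\n")

# Without an explicit dtype, pandas infers integers and the leading zeros
# are silently dropped.
inferred = pd.read_csv(data)
print(inferred["zip_code"].tolist())  # [8401, 2139]

# Pinning the column to the nullable string dtype, as get_dtypes() now does,
# keeps the five-digit codes intact.
data.seek(0)
pinned = pd.read_csv(data, dtype={"zip_code": pd.StringDtype()})
print(pinned["zip_code"].tolist())  # ['08401', '02139']
```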
10 changes: 5 additions & 5 deletions src/pudl/extract/epacems.py
@@ -10,7 +10,7 @@

import pandas as pd

- import pudl.constants as pc
+ from pudl.metadata.fields import get_pandas_dtypes
from pudl.workspace.datastore import Datastore

logger = logging.getLogger(__name__)
@@ -129,8 +129,8 @@ def _csv_to_dataframe(self, csv_file) -> pd.DataFrame:
csv (file-like object): data to be read

Returns:
- pandas.DataFrame: A DataFrame containing the contents of the
- CSV file.
+ A DataFrame containing the contents of the CSV file.

"""
df = pd.read_csv(
csv_file,
@@ -139,8 +139,8 @@ def _csv_to_dataframe(self, csv_file) -> pd.DataFrame:
)
df = df.rename(columns=RENAME_DICT)
df = df.astype({
col: pc.COLUMN_DTYPES["epacems"][col]
for col in pc.COLUMN_DTYPES["epacems"]
col: get_pandas_dtypes(group="epacems")[col]
for col in get_pandas_dtypes(group="epacems")
if col in df.columns
})
return df
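One possible refinement of the new `_csv_to_dataframe` body, not part of this PR: the dict comprehension above calls `get_pandas_dtypes(group="epacems")` twice per invocation, once for the keys and once for the lookup. A sketch that hoists the lookup into a local variable, assuming the function simply returns a column-to-dtype mapping; the helper name `coerce_epacems_dtypes` is hypothetical:

```python
import pandas as pd

from pudl.metadata.fields import get_pandas_dtypes


def coerce_epacems_dtypes(df: pd.DataFrame) -> pd.DataFrame:
    """Cast any recognized EPA CEMS columns to their canonical dtypes."""
    # Build the column -> dtype mapping once instead of twice per call.
    epacems_dtypes = get_pandas_dtypes(group="epacems")
    return df.astype({
        col: dtype
        for col, dtype in epacems_dtypes.items()
        if col in df.columns
    })
```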
39 changes: 24 additions & 15 deletions src/pudl/helpers.py
@@ -26,6 +26,7 @@

from pudl import constants as pc
from pudl.metadata.classes import Package
+ from pudl.metadata.fields import get_pandas_dtypes

logger = logging.getLogger(__name__)

@@ -421,7 +422,7 @@ def cleanup(df, on, by):

def get_pudl_dtype(col, data_source):
"""Look up a column's canonical data type based on its PUDL data source."""
- return pc.COLUMN_DTYPES[data_source][col]
+ return get_pandas_dtypes(group=data_source)[col]


def get_pudl_dtypes(col_source_dict):
@@ -943,22 +944,30 @@ def convert_cols_dtypes(df, data_source, name=None):

"""
# get me all of the columns for the table in the constants dtype dict
- col_dtypes = {col: col_dtype for col, col_dtype
- in pc.COLUMN_DTYPES[data_source].items()
- if col in list(df.columns)}
+ col_dtypes = {
+     col: dtype for col, dtype
+     in get_pandas_dtypes(group=data_source).items()
+     if col in list(df.columns)
+ }

# grab only the boolean columns (we only need their names)
- bool_cols = {col: col_dtype for col, col_dtype
- in col_dtypes.items()
- if col_dtype == pd.BooleanDtype()}
+ bool_cols = {
+     col: dtype for col, dtype
+     in col_dtypes.items()
+     if dtype == "boolean"
+ }
# grab all of the non boolean columns
- non_bool_cols = {col: col_dtype for col, col_dtype
- in col_dtypes.items()
- if col_dtype != pd.BooleanDtype()}
+ non_bool_cols = {
+     col: dtype for col, dtype
+     in col_dtypes.items()
+     if dtype != "boolean"
+ }
# Grab only the string columns...
- string_cols = {col: col_dtype for col, col_dtype
- in col_dtypes.items()
- if col_dtype == pd.StringDtype()}
+ string_cols = {
+     col: dtype for col, dtype
+     in col_dtypes.items()
+     if dtype == "string"
+ }

# If/when we have the columns exhaustively typed, we can do it like this,
# but right now we don't have the FERC columns done, so we can't:
Expand Down Expand Up @@ -1003,9 +1012,9 @@ def convert_cols_dtypes(df, data_source, name=None):
.replace(to_replace="<NA>", value={col: pd.NA for col in string_cols})
)

- # Zip codes are highly coorelated with datatype. If they datatype gets
+ # Zip codes are highly correlated with datatype. If the datatype gets
# converted at any point it may mess up the accuracy of the data. For
- # example: 08401.0 or 8401 are both incorrect versions of 080401 that a
+ # example: 08401.0 or 8401 are both incorrect versions of 08401 that a
# simple datatype conversion cannot fix. For this reason, we use the
# zero_pad_zips function.
if any('zip_code' for col in df.columns):
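The comparisons above switch from pandas dtype objects (`pd.BooleanDtype()`) to plain dtype names (`"boolean"`, `"string"`), which suggests the field metadata dictionary now stores dtype names that pandas resolves to its nullable extension dtypes at `astype()` time. A minimal sketch of that assumption, independent of the PUDL helpers:

```python
import pandas as pd

# Dtype *names* resolve to pandas' nullable extension dtypes when cast.
df = pd.DataFrame({
    "is_active": [True, False, None],
    "zip_code": ["08401", None, "02139"],
})
df = df.astype({"is_active": "boolean", "zip_code": "string"})

assert isinstance(df.dtypes["is_active"], pd.BooleanDtype)
assert isinstance(df.dtypes["zip_code"], pd.StringDtype)
assert df["zip_code"].isna().sum() == 1  # None becomes <NA>, not the string "None"
```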
2 changes: 1 addition & 1 deletion src/pudl/metadata/enums.py
@@ -200,7 +200,7 @@
'isone',
'miso',
'nyiso',
- 'other'
+ 'other',
'pjm',
'spp',
]
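
The added comma is more than style: without it, Python's implicit string-literal concatenation fuses the two adjacent entries into a single 'otherpjm' string, so the allowed-values list was one element short. A quick demonstration:

```python
# Adjacent string literals concatenate at compile time, so the pre-fix list
# silently merged 'other' and 'pjm' into one entry.
before = ['nyiso', 'other' 'pjm', 'spp']   # missing comma, as removed above
after = ['nyiso', 'other', 'pjm', 'spp']   # with the comma, as added above

assert before == ['nyiso', 'otherpjm', 'spp']
assert 'pjm' not in before and 'pjm' in after
```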