Take the most recently reported generator operating date when there's no 70%+ consistently reported date #3697

Merged 6 commits on Jun 26, 2024
Changes from 4 commits
6 changes: 5 additions & 1 deletion docs/release_notes.rst
@@ -27,7 +27,11 @@ EIA 860
regarding energy storage and solar generators. See issue :issue:`3676` and PR
:pr:`3681`.


Data Cleaning
^^^^^^^^^^^^^
* When ``generator_operating_date`` values are too inconsistent to be harvested
successfully, we now take the last reported date in EIA 860 and 860M. See
:issue:`423` and PR :pr:`3967`.

.. _release-v2024.5.0:

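As background for the release note above, the 70% figure in the PR title refers to the harvesting consistency check: a value is only harvested when enough of the reported records agree on it. A minimal illustrative sketch of that idea in pandas (the toy data, the `consistency_threshold` name, and the exact check are assumptions, not PUDL's actual implementation):

```python
import pandas as pd

# Toy records: one generator reporting its operating date across several years.
reported = pd.DataFrame({
    "plant_id_eia": [1, 1, 1, 1],
    "generator_id": ["a", "a", "a", "a"],
    "report_date": pd.to_datetime(
        ["2019-01-01", "2020-01-01", "2021-01-01", "2022-01-01"]
    ),
    "generator_operating_date": pd.to_datetime(
        ["2005-06-01", "2005-06-01", "2005-07-01", "2005-06-15"]
    ),
})

consistency_threshold = 0.7  # assumed value, mirroring the 70% in the PR title

# Share of records agreeing on the most common reported value.
top_share = (
    reported["generator_operating_date"].value_counts(normalize=True).iloc[0]
)
if top_share < consistency_threshold:
    # Inconsistent: fall back to the most recently reported date.
    fallback = (
        reported.sort_values("report_date")["generator_operating_date"].iloc[-1]
    )
```

Here the most common value covers only 50% of the reports, so the fallback kicks in and takes the date from the 2022 report.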
2 changes: 1 addition & 1 deletion src/pudl/metadata/fields.py
@@ -1826,7 +1826,7 @@
},
"generator_operating_date": {
"type": "date",
"description": "Date the generator began commercial operation.",
"description": "Date the generator began commercial operation. If harvested values are <70 percent consistent, we default to using the most recently reported date.",
Member commented:
We might not want to hard-code the required consistency in the description since it can be changed, and we have changed it in some special cases.
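One way to avoid hard-coding the number, sketched here purely as an illustration of the comment's point, is to keep the threshold in a single constant and interpolate it into the description (`CONSISTENCY_THRESHOLD` is a hypothetical name, not an existing PUDL symbol):

```python
# Hypothetical module-level constant; special cases could override it locally.
CONSISTENCY_THRESHOLD = 0.7

DESCRIPTION = (
    "Date the generator began commercial operation. If harvested values are "
    f"less than {CONSISTENCY_THRESHOLD:.0%} consistent, we default to using "
    "the most recently reported date."
)
```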

},
"generator_operating_year": {
"type": "integer",
51 changes: 28 additions & 23 deletions src/pudl/transform/eia.py
@@ -19,7 +19,6 @@
import importlib.resources
from collections import namedtuple
from enum import StrEnum, auto
from typing import Literal

import networkx as nx
import numpy as np
@@ -188,6 +187,7 @@ def _lat_long(
col: str,
cols_to_consit: list[str],
round_to: int = 2,
**kwargs,
) -> pd.DataFrame:
"""Harvests more complete lat/long in special cases.

@@ -231,21 +231,19 @@ def _lat_long(
return ll_clean_df


def _round_operating_date(
def _last_operating_date(
dirty_df: pd.DataFrame,
clean_df: pd.DataFrame,
entity_id_df: pd.DataFrame,
entity_idx: list[str],
col: str,
cols_to_consit: list[str],
group_by_freq: Literal["M", "Y"],
**kwargs,
) -> pd.DataFrame:
"""Harvests operating dates by combining dates within the selected group_by_freq.
"""When there's no consistent generator operating date, take the last reported one.

For all of the entities where there is not a consistent enough reported
operating date, this function reduces the precision of the reported operating date
by only keeping the last record when records are within the time bandwidth
(default of one year) of one another.
operating date, this function keeps the most recently reported date.

Args:
dirty_df: a dataframe with entity records that have inconsistently reported
@@ -260,37 +258,44 @@
cols_to_consit: a list of the columns to determine consistency. This is either the
[entity_id] or the [entity_id, 'report_date'], depending on whether the
entity is static or annual.
group_by_freq: Frequency to combine by ("M" for month, or "Y" for year)

Returns:
A dataframe with all of the entity ids. Some will have harvested records from
the clean_df. Some will have NA values if no consistently reported records were
found.
"""
# grab the dirty plant records, round and get a new consistency
op_df = dirty_df.assign(
operating_rounded=dirty_df[col].dt.to_period(group_by_freq).dt.to_timestamp()
# take the last reported date for each unharvested generator.
grouped = (
dirty_df.sort_values("report_date").groupby(by=entity_idx)[col].agg(["last"])
)

logger.info(
f"Taking the last generator operating date reported for {len(grouped)} generators."
)
logger.debug(f"Dirty {col} records: {len(op_df)}")
op_df = dirty_df.merge(grouped, on=entity_idx)

op_df["last"] = op_df["last"].fillna(
op_df[col]
) # If no value reported in last time slot, keep original.
op_df = op_df.drop(columns=col).rename(columns={"last": col})

# Group all records within the same rounded time period and assign them the max
# value within that time period.
op_df[col] = op_df.groupby(cols_to_consit + ["operating_rounded"])[col].transform(
"max"
)
op_df["table"] = "special_case"
op_df = op_df.drop("operating_rounded", axis=1)
op_df = occurrence_consistency(entity_idx, op_df, col, cols_to_consit)
# grab the clean plants
op_clean_df = clean_df.dropna()
# find the new clean plant records by selecting the True consistent records
op_df = op_df[op_df[f"{col}_is_consistent"]].drop_duplicates(subset=entity_idx)
logger.info(f"Clean {col} records: {len(op_df)}")
logger.info(f"Rescued dates for {col} records: {len(op_df)}")
logger.info(
f"Rescued rounded {col} for the following units ({entity_idx}): "
f"Rescued last {col} for the following units ({entity_idx}): "
f"{sorted(op_df[entity_idx].apply(lambda row: '_'.join(row.to_numpy().astype(str)), axis=1))}"
)
# add the newly cleaned records
op_clean_df = pd.concat([op_clean_df, op_df])
# assert all generator operating dates are not null
assert len(op_clean_df[op_clean_df.generator_operating_date.isnull()]) == 0
Member commented:
This is a minor non-blocking nit, but it feels more readable to use any()/all() for this kind of test. E.g.

Suggested change:
- assert len(op_clean_df[op_clean_df.generator_operating_date.isnull()]) == 0
+ assert op_clean_df.generator_operating_date.notnull().all()
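The two forms are equivalent for a DataFrame column; a quick self-contained check of that equivalence:

```python
import pandas as pd

s = pd.Series(pd.to_datetime(["2005-06-01", "2010-01-01"]))
# Both assertions hold or fail together for any series s.
assert (len(s[s.isnull()]) == 0) == s.notnull().all()
```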

# merge onto the plants df w/ all plant ids
op_clean_df = entity_id_df.merge(op_clean_df, how="outer")
return op_clean_df
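The heart of the new `_last_operating_date` is the sort → groupby → `last` → merge pattern shown in the diff above. A self-contained sketch of the same pattern on toy data (the data is invented; the column names mirror the diff):

```python
import pandas as pd

entity_idx = ["plant_id_eia", "generator_id"]
col = "generator_operating_date"

dirty_df = pd.DataFrame({
    "plant_id_eia": [1, 1, 1],
    "generator_id": ["a", "a", "a"],
    "report_date": pd.to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]),
    col: pd.to_datetime(["2005-06-01", "2005-07-01", None]),
})

# Last reported value per generator, in report_date order. The "last"
# aggregation skips nulls, so here it yields 2005-07-01 despite the
# blank 2022 report.
grouped = (
    dirty_df.sort_values("report_date").groupby(by=entity_idx)[col].agg(["last"])
)

# Merge matches the named index levels of `grouped` against the id columns.
op_df = dirty_df.merge(grouped, on=entity_idx)
op_df["last"] = op_df["last"].fillna(op_df[col])  # keep original if "last" is null
op_df = op_df.drop(columns=col).rename(columns={"last": col})
```

Because the `last` aggregation skips nulls, a generator whose final report left the date blank still picks up its most recent non-null date; the `fillna` is a guard for rows whose group produced no value at all.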
@@ -540,9 +545,9 @@ def harvest_entity_tables( # noqa: C901
entity_df = entity_id_df.copy()
annual_df = annual_id_df.copy()
special_case_cols = {
"latitude": [_lat_long, 1],
"longitude": [_lat_long, 1],
"generator_operating_date": [_round_operating_date, "Y"],
"latitude": {"method": _lat_long, "round_to": 1},
"longitude": {"method": _lat_long, "round_to": 1},
"generator_operating_date": {"method": _last_operating_date},
}
consistency = pd.DataFrame(
columns=["column", "consistent_ratio", "wrongos", "total"]
@@ -587,14 +592,14 @@
dirty_df = col_df.merge(clean_df[clean_df[col].isnull()][id_cols])

if col in special_case_cols:
clean_df = special_case_cols[col][0](
clean_df = special_case_cols[col]["method"](
dirty_df,
clean_df,
entity_id_df,
id_cols,
col,
cols_to_consit,
special_case_cols[col][1],
**special_case_cols[col],
)
if col in static_cols:
clean_df = clean_df[id_cols + [col]]
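The switch from positional lists to keyword dicts in `special_case_cols` pairs with the new `**kwargs` parameters on the handlers: `**special_case_cols[col]` passes every key, including `method` itself, as keyword arguments, and each handler keeps the options it needs while absorbing the rest. A stripped-down sketch of the pattern (handler bodies are stand-ins, not the real implementations):

```python
import pandas as pd

def _lat_long(df: pd.DataFrame, col: str, round_to: int = 2, **kwargs) -> pd.DataFrame:
    # Stand-in: round coordinates to reduce spurious inconsistency.
    return df.assign(**{col: df[col].round(round_to)})

def _last_operating_date(df: pd.DataFrame, col: str, **kwargs) -> pd.DataFrame:
    # Stand-in: keep the most recently reported value per entity.
    return df

special_case_cols = {
    "latitude": {"method": _lat_long, "round_to": 1},
    "longitude": {"method": _lat_long, "round_to": 1},
    "generator_operating_date": {"method": _last_operating_date},
}

col = "latitude"
df = pd.DataFrame({"latitude": [40.7128, 40.7139]})
# "method" is unpacked as a kwarg too; **kwargs in each handler absorbs it.
cleaned = special_case_cols[col]["method"](df, col, **special_case_cols[col])
```

Each handler only declares the keyword options it actually uses (`round_to` for the coordinate handler, nothing extra for the operating-date one), which is what lets the dispatch site stay generic.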