Add a defensive check to make sure that best-snapshot and apply-diffs are never *that* far off from each other.
jdangerx committed Apr 10, 2024
1 parent 41a185d commit 7f32384
Showing 1 changed file with 65 additions and 2 deletions: src/pudl/io_managers.py
@@ -703,15 +703,78 @@ def filter_for_freshest_data(
         we keep the old non-null value, which may be erroneous. This appears to
         be fairly rare, affecting < 0.005% of reported values.
         """
+
+        def __apply_diffs(
+            duped_groups: pd.core.groupby.DataFrameGroupBy,
+        ) -> pd.DataFrame:
+            """Take the latest reported non-null value for each group."""
+            return duped_groups.last()
+
+        def __best_snapshot(
+            duped_groups: pd.core.groupby.DataFrameGroupBy,
+        ) -> pd.DataFrame:
+            """Take the row that has most non-null values out of each group."""
+            return duped_groups.apply(
+                lambda df: df.assign(count=df.count(axis="columns"))
+                .sort_values(by="count", ascending=True)
+                .tail(1)
+            ).drop(columns="count")
+
+        def __compare_dedupe_methodologies(
+            apply_diffs: pd.DataFrame, best_snapshot: pd.DataFrame
+        ):
+            """Compare deduplication methodologies.
+
+            By cross-referencing these we can make sure that the apply-diff
+            methodology isn't doing something unexpected.
+
+            The main thing we want to keep tabs on is apply-diff adding new
+            non-null values compared to best-snapshot, because some of those
+            are instances of a value correctly being reported as `null`.
+
+            Instead of stacking the two datasets, merging by context, and then
+            looking for left_only or right_only values, we just count non-null
+            values. This is because we would want to use the report_year as a
+            merge key, but that isn't available until after we pipe the
+            dataframe through `refine_report_year`.
+            """
+            n_diffs = apply_diffs.count().sum()
+            n_best = best_snapshot.count().sum()
+
+            if n_diffs < n_best:
+                raise ValueError(
+                    f"Found {n_diffs} non-null values with apply-diffs "
+                    f"methodology, and {n_best} with best-snapshot. "
+                    "apply-diffs should be >= best-snapshot."
+                )
+
+            # 2024-04-10: this threshold set by looking at existing values for FERC
+            # <=2022.
+            threshold_pct = 0.3
+            if n_diffs / n_best > (1 + threshold_pct / 100):
+                raise ValueError(
+                    f"Found {n_diffs} non-null values with apply-diffs "
+                    f"methodology, and {n_best} with best-snapshot. "
+                    f"apply-diffs shouldn't be more than {threshold_pct}% "
+                    "greater than best-snapshot."
+                )
+
         filing_metadata_cols = {"publication_time", "filing_name"}
         xbrl_context_cols = [c for c in primary_key if c not in filing_metadata_cols]
         original = table.sort_values("publication_time")
         dupe_mask = original.duplicated(subset=xbrl_context_cols, keep=False)
         duped_groups = original.loc[dupe_mask].groupby(
-            xbrl_context_cols, as_index=False
+            xbrl_context_cols, as_index=False, dropna=True
         )
         never_duped = original.loc[~dupe_mask]
-        return pd.concat([never_duped, duped_groups.last()], ignore_index=True)
+        apply_diffs = __apply_diffs(duped_groups)
+        best_snapshot = __best_snapshot(duped_groups)
+        __compare_dedupe_methodologies(
+            apply_diffs=apply_diffs, best_snapshot=best_snapshot
+        )
+
+        deduped = pd.concat([never_duped, apply_diffs], ignore_index=True)
+        return deduped
 
     @staticmethod
     def refine_report_year(df: pd.DataFrame, xbrl_years: list[int]) -> pd.DataFrame:
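
To see why apply-diffs can only ever gain non-null values relative to best-snapshot, here is a minimal, self-contained sketch of the two strategies on a toy table. This is not from the commit: the column names (entity_id, publication_time, revenue, expenses) are invented for illustration, and a single grouping key stands in for xbrl_context_cols.

import pandas as pd

# Two filings for the same entity: the later one adds `expenses` but
# omits `revenue`. These column names are hypothetical.
table = pd.DataFrame(
    {
        "entity_id": ["c1", "c1"],
        "publication_time": pd.to_datetime(["2023-01-01", "2023-06-01"]),
        "revenue": [100.0, None],
        "expenses": [None, 50.0],
    }
)

groups = table.sort_values("publication_time").groupby("entity_id", as_index=False)

# "Apply diffs": groupby.last() takes the latest *non-null* value per
# column, so values from different filings get stitched into one record.
apply_diffs = groups.last()  # revenue=100.0 and expenses=50.0 both survive

# "Best snapshot": keep the single densest filing as a whole row.
best_snapshot = groups.apply(
    lambda df: df.assign(count=df.count(axis="columns"))
    .sort_values(by="count")
    .tail(1)
).drop(columns="count")  # only one of revenue/expenses is non-null

# The defensive check from the commit, in miniature: per column, last()
# is non-null whenever any row in the group is, so stitching can only
# add non-null values, never lose them.
n_diffs = apply_diffs.count().sum()  # 4
n_best = best_snapshot.count().sum()  # 3
assert n_diffs >= n_best

On this contrived input the gain is large (4 vs. 3 non-null values, a 33% increase), so the commit's 0.3% threshold would trip. That is the point of the check: in real FERC filings this kind of cross-filing stitching should be rare, and a big gap between the two methodologies signals that apply-diffs is resurrecting values that a later filing deliberately reported as null.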