Add a defensive check to make sure that best-snapshot and apply-diffs are never *that* far off from each other.
jdangerx committed Apr 10, 2024
1 parent 41a185d commit 7f32384
Showing 1 changed file with 65 additions and 2 deletions: src/pudl/io_managers.py
@@ -703,15 +703,78 @@ def filter_for_freshest_data(
         we keep the old non-null value, which may be erroneous. This appears to
         be fairly rare, affecting < 0.005% of reported values.
         """
+
+        def __apply_diffs(
+            duped_groups: pd.core.groupby.DataFrameGroupBy,
+        ) -> pd.DataFrame:
+            """Take the latest reported non-null value for each group."""
+            return duped_groups.last()
+
+        def __best_snapshot(
+            duped_groups: pd.core.groupby.DataFrameGroupBy,
+        ) -> pd.DataFrame:
+            """Take the row that has most non-null values out of each group."""
+            return duped_groups.apply(
+                lambda df: df.assign(count=df.count(axis="columns"))
+                .sort_values(by="count", ascending=True)
+                .tail(1)
+            ).drop(columns="count")
+
+        def __compare_dedupe_methodologies(
+            apply_diffs: pd.DataFrame, best_snapshot: pd.DataFrame
+        ):
+            """Compare deduplication methodologies.
+
+            By cross-referencing these we can make sure that the apply-diff
+            methodology isn't doing something unexpected.
+
+            The main thing we want to keep tabs on is apply-diff adding new
+            non-null values compared to best-snapshot, because some of those
+            are instances of a value correctly being reported as `null`.
+
+            Instead of stacking the two datasets, merging by context, and then
+            looking for left_only or right_only values, we just count non-null
+            values. This is because we would want to use the report_year as a
+            merge key, but that isn't available until after we pipe the
+            dataframe through `refine_report_year`.
+            """
+            n_diffs = apply_diffs.count().sum()
+            n_best = best_snapshot.count().sum()
+
+            if n_diffs < n_best:
+                raise ValueError(
+                    f"Found {n_diffs} non-null values with apply-diffs "
+                    f"methodology, and {n_best} with best-snapshot. "
+                    "apply-diffs should be >= best-snapshot."
+                )
+
+            # 2024-04-10: this threshold set by looking at existing values for FERC
+            # <=2022.
+            threshold_pct = 0.3
+            if n_diffs / n_best > (1 + threshold_pct / 100):
+                raise ValueError(
+                    f"Found {n_diffs} non-null values with apply-diffs "
+                    f"methodology, and {n_best} with best-snapshot. "
+                    f"apply-diffs shouldn't be more than {threshold_pct}% "
+                    "greater than best-snapshot."
+                )
+
         filing_metadata_cols = {"publication_time", "filing_name"}
         xbrl_context_cols = [c for c in primary_key if c not in filing_metadata_cols]
         original = table.sort_values("publication_time")
         dupe_mask = original.duplicated(subset=xbrl_context_cols, keep=False)
         duped_groups = original.loc[dupe_mask].groupby(
-            xbrl_context_cols, as_index=False
+            xbrl_context_cols, as_index=False, dropna=True
         )
         never_duped = original.loc[~dupe_mask]
-        return pd.concat([never_duped, duped_groups.last()], ignore_index=True)
+        apply_diffs = __apply_diffs(duped_groups)
+        best_snapshot = __best_snapshot(duped_groups)
+        __compare_dedupe_methodologies(
+            apply_diffs=apply_diffs, best_snapshot=best_snapshot
+        )
+
+        deduped = pd.concat([never_duped, apply_diffs], ignore_index=True)
+        return deduped
 
     @staticmethod
     def refine_report_year(df: pd.DataFrame, xbrl_years: list[int]) -> pd.DataFrame:
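
To see why apply-diffs can only ever gain non-null values relative to best-snapshot, here is a minimal, self-contained sketch of the two strategies on a toy table. This is not from the commit: the column names (entity_id, publication_time, revenue, expenses) are invented for illustration, and a single grouping key stands in for xbrl_context_cols.

import pandas as pd

# Two filings for the same entity: the later one adds `expenses` but
# omits `revenue`. These column names are hypothetical.
table = pd.DataFrame(
    {
        "entity_id": ["c1", "c1"],
        "publication_time": pd.to_datetime(["2023-01-01", "2023-06-01"]),
        "revenue": [100.0, None],
        "expenses": [None, 50.0],
    }
)

groups = table.sort_values("publication_time").groupby("entity_id", as_index=False)

# "Apply diffs": groupby.last() takes the latest *non-null* value per
# column, so values from different filings get stitched into one record.
apply_diffs = groups.last()  # revenue=100.0 and expenses=50.0 both survive

# "Best snapshot": keep the single densest filing as a whole row.
best_snapshot = groups.apply(
    lambda df: df.assign(count=df.count(axis="columns"))
    .sort_values(by="count")
    .tail(1)
).drop(columns="count")  # only one of revenue/expenses is non-null

# The defensive check from the commit, in miniature: per column, last()
# is non-null whenever any row in the group is, so stitching can only
# add non-null values, never lose them.
n_diffs = apply_diffs.count().sum()  # 4
n_best = best_snapshot.count().sum()  # 3
assert n_diffs >= n_best

On this contrived input the gain is large (4 vs. 3 non-null values, a 33% increase), so the commit's 0.3% threshold would trip. That is the point of the check: in real FERC filings this kind of cross-filing stitching should be rare, and a big gap between the two methodologies signals that apply-diffs is resurrecting values that a later filing deliberately reported as null.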