Merge pull request #1600 from catalyst-cooperative/non_static_tech
Re-jigger backfilling `technology_description` & make `prime_mover_code` an annually harvested column
aesharpe authored May 5, 2022
2 parents 5dbc337 + 0dd285d commit 3b9e6bd
Showing 7 changed files with 166 additions and 75 deletions.
150 changes: 105 additions & 45 deletions devtools/harvesting_debug.ipynb
@@ -7,16 +7,21 @@
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pudl\n",
"from pudl import constants as pc\n",
"import pathlib\n",
"import yaml\n",
"import sqlalchemy as sa\n",
"from pudl.etl import * \n",
"from pudl.etl import *\n",
"import logging\n",
"import sys\n",
"import pathlib\n",
"import copy"
]
},
@@ -27,29 +32,41 @@
"outputs": [],
"source": [
"logger = logging.getLogger()\n",
"logger.setLevel(logging.INFO)\n",
"logger.setLevel(logging.DEBUG)\n",
"handler = logging.StreamHandler(stream=sys.stdout)\n",
"formatter = logging.Formatter('%(message)s')\n",
"handler.setFormatter(formatter)\n",
"logger.handlers = [handler]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"settings_file_name= 'etl_example.yml'\n",
"clobber=True\n",
"pudl_settings = pudl.workspace.setup.get_defaults()\n",
"with open(pathlib.Path(pudl_settings['settings_dir'],\n",
" settings_file_name),\n",
" \"r\") as f:\n",
" settings_file = yaml.safe_load(f)\n",
" datapkg_bundle_settings = settings_file['datapkg_bundle_settings']\n",
"# validate the settings from the settings file.\n",
"validated_bundle_settings = validate_params(datapkg_bundle_settings)"
"settings_file_name= 'etl_full.yml'\n",
"etl_settings = EtlSettings.from_yaml(\n",
" pathlib.Path(pudl_settings['settings_dir'],\n",
" settings_file_name))\n",
"validated_etl_settings = etl_settings.datasets\n",
"datasets = validated_etl_settings.get_datasets()\n",
"eia_settings = datasets[\"eia\"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can skip the settings step above and set these years/tables yourself here without using the settings files... just know they are not validated below so they could be wrong and fail after some time. It is HIGHLY RECOMMENDED that you use all the years/tables"
]
},
{
@@ -58,17 +75,21 @@
"metadata": {},
"outputs": [],
"source": [
"# THIS IS ASSUMING THE PKG W/ EIA IS THE FIRST ONE!\n",
"# also, if you don't want to deal w/ the settings file..\n",
"# you can just edit your eia_inputs below\n",
"datapkg_settings = validated_bundle_settings[0]\n",
"etl_params = datapkg_settings['datasets'][1]['eia']\n",
"eia_inputs = pudl.etl._validate_params_eia(etl_params)\n",
"eia923_tables = eia_inputs['eia923_tables']\n",
"eia923_years = eia_inputs['eia923_years']\n",
"eia860_tables = eia_inputs['eia860_tables']\n",
"eia860_years = eia_inputs['eia860_years']\n",
"sandbox = True"
"eia860_tables = eia_settings.eia860.tables\n",
"eia860_years = eia_settings.eia860.years\n",
"eia860m = eia_settings.eia860.eia860m\n",
"eia923_tables = eia_settings.eia923.tables\n",
"eia923_years = eia_settings.eia923.years\n",
"\n",
"ds = Datastore()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Run extract step & phase 1 transform step\n",
"this is pulled from `pudl.etl._etl_eia()`"
]
},
{
@@ -77,18 +98,38 @@
"metadata": {},
"outputs": [],
"source": [
"ds = pudl.workspace.datastore.Datastore(\n",
" Path(pudl_settings[\"pudl_in\"]),\n",
" sandbox=sandbox)\n",
"\n",
"# Extract EIA forms 923, 860\n",
"eia923_raw_dfs = pudl.extract.eia923.Extractor(ds).extract(year=eia923_years)\n",
"eia860_raw_dfs = pudl.extract.eia860.Extractor(ds).extract(year=eia860_years)\n",
"eia923_raw_dfs = pudl.extract.eia923.Extractor(ds).extract(\n",
" settings=eia_settings.eia923\n",
")\n",
"eia860_raw_dfs = pudl.extract.eia860.Extractor(ds).extract(\n",
" settings=eia_settings.eia860\n",
")\n",
"# if we are trying to add the EIA 860M YTD data, then extract it and append\n",
"if eia860m:\n",
" eia860m_raw_dfs = pudl.extract.eia860m.Extractor(ds).extract(\n",
" settings=eia_settings.eia860\n",
" )\n",
" eia860_raw_dfs = pudl.extract.eia860m.append_eia860m(\n",
" eia860_raw_dfs=eia860_raw_dfs, eia860m_raw_dfs=eia860m_raw_dfs\n",
" )\n",
"\n",
"# Transform EIA forms 923, 860\n",
"eia860_transformed_dfs = pudl.transform.eia860.transform(\n",
" eia860_raw_dfs, eia860_tables=eia860_tables)\n",
" eia860_raw_dfs, eia860_settings=eia_settings.eia860\n",
")\n",
"\n",
"eia923_transformed_dfs = pudl.transform.eia923.transform(\n",
" eia923_raw_dfs, eia923_tables=eia923_tables)"
" eia923_raw_dfs, eia923_settings=eia_settings.eia923\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You have to re-run this cell every time you want to re-run the havesting cell below (bc `pudl.transform.eia.harvesting` removes columns from the dfs). This cell enables you to start with a fresh`eia_transformed_dfs` without needing to re-run the 860/923 transforms."
]
},
{
@@ -98,11 +139,21 @@
"outputs": [],
"source": [
"# create an eia transformed dfs dictionary\n",
"eia_transformed_dfs = copy.deepcopy(eia860_transformed_dfs)\n",
"eia_transformed_dfs.update(copy.deepcopy(eia923_transformed_dfs))\n",
"# convert types..\n",
"eia_transformed_dfs = pudl.helpers.convert_dfs_dict_dtypes(\n",
" eia_transformed_dfs, 'eia')"
"eia_transformed_dfs = eia860_transformed_dfs.copy()\n",
"eia_transformed_dfs.update(eia923_transformed_dfs.copy())\n",
"\n",
"# Do some final cleanup and assign appropriate types:\n",
"eia_transformed_dfs = {\n",
" name: convert_cols_dtypes(df, data_source=\"eia\")\n",
" for name, df in eia_transformed_dfs.items()\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Run harvest w/ debug=True"
]
},
{
@@ -112,11 +163,20 @@
"outputs": [],
"source": [
"# we want to investigate the harvesting of the plants in this case...\n",
"entity = 'plants'\n",
"entity = 'generators'\n",
"# create the empty entities df to fill up\n",
"entities_dfs = {}\n",
"entities_dfs, eia_transformed_dfs, col_dfs = pudl.transform.eia._harvesting(\n",
" entity, eia_transformed_dfs, entities_dfs,debug=True)"
"entities_dfs, eia_transformed_dfs, col_dfs = (\n",
" pudl.transform.eia.harvesting(\n",
" entity, eia_transformed_dfs, entities_dfs, debug=True)\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Use `col_dfs` to explore harvested values"
]
},
{
@@ -125,7 +185,7 @@
"metadata": {},
"outputs": [],
"source": [
"bac = col_dfs['balancing_authority_code']"
"pmc = col_dfs['prime_mover_code']"
]
},
{
@@ -134,13 +194,13 @@
"metadata": {},
"outputs": [],
"source": [
"bac"
"pmc.prime_mover_code.unique()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -154,7 +214,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.10.4"
}
},
"nbformat": 4,
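The notebook now ends by pulling `col_dfs['prime_mover_code']` and listing its unique values. To check the motivating claim of this PR, that a generator's prime mover occasionally changes over time, one could count distinct codes per generator. This is a hypothetical follow-on cell, assuming each `col_dfs` entry is a tidy DataFrame keyed by the entity ID columns plus `report_date`:

```python
# Hypothetical exploration cell -- assumes pmc carries plant_id_eia,
# generator_id, report_date and prime_mover_code columns.
codes_per_gen = (
    pmc.dropna(subset=["prime_mover_code"])
    .groupby(["plant_id_eia", "generator_id"])
    .prime_mover_code.nunique()
)
# Generators reporting more than one prime mover across years: the rare but
# real variability that motivated harvesting this column annually.
print(codes_per_gen[codes_per_gen > 1])
```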
16 changes: 16 additions & 0 deletions docs/release_notes.rst
@@ -2,6 +2,22 @@
PUDL Release Notes
=======================================================================================

.. _release-v0-7-0:

---------------------------------------------------------------------------------------
0.7.0 (2022-XX-XX)
---------------------------------------------------------------------------------------

Database Schema Changes
^^^^^^^^^^^^^^^^^^^^^^^

* After learning that generators' prime movers do very occasionally change over
  time, we recategorized the ``prime_mover_code`` column in our entity resolution
  process to accommodate this rare but real variability. We moved
  ``prime_mover_code`` from the statically harvested/normalized data columns to
  the annually harvested data columns (i.e. from :ref:`generators_entity_eia`
  to :ref:`generators_eia860`) :pr:`1600`. See :issue:`1585` for more details.

.. _release-v0-6-0:

---------------------------------------------------------------------------------------
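In the scheme described by the release note above, a statically harvested column must resolve to a single value per generator across all years, while an annually harvested column resolves to one value per generator per report year. A minimal conceptual sketch of the distinction (not PUDL's actual harvesting implementation, which weighs candidate values drawn from many source tables):

```python
import pandas as pd

def harvest(df: pd.DataFrame, col: str, annual: bool) -> pd.DataFrame:
    """Toy stand-in for harvesting: keep the most common value in each group."""
    group_cols = ["plant_id_eia", "generator_id"]
    if annual:
        group_cols.append("report_date")  # one value per generator per year
    return (
        df.groupby(group_cols)[col]
        # assumes every group has at least one non-null candidate value
        .agg(lambda s: s.dropna().mode().iat[0])
        .reset_index()
    )

# Before this change, prime_mover_code was harvested with annual=False, which
# forced a single code onto generators whose prime mover actually changed.
# With annual=True, such a generator keeps one code per report year.
```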
2 changes: 1 addition & 1 deletion src/pudl/metadata/resources/__init__.py
@@ -77,7 +77,6 @@
"generators": {
"id_cols": ["plant_id_eia", "generator_id"],
"static_cols": [
"prime_mover_code",
"duct_burners",
"operating_date",
"topping_bottoming_code",
@@ -112,6 +111,7 @@
"distributed_generation",
"technology_description",
"reactive_power_output_mvar",
"prime_mover_code",
"energy_source_code_1",
"energy_source_code_2",
"energy_source_code_3",
1 change: 0 additions & 1 deletion src/pudl/metadata/resources/eia.py
@@ -126,7 +126,6 @@
"fields": [
"plant_id_eia",
"generator_id",
"prime_mover_code",
"duct_burners",
"operating_date",
"topping_bottoming_code",
1 change: 1 addition & 0 deletions src/pudl/metadata/resources/eia860.py
@@ -36,6 +36,7 @@
"summer_capacity_estimate",
"winter_capacity_mw",
"winter_capacity_estimate",
"prime_mover_code",
"energy_source_code_1",
"energy_source_code_2",
"energy_source_code_3",
68 changes: 40 additions & 28 deletions src/pudl/output/eia860.py
@@ -1,7 +1,6 @@
"""Functions for pulling data primarily from the EIA's Form 860."""

import logging
from collections import defaultdict

import pandas as pd
import sqlalchemy as sa
@@ -368,13 +367,16 @@ def generators_eia860(


def fill_generator_technology_description(gens_df: pd.DataFrame) -> pd.DataFrame:
"""Fill in missing ``technology_description`` based on generator and energy source.
"""Fill in missing ``technology_description`` based by unique mapping & backfilling.
Prior to 2014, the EIA 860 did not report ``technology_description``. This
function backfills those early years within groups defined by ``plant_id_eia``,
``generator_id`` and ``energy_source_code_1``. Some remaining missing values are
then filled in using the consistent, unique mappings that are observed between
``energy_source_code_1`` and ``technology_type`` across all years and generators.
Prior to 2014, the EIA 860 did not report ``technology_description``.
This function first fills in missing values using the consistent, unique
mappings that are observed between ``energy_source_code_1``,
``prime_mover_code`` and ``technology_description`` across all years and
generators. It then backfills the early years within groups defined by
``plant_id_eia``, ``generator_id``, ``energy_source_code_1`` and
``prime_mover_code``.
As a result, more than 95% of all generator records end up having a
``technology_description`` associated with them.
@@ -391,37 +393,47 @@ def fill_generator_technology_description(gens_df: pd.DataFrame) -> pd.DataFrame
nrows_orig = len(gens_df)
out_df = gens_df.copy()

# Fill in missing technology_descriptions with unique correspondences
# between energy_source_code_1 and prime_mover_code when there has always
# been a unique map between ESC/PM and technology_description
esc_pm_to_tech = (
out_df.loc[
:, ["energy_source_code_1", "prime_mover_code", "technology_description"]
]
.dropna(how="any") # if anything is null, we can't use it, so drop
.drop_duplicates(keep="first") # keep one of each (doesn't matter which)
.drop_duplicates( # if there are any duplicates w/in esc/pm combo.. it's gotta go
subset=["energy_source_code_1", "prime_mover_code"], keep=False
)
)

no_tech_mask = out_df.technology_description.isnull()
has_tech = out_df[~no_tech_mask]
no_tech = pd.merge(
out_df[no_tech_mask].drop(columns=["technology_description"]),
esc_pm_to_tech,
on=["energy_source_code_1", "prime_mover_code"],
how="left",
validate="m:1",
)
out_df = pd.concat([has_tech, no_tech]).reset_index(drop=True)

# Backfill within generator-energy_source groups:
out_df["technology_description"] = (
out_df.sort_values("report_date")
.groupby(["plant_id_eia", "generator_id", "energy_source_code_1"])
.groupby(
["plant_id_eia", "generator_id", "energy_source_code_1", "prime_mover_code"]
)
.technology_description.bfill()
)

# Fill in remaining missing technology_descriptions with unique correspondences
# between energy_source_code_1 where possible. Use a default value of pd.NA
# for any technology_description that isn't uniquely identified by energy source
static_fuels = defaultdict(
lambda: pd.NA,
gens_df.dropna(subset=["technology_description"])
.drop_duplicates(subset=["energy_source_code_1", "technology_description"])
.drop_duplicates(subset=["energy_source_code_1"], keep=False)
.set_index("energy_source_code_1")["technology_description"]
.to_dict(),
)

out_df.loc[
out_df.technology_description.isna(), "technology_description"
] = out_df.energy_source_code_1.map(static_fuels)

assert len(out_df) == nrows_orig

# Assert that at least 95 percent of tech desc rows are filled in
pct_cov = out_df.technology_description.count() / out_df.technology_description.size
logger.info(f"Filled technology_type coverage now at {pct_cov:.1%}")
pct_val = 0.95
if (
out_df.technology_description.count() / out_df.technology_description.size
< pct_val
):
if pct_cov < pct_val:
raise AssertionError(
f"technology_description filling no longer covering {pct_val:.0%}"
)
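The heart of the new `fill_generator_technology_description()` is the double `drop_duplicates()`, which keeps only the (`energy_source_code_1`, `prime_mover_code`) pairs that have ever mapped to exactly one `technology_description`, followed by a backfill within generator groups. A self-contained toy run of the same pattern (illustrative records, not real EIA data):

```python
import pandas as pd

gens = pd.DataFrame({
    "plant_id_eia": [1, 1, 2, 3],
    "generator_id": ["a", "a", "b", "c"],
    "report_date": ["2012-01-01", "2014-01-01", "2013-01-01", "2014-01-01"],
    "energy_source_code_1": ["NG", "NG", "NG", "SUN"],
    "prime_mover_code": ["CT", "CT", "CT", "PV"],
    "technology_description": [
        None,                                    # pre-2014: not reported
        "Natural Gas Fired Combustion Turbine",
        None,                                    # no labeled year for this unit
        "Solar Photovoltaic",
    ],
})

# 1. Find the ESC/PM pairs that only ever map to one technology_description:
esc_pm_to_tech = (
    gens[["energy_source_code_1", "prime_mover_code", "technology_description"]]
    .dropna(how="any")
    .drop_duplicates(keep="first")
    .drop_duplicates(subset=["energy_source_code_1", "prime_mover_code"], keep=False)
)

# 2. Merge that unique mapping onto the rows missing a description:
missing = gens.technology_description.isna()
filled = gens[missing].drop(columns="technology_description").merge(
    esc_pm_to_tech, on=["energy_source_code_1", "prime_mover_code"], how="left"
)
gens = pd.concat([gens[~missing], filled]).reset_index(drop=True)

# 3. Backfill anything still missing within generator/fuel/prime-mover groups:
gens["technology_description"] = (
    gens.sort_values("report_date")
    .groupby(["plant_id_eia", "generator_id",
              "energy_source_code_1", "prime_mover_code"])
    .technology_description.bfill()
)
print(gens)  # all four rows now carry a technology_description
```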