Require Python 3.10 and update to modern syntax #1685

Merged: 6 commits, Jun 10, 2022
4 changes: 1 addition & 3 deletions .github/workflows/tox-pytest.yml
@@ -6,8 +6,6 @@ jobs:
ci-test:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10"]
fail-fast: false

steps:
@@ -21,7 +19,7 @@ jobs:
mamba-version: "*"
channels: conda-forge,defaults
channel-priority: true
python-version: ${{ matrix.python-version }}
python-version: "3.10"
activate-environment: pudl-test
environment-file: test/test-environment.yml
- shell: bash -l {0}
13 changes: 13 additions & 0 deletions .pre-commit-config.yaml
@@ -54,6 +54,19 @@ repos:
hooks:
- id: rm-unneeded-f-str

# Use built-in types for annotations as per PEP585
- repo: https://github.com/sondrelg/pep585-upgrade
rev: 'v1.0'
hooks:
- id: upgrade-type-hints

# Update Python language constructs to modern standards
- repo: https://github.com/asottile/pyupgrade
rev: v2.32.1
hooks:
- id: pyupgrade
args: ['--py310-plus']

########################################################################################
# Linters: hooks that check but don't alter Python and documentation files
########################################################################################
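
To illustrate what the two new hooks rewrite, here is a small before/after sketch (a hypothetical function, not taken from the PUDL codebase) of the changes made by pep585-upgrade (PEP 585 built-in generics) and pyupgrade with --py310-plus (PEP 604 union syntax):

    from typing import Dict, List, Optional, Union


    # Before (pre-3.10 style): typing-module generics, Optional, and Union.
    def tag_counts_old(tags: List[str], default: Optional[str] = None) -> Dict[str, Union[int, float]]:
        return {tag: tags.count(tag) for tag in set(tags)}


    # After the hooks run: built-in generics (PEP 585) and | unions (PEP 604).
    def tag_counts(tags: list[str], default: str | None = None) -> dict[str, int | float]:
        return {tag: tags.count(tag) for tag in set(tags)}
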
28 changes: 17 additions & 11 deletions .readthedocs.yaml
@@ -5,20 +5,26 @@
# Required
version: 2

# Set the version of Python and other tools you might need
build:
os: ubuntu-22.04
tools:
python: mambaforge-4.10

# Define the python environment using conda / mamba
conda:
environment: docs/docs-environment.yml

# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/conf.py

build:
image: testing
apt_packages:
- libsnappy-dev
configuration: docs/conf.py
builder: html
fail_on_warning: true

# Set the version of Python and requirements required to build your docs
python:
version: "3.9"
install:
- method: pip
path: .
extra_requirements:
- doc
- method: pip
path: .
extra_requirements:
- doc
3 changes: 1 addition & 2 deletions devtools/ferc1-eia-glue/find_unmapped_plants_utils.py
@@ -69,7 +69,6 @@
import logging
import sys
from pathlib import Path
from typing import Dict

import coloredlogs
import pandas as pd
Expand All @@ -96,7 +95,7 @@
MAX_LOST_PLANTS_EIA: int = 50
MAX_LOST_UTILS_EIA: int = 10

PUDL_SETTINGS: Dict[str, str] = pudl.workspace.setup.get_defaults()
PUDL_SETTINGS: dict[str, str] = pudl.workspace.setup.get_defaults()


def parse_command_line(argv: str) -> argparse.Namespace:
13 changes: 13 additions & 0 deletions docs/docs-environment.yml
@@ -0,0 +1,13 @@
name: pudl-docs
channels:
- conda-forge
dependencies:
- geopandas>=0.9,<11
- numba>=0.55.1,<0.56
- pip>=22,<23
- pygeos>=0.10,<0.13
- python>=3.10,<3.11
- python-snappy>=0.6,<1
- setuptools<63
- sqlite>=3.36,<4
- tox>=3.24,<4
4 changes: 1 addition & 3 deletions setup.py
@@ -45,7 +45,7 @@
"eia 861",
"ferc 714",
],
python_requires=">=3.8,<3.11",
python_requires=">=3.10,<3.11",
setup_requires=["setuptools_scm"],
install_requires=[
"addfips>=0.3.1,<0.4.0",
@@ -128,8 +128,6 @@
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Topic :: Scientific/Engineering",
],
5 changes: 2 additions & 3 deletions src/pudl/analysis/allocate_net_gen.py
@@ -83,7 +83,6 @@

import logging
import warnings
from typing import List

# Useful high-level external modules.
import numpy as np
@@ -275,8 +274,8 @@ def scale_allocated_net_gen_by_ownership(

def agg_by_generator(
gen_pm_fuel: pd.DataFrame,
by_cols: List[str] = IDX_GENS,
sum_cols: List[str] = ["net_generation_mwh", "fuel_consumed_mmbtu"],
by_cols: list[str] = IDX_GENS,
sum_cols: list[str] = ["net_generation_mwh", "fuel_consumed_mmbtu"],
) -> pd.DataFrame:
"""Aggreate the allocated gen fuel data to the generator level.

100 changes: 55 additions & 45 deletions src/pudl/analysis/epa_crosswalk.py
@@ -1,23 +1,24 @@
"""Use the EPA crosswalk to connect EPA units to EIA generators and other data.

A major use case for this dataset is to identify subplants within plant_ids,
which are the smallest coherent units for aggregation.
Despite the name, plant_id refers to a legal entity that often contains
multiple distinct power plants, even of different technology or fuel types.
A major use case for this dataset is to identify subplants within plant_ids, which are
the smallest coherent units for aggregation. Despite the name, plant_id refers to a
legal entity that often contains multiple distinct power plants, even of different
technology or fuel types.

EPA CEMS data combines information from several parts of a power plant:

* emissions from smokestacks
* fuel use from combustors
* electricity production from generators
But smokestacks, combustors, and generators can be connected in
complex, many-to-many relationships. This complexity makes attribution difficult for,
as an example, allocating pollution to energy producers.
Furthermore, heterogeneity within plant_ids make aggregation
to the parent entity difficult or inappropriate.

But by analyzing the relationships between combustors and generators,
as provided in the EPA/EIA crosswalk, we can identify distinct power plants.
These are the smallest coherent units of aggregation.
But smokestacks, combustors, and generators can be connected in complex, many-to-many
relationships. This complexity makes attribution difficult for, as an example,
allocating pollution to energy producers. Furthermore, heterogeneity within plant_ids
makes aggregation to the parent entity difficult or inappropriate.

But by analyzing the relationships between combustors and generators, as provided in the
EPA/EIA crosswalk, we can identify distinct power plants. These are the smallest
coherent units of aggregation.

In graph analysis terminology, the crosswalk is a list of edges between nodes
(combustors and generators) in a bipartite graph. The networkx python package provides
@@ -38,21 +39,19 @@
filtered_crosswalk = filter_crosswalk(epa_crosswalk_df, epacems)
crosswalk_with_subplant_ids = make_subplant_ids(filtered_crosswalk)
"""
from typing import Union

import dask.dataframe as dd
import networkx as nx
import pandas as pd


def _get_unique_keys(epacems: Union[pd.DataFrame, dd.DataFrame]) -> pd.DataFrame:
def _get_unique_keys(epacems: pd.DataFrame | dd.DataFrame) -> pd.DataFrame:
"""Get unique unit IDs from CEMS data.

Args:
epacems (Union[pd.DataFrame, dd.DataFrame]): epacems dataset from pudl.output.epacems.epacems
epacems: dataset from :func:`pudl.output.epacems.epacems`

Returns:
pd.DataFrame: unique keys from the epacems dataset
Unique keys from the epacems dataset.

"""
# The purpose of this function is mostly to resolve the
@@ -64,7 +63,7 @@ def _get_unique_keys(epacems: Union[pd.DataFrame, dd.DataFrame]) -> pd.DataFrame


def filter_crosswalk_by_epacems(
crosswalk: pd.DataFrame, epacems: Union[pd.DataFrame, dd.DataFrame]
crosswalk: pd.DataFrame, epacems: pd.DataFrame | dd.DataFrame
) -> pd.DataFrame:
"""Inner join unique CEMS units with the EPA crosswalk.

@@ -74,11 +73,11 @@ def filter_crosswalk_by_epacems(

Args:
crosswalk: the EPA crosswalk, as from pudl.output.epacems.epa_crosswalk()
unique_epacems_ids (pd.DataFrame): unique ids from _get_unique_keys
unique_epacems_ids: unique ids from _get_unique_keys

Returns:
The inner join of the EPA crosswalk and unique epacems units. Adds
the global ID column unit_id_epa.
The inner join of the EPA crosswalk and unique epacems units. Adds the global ID
column unit_id_epa.

"""
unique_epacems_ids = _get_unique_keys(epacems)
@@ -94,13 +93,14 @@
def filter_out_unmatched(crosswalk: pd.DataFrame) -> pd.DataFrame:
"""Remove unmatched or excluded (non-exporting) units.

Unmatched rows are limitations of the completeness of the EPA crosswalk itself, not of PUDL.
Unmatched rows are limitations of the completeness of the EPA crosswalk itself, not
of PUDL.

Args:
crosswalk (pd.DataFrame): the EPA crosswalk, as from pudl.output.epacems.epa_crosswalk()
crosswalk: the EPA crosswalk, as from :func:`pudl.output.epacems.epa_crosswalk`

Returns:
pd.DataFrame: the EPA crosswalk with unmatched units removed
The EPA crosswalk with unmatched units removed.
"""
bad = crosswalk["MATCH_TYPE_GEN"].isin({"CAMD Unmatched", "Manual CAMD Excluded"})
return crosswalk.loc[~bad].copy()
@@ -110,10 +110,10 @@ def filter_out_boiler_rows(crosswalk: pd.DataFrame) -> pd.DataFrame:
"""Remove rows that represent graph edges between generators and boilers.

Args:
crosswalk (pd.DataFrame): the EPA crosswalk, as from pudl.output.epacems.epa_crosswalk()
crosswalk: the EPA crosswalk, as from :func:`pudl.output.epacems.epa_crosswalk`

Returns:
pd.DataFrame: the EPA crosswalk with boiler rows (many/one-to-many) removed
The EPA crosswalk with boiler rows (many/one-to-many) removed
"""
crosswalk = crosswalk.drop_duplicates(
subset=["CAMD_PLANT_ID", "CAMD_UNIT_ID", "EIA_GENERATOR_ID"]
@@ -125,10 +125,11 @@ def _prep_for_networkx(crosswalk: pd.DataFrame) -> pd.DataFrame:
"""Make surrogate keys for combustors and generators.

Args:
crosswalk (pd.DataFrame): EPA crosswalk, as from pudl.output.epacems.epa_crosswalk()
crosswalk: EPA crosswalk, as from :func:`pudl.output.epacems.epa_crosswalk`

Returns:
pd.DataFrame: copy of EPA crosswalk with new surrogate ID columns 'combustor_id' and 'generator_id'
A copy of EPA crosswalk with new surrogate ID columns 'combustor_id' and
'generator_id'
"""
prepped = crosswalk.copy()
# networkx can't handle composite keys, so make surrogates
@@ -145,13 +146,13 @@


def _subplant_ids_from_prepped_crosswalk(prepped: pd.DataFrame) -> pd.DataFrame:
"""Use networkx graph analysis to create global subplant IDs from a preprocessed crosswalk edge list.
"""Use graph analysis to create global subplant IDs from a crosswalk edge list.

Args:
prepped (pd.DataFrame): an EPA crosswalk that has passed through _prep_for_networkx()
prepped: an EPA crosswalk that has passed through :func:`_prep_for_networkx`

Returns:
pd.DataFrame: copy of EPA crosswalk plus new column 'global_subplant_id'
A copy of EPA crosswalk plus new column 'global_subplant_id'
"""
graph = nx.from_pandas_edgelist(
prepped,
@@ -171,27 +172,29 @@ def _subplant_ids_from_prepped_crosswalk(prepped: pd.DataFrame) -> pd.DataFrame:
def _convert_global_id_to_composite_id(
crosswalk_with_ids: pd.DataFrame,
) -> pd.DataFrame:
"""Convert global_subplant_id to an equivalent composite key (CAMD_PLANT_ID, subplant_id).
"""Convert global_subplant_id to a composite key (CAMD_PLANT_ID, subplant_id).

The composite key will be much more stable (though not fully stable!) in time.
The global ID changes if ANY unit or generator changes, whereas the
compound key only changes if units/generators change within that specific plant.

A global ID could also tempt users into using it as a crutch, even though it isn't stable.
A compound key should discourage that behavior.
A global ID could also tempt users into using it as a crutch, even though it isn't
stable. A compound key should discourage that behavior.

Args:
crosswalk_with_ids (pd.DataFrame): crosswalk with global_subplant_id, as from _subplant_ids_from_prepped_crosswalk()
crosswalk_with_ids: crosswalk with global_subplant_id, as from
:func:`_subplant_ids_from_prepped_crosswalk`

Raises:
ValueError: if crosswalk_with_ids has a MultiIndex

Returns:
pd.DataFrame: copy of crosswalk_with_ids with an added column: 'subplant_id'
A copy of crosswalk_with_ids with an added column: 'subplant_id'
"""
if isinstance(crosswalk_with_ids.index, pd.MultiIndex):
raise ValueError(
f"Input crosswalk must have single level index. Given levels: {crosswalk_with_ids.index.names}"
"Input crosswalk must have single level index. "
f"Given levels: {crosswalk_with_ids.index.names}"
)

reindexed = crosswalk_with_ids.reset_index() # copy
@@ -223,16 +226,20 @@


def filter_crosswalk(
crosswalk: pd.DataFrame, epacems: Union[pd.DataFrame, dd.DataFrame]
crosswalk: pd.DataFrame, epacems: pd.DataFrame | dd.DataFrame
) -> pd.DataFrame:
"""Remove crosswalk rows that do not correspond to an EIA facility or are duplicated due to many-to-many boiler relationships.
"""Remove irrelevant or duplicated rows from the crosswalk.

Remove crosswalk rows that do not correspond to an EIA facility or are duplicated
due to many-to-many boiler relationships.

Args:
crosswalk (pd.DataFrame): The EPA/EIA crosswalk, as from pudl.output.epacems.epa_crosswalk()
epacems (Union[pd.DataFrame, dd.DataFrame]): Emissions data. Must contain columns named ["plant_id_eia", "unitid", "unit_id_epa"]
crosswalk: The EPA/EIA crosswalk from :func:`pudl.output.epacems.epa_crosswalk`
epacems: Emissions data. Must contain columns named
["plant_id_eia", "unitid", "unit_id_epa"]

Returns:
pd.DataFrame: A filtered copy of EPA crosswalk
A filtered copy of EPA crosswalk.
"""
filtered_crosswalk = filter_out_unmatched(crosswalk)
filtered_crosswalk = filter_out_boiler_rows(filtered_crosswalk)
@@ -241,7 +248,9 @@ def filter_crosswalk(


def make_subplant_ids(crosswalk: pd.DataFrame) -> pd.DataFrame:
"""Identify sub-plants in the EPA/EIA crosswalk graph. Any row filtering should be done before this step.
"""Identify sub-plants in the EPA/EIA crosswalk graph.

Any row filtering should be done before this step.

Usage Example:

Expand All @@ -251,10 +260,11 @@ def make_subplant_ids(crosswalk: pd.DataFrame) -> pd.DataFrame:
crosswalk_with_subplant_ids = make_subplant_ids(filtered_crosswalk)

Args:
crosswalk (pd.DataFrame): The EPA/EIA crosswalk, as from pudl.output.epacems.epa_crosswalk()
crosswalk: The EPA/EIA crosswalk, from :func:`pudl.output.epacems.epa_crosswalk`

Returns:
pd.DataFrame: An edge list connecting EPA units to EIA generators, with connected pieces issued a subplant_id
An edge list connecting EPA units to EIA generators, with connected pieces
issued a subplant_id
"""
edge_list = _prep_for_networkx(crosswalk)
edge_list = _subplant_ids_from_prepped_crosswalk(edge_list)
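
As background for the graph analysis described in the epa_crosswalk module docstring, here is a minimal, self-contained sketch of the subplant-ID idea using made-up IDs (the real implementation first builds surrogate keys in _prep_for_networkx and then assigns IDs in _subplant_ids_from_prepped_crosswalk):

    import networkx as nx
    import pandas as pd

    # Hypothetical edge list: each row links one combustor to one generator.
    edges = pd.DataFrame(
        {
            "combustor_id": [0, 0, 1, 2],
            "generator_id": [10, 11, 11, 12],
        }
    )

    # Build the bipartite graph whose nodes are combustors and generators.
    graph = nx.from_pandas_edgelist(edges, source="combustor_id", target="generator_id")

    # Each connected component is one subplant: a set of units that share equipment.
    for subplant_id, component in enumerate(nx.connected_components(graph)):
        print(subplant_id, sorted(component))

    # Combustors 0 and 1 share generator 11, so they land in one subplant;
    # combustor 2 and generator 12 form a second, independent subplant.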