Require Python 3.10 and update to modern syntax #1685

Merged: 6 commits, Jun 10, 2022
4 changes: 1 addition & 3 deletions .github/workflows/tox-pytest.yml
@@ -6,8 +6,6 @@ jobs:
ci-test:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.8", "3.9", "3.10"]
fail-fast: false

steps:
@@ -21,7 +19,7 @@ jobs:
mamba-version: "*"
channels: conda-forge,defaults
channel-priority: true
python-version: ${{ matrix.python-version }}
python-version: "3.10"
activate-environment: pudl-test
environment-file: test/test-environment.yml
- shell: bash -l {0}
13 changes: 13 additions & 0 deletions .pre-commit-config.yaml
@@ -54,6 +54,19 @@ repos:
hooks:
- id: rm-unneeded-f-str

# Use built-in types for annotations as per PEP585
- repo: https://github.com/sondrelg/pep585-upgrade
rev: 'v1.0'
hooks:
- id: upgrade-type-hints

# Update Python language constructs to modern standards
- repo: https://github.com/asottile/pyupgrade
rev: v2.32.1
hooks:
- id: pyupgrade
args: ['--py310-plus']

########################################################################################
# Linters: hooks that check but don't alter Python and documentation files
########################################################################################
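
To illustrate what the two new hooks rewrite, here is a small before/after sketch (a hypothetical function, not taken from the PUDL codebase) of the changes made by pep585-upgrade (PEP 585 built-in generics) and pyupgrade with --py310-plus (PEP 604 union syntax):

    from typing import Dict, List, Optional, Union


    # Before (pre-3.10 style): typing-module generics, Optional, and Union.
    def tag_counts_old(tags: List[str], default: Optional[str] = None) -> Dict[str, Union[int, float]]:
        return {tag: tags.count(tag) for tag in set(tags)}


    # After the hooks run: built-in generics (PEP 585) and | unions (PEP 604).
    def tag_counts(tags: list[str], default: str | None = None) -> dict[str, int | float]:
        return {tag: tags.count(tag) for tag in set(tags)}
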
28 changes: 17 additions & 11 deletions .readthedocs.yaml
@@ -5,20 +5,26 @@
# Required
version: 2

# Set the version of Python and other tools you might need
build:
os: ubuntu-22.04
tools:
python: mambaforge-4.10

# Define the python environment using conda / mamba
conda:
environment: docs/docs-environment.yml

# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: docs/conf.py

build:
image: testing
apt_packages:
- libsnappy-dev
configuration: docs/conf.py
builder: html
fail_on_warning: true

# Set the version of Python and requirements required to build your docs
python:
version: "3.9"
install:
- method: pip
path: .
extra_requirements:
- doc
- method: pip
path: .
extra_requirements:
- doc
3 changes: 1 addition & 2 deletions devtools/ferc1-eia-glue/find_unmapped_plants_utils.py
@@ -69,7 +69,6 @@
import logging
import sys
from pathlib import Path
from typing import Dict

import coloredlogs
import pandas as pd
Expand All @@ -96,7 +95,7 @@
MAX_LOST_PLANTS_EIA: int = 50
MAX_LOST_UTILS_EIA: int = 10

PUDL_SETTINGS: Dict[str, str] = pudl.workspace.setup.get_defaults()
PUDL_SETTINGS: dict[str, str] = pudl.workspace.setup.get_defaults()


def parse_command_line(argv: str) -> argparse.Namespace:
13 changes: 13 additions & 0 deletions docs/docs-environment.yml
@@ -0,0 +1,13 @@
name: pudl-docs
channels:
- conda-forge
dependencies:
- geopandas>=0.9,<11
- numba>=0.55.1,<0.56
- pip>=22,<23
- pygeos>=0.10,<0.13
- python>=3.10,<3.11
- python-snappy>=0.6,<1
- setuptools<63
- sqlite>=3.36,<4
- tox>=3.24,<4
4 changes: 1 addition & 3 deletions setup.py
@@ -45,7 +45,7 @@
"eia 861",
"ferc 714",
],
python_requires=">=3.8,<3.11",
python_requires=">=3.10,<3.11",
setup_requires=["setuptools_scm"],
install_requires=[
"addfips>=0.3.1,<0.4.0",
@@ -128,8 +128,6 @@
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Topic :: Scientific/Engineering",
],
5 changes: 2 additions & 3 deletions src/pudl/analysis/allocate_net_gen.py
@@ -83,7 +83,6 @@

import logging
import warnings
from typing import List

# Useful high-level external modules.
import numpy as np
@@ -275,8 +274,8 @@ def scale_allocated_net_gen_by_ownership(

def agg_by_generator(
gen_pm_fuel: pd.DataFrame,
by_cols: List[str] = IDX_GENS,
sum_cols: List[str] = ["net_generation_mwh", "fuel_consumed_mmbtu"],
by_cols: list[str] = IDX_GENS,
sum_cols: list[str] = ["net_generation_mwh", "fuel_consumed_mmbtu"],
) -> pd.DataFrame:
"""Aggreate the allocated gen fuel data to the generator level.

100 changes: 55 additions & 45 deletions src/pudl/analysis/epa_crosswalk.py
@@ -1,23 +1,24 @@
"""Use the EPA crosswalk to connect EPA units to EIA generators and other data.

A major use case for this dataset is to identify subplants within plant_ids,
which are the smallest coherent units for aggregation.
Despite the name, plant_id refers to a legal entity that often contains
multiple distinct power plants, even of different technology or fuel types.
A major use case for this dataset is to identify subplants within plant_ids, which are
the smallest coherent units for aggregation. Despite the name, plant_id refers to a
legal entity that often contains multiple distinct power plants, even of different
technology or fuel types.

EPA CEMS data combines information from several parts of a power plant:

* emissions from smokestacks
* fuel use from combustors
* electricity production from generators
But smokestacks, combustors, and generators can be connected in
complex, many-to-many relationships. This complexity makes attribution difficult for,
as an example, allocating pollution to energy producers.
Furthermore, heterogeneity within plant_ids make aggregation
to the parent entity difficult or inappropriate.

But by analyzing the relationships between combustors and generators,
as provided in the EPA/EIA crosswalk, we can identify distinct power plants.
These are the smallest coherent units of aggregation.
But smokestacks, combustors, and generators can be connected in complex, many-to-many
relationships. This complexity makes attribution difficult for, as an example,
allocating pollution to energy producers. Furthermore, heterogeneity within plant_ids
makes aggregation to the parent entity difficult or inappropriate.

But by analyzing the relationships between combustors and generators, as provided in the
EPA/EIA crosswalk, we can identify distinct power plants. These are the smallest
coherent units of aggregation.

In graph analysis terminology, the crosswalk is a list of edges between nodes
(combustors and generators) in a bipartite graph. The networkx python package provides
@@ -38,21 +39,19 @@
filtered_crosswalk = filter_crosswalk(epa_crosswalk_df, epacems)
crosswalk_with_subplant_ids = make_subplant_ids(filtered_crosswalk)
"""
from typing import Union

import dask.dataframe as dd
import networkx as nx
import pandas as pd


def _get_unique_keys(epacems: Union[pd.DataFrame, dd.DataFrame]) -> pd.DataFrame:
def _get_unique_keys(epacems: pd.DataFrame | dd.DataFrame) -> pd.DataFrame:
"""Get unique unit IDs from CEMS data.

Args:
epacems (Union[pd.DataFrame, dd.DataFrame]): epacems dataset from pudl.output.epacems.epacems
epacems: dataset from :func:`pudl.output.epacems.epacems`

Returns:
pd.DataFrame: unique keys from the epacems dataset
Unique keys from the epacems dataset.

"""
# The purpose of this function is mostly to resolve the
@@ -64,7 +63,7 @@ def _get_unique_keys(epacems: Union[pd.DataFrame, dd.DataFrame]) -> pd.DataFrame


def filter_crosswalk_by_epacems(
crosswalk: pd.DataFrame, epacems: Union[pd.DataFrame, dd.DataFrame]
crosswalk: pd.DataFrame, epacems: pd.DataFrame | dd.DataFrame
) -> pd.DataFrame:
"""Inner join unique CEMS units with the EPA crosswalk.

@@ -74,11 +73,11 @@ def filter_crosswalk_by_epacems(

Args:
crosswalk: the EPA crosswalk, as from pudl.output.epacems.epa_crosswalk()
unique_epacems_ids (pd.DataFrame): unique ids from _get_unique_keys
unique_epacems_ids: unique ids from _get_unique_keys

Returns:
The inner join of the EPA crosswalk and unique epacems units. Adds
the global ID column unit_id_epa.
The inner join of the EPA crosswalk and unique epacems units. Adds the global ID
column unit_id_epa.

"""
unique_epacems_ids = _get_unique_keys(epacems)
@@ -94,13 +93,14 @@
def filter_out_unmatched(crosswalk: pd.DataFrame) -> pd.DataFrame:
"""Remove unmatched or excluded (non-exporting) units.

Unmatched rows are limitations of the completeness of the EPA crosswalk itself, not of PUDL.
Unmatched rows are limitations of the completeness of the EPA crosswalk itself, not
of PUDL.

Args:
crosswalk (pd.DataFrame): the EPA crosswalk, as from pudl.output.epacems.epa_crosswalk()
crosswalk: the EPA crosswalk, as from :func:`pudl.output.epacems.epa_crosswalk`

Returns:
pd.DataFrame: the EPA crosswalk with unmatched units removed
The EPA crosswalk with unmatched units removed.
"""
bad = crosswalk["MATCH_TYPE_GEN"].isin({"CAMD Unmatched", "Manual CAMD Excluded"})
return crosswalk.loc[~bad].copy()
@@ -110,10 +110,10 @@ def filter_out_boiler_rows(crosswalk: pd.DataFrame) -> pd.DataFrame:
"""Remove rows that represent graph edges between generators and boilers.

Args:
crosswalk (pd.DataFrame): the EPA crosswalk, as from pudl.output.epacems.epa_crosswalk()
crosswalk: the EPA crosswalk, as from :func:`pudl.output.epacems.epa_crosswalk`

Returns:
pd.DataFrame: the EPA crosswalk with boiler rows (many/one-to-many) removed
The EPA crosswalk with boiler rows (many/one-to-many) removed
"""
crosswalk = crosswalk.drop_duplicates(
subset=["CAMD_PLANT_ID", "CAMD_UNIT_ID", "EIA_GENERATOR_ID"]
@@ -125,10 +125,11 @@ def _prep_for_networkx(crosswalk: pd.DataFrame) -> pd.DataFrame:
"""Make surrogate keys for combustors and generators.

Args:
crosswalk (pd.DataFrame): EPA crosswalk, as from pudl.output.epacems.epa_crosswalk()
crosswalk: EPA crosswalk, as from :func:`pudl.output.epacems.epa_crosswalk`

Returns:
pd.DataFrame: copy of EPA crosswalk with new surrogate ID columns 'combustor_id' and 'generator_id'
A copy of EPA crosswalk with new surrogate ID columns 'combustor_id' and
'generator_id'
"""
prepped = crosswalk.copy()
# networkx can't handle composite keys, so make surrogates
@@ -145,13 +146,13 @@


def _subplant_ids_from_prepped_crosswalk(prepped: pd.DataFrame) -> pd.DataFrame:
"""Use networkx graph analysis to create global subplant IDs from a preprocessed crosswalk edge list.
"""Use graph analysis to create global subplant IDs from a crosswalk edge list.

Args:
prepped (pd.DataFrame): an EPA crosswalk that has passed through _prep_for_networkx()
prepped: an EPA crosswalk that has passed through :func:`_prep_for_networkx`

Returns:
pd.DataFrame: copy of EPA crosswalk plus new column 'global_subplant_id'
A copy of EPA crosswalk plus new column 'global_subplant_id'
"""
graph = nx.from_pandas_edgelist(
prepped,
@@ -171,27 +172,29 @@ def _subplant_ids_from_prepped_crosswalk(prepped: pd.DataFrame) -> pd.DataFrame:
def _convert_global_id_to_composite_id(
crosswalk_with_ids: pd.DataFrame,
) -> pd.DataFrame:
"""Convert global_subplant_id to an equivalent composite key (CAMD_PLANT_ID, subplant_id).
"""Convert global_subplant_id to a composite key (CAMD_PLANT_ID, subplant_id).

The composite key will be much more stable (though not fully stable!) in time.
The global ID changes if ANY unit or generator changes, whereas the
compound key only changes if units/generators change within that specific plant.

A global ID could also tempt users into using it as a crutch, even though it isn't stable.
A compound key should discourage that behavior.
A global ID could also tempt users into using it as a crutch, even though it isn't
stable. A compound key should discourage that behavior.

Args:
crosswalk_with_ids (pd.DataFrame): crosswalk with global_subplant_id, as from _subplant_ids_from_prepped_crosswalk()
crosswalk_with_ids: crosswalk with global_subplant_id, as from
:func:`_subplant_ids_from_prepped_crosswalk`

Raises:
ValueError: if crosswalk_with_ids has a MultiIndex

Returns:
pd.DataFrame: copy of crosswalk_with_ids with an added column: 'subplant_id'
A copy of crosswalk_with_ids with an added column: 'subplant_id'
"""
if isinstance(crosswalk_with_ids.index, pd.MultiIndex):
raise ValueError(
f"Input crosswalk must have single level index. Given levels: {crosswalk_with_ids.index.names}"
"Input crosswalk must have single level index. "
f"Given levels: {crosswalk_with_ids.index.names}"
)

reindexed = crosswalk_with_ids.reset_index() # copy
@@ -223,16 +226,20 @@


def filter_crosswalk(
crosswalk: pd.DataFrame, epacems: Union[pd.DataFrame, dd.DataFrame]
crosswalk: pd.DataFrame, epacems: pd.DataFrame | dd.DataFrame
) -> pd.DataFrame:
"""Remove crosswalk rows that do not correspond to an EIA facility or are duplicated due to many-to-many boiler relationships.
"""Remove irrelevant or duplicated rows from the crosswalk.

Remove crosswalk rows that do not correspond to an EIA facility or are duplicated
due to many-to-many boiler relationships.

Args:
crosswalk (pd.DataFrame): The EPA/EIA crosswalk, as from pudl.output.epacems.epa_crosswalk()
epacems (Union[pd.DataFrame, dd.DataFrame]): Emissions data. Must contain columns named ["plant_id_eia", "unitid", "unit_id_epa"]
crosswalk: The EPA/EIA crosswalk from :func:`pudl.output.epacems.epa_crosswalk`
epacems: Emissions data. Must contain columns named
["plant_id_eia", "unitid", "unit_id_epa"]

Returns:
pd.DataFrame: A filtered copy of EPA crosswalk
A filtered copy of EPA crosswalk.
"""
filtered_crosswalk = filter_out_unmatched(crosswalk)
filtered_crosswalk = filter_out_boiler_rows(filtered_crosswalk)
@@ -241,7 +248,9 @@ def filter_crosswalk(


def make_subplant_ids(crosswalk: pd.DataFrame) -> pd.DataFrame:
"""Identify sub-plants in the EPA/EIA crosswalk graph. Any row filtering should be done before this step.
"""Identify sub-plants in the EPA/EIA crosswalk graph.

Any row filtering should be done before this step.

Usage Example:

Expand All @@ -251,10 +260,11 @@ def make_subplant_ids(crosswalk: pd.DataFrame) -> pd.DataFrame:
crosswalk_with_subplant_ids = make_subplant_ids(filtered_crosswalk)

Args:
crosswalk (pd.DataFrame): The EPA/EIA crosswalk, as from pudl.output.epacems.epa_crosswalk()
crosswalk: The EPA/EIA crosswalk, from :func:`pudl.output.epacems.epa_crosswalk`

Returns:
pd.DataFrame: An edge list connecting EPA units to EIA generators, with connected pieces issued a subplant_id
An edge list connecting EPA units to EIA generators, with connected pieces
issued a subplant_id
"""
edge_list = _prep_for_networkx(crosswalk)
edge_list = _subplant_ids_from_prepped_crosswalk(edge_list)
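
As background for the graph analysis described in the epa_crosswalk module docstring, here is a minimal, self-contained sketch of the subplant-ID idea using made-up IDs (the real implementation first builds surrogate keys in _prep_for_networkx and then assigns IDs in _subplant_ids_from_prepped_crosswalk):

    import networkx as nx
    import pandas as pd

    # Hypothetical edge list: each row links one combustor to one generator.
    edges = pd.DataFrame(
        {
            "combustor_id": [0, 0, 1, 2],
            "generator_id": [10, 11, 11, 12],
        }
    )

    # Build the bipartite graph whose nodes are combustors and generators.
    graph = nx.from_pandas_edgelist(edges, source="combustor_id", target="generator_id")

    # Each connected component is one subplant: a set of units that share equipment.
    for subplant_id, component in enumerate(nx.connected_components(graph)):
        print(subplant_id, sorted(component))

    # Combustors 0 and 1 share generator 11, so they land in one subplant;
    # combustor 2 and generator 12 form a second, independent subplant.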