From dd23b3386f35442cc5c0de7750940dd71458f40a Mon Sep 17 00:00:00 2001
From: Zane Selvans
Date: Fri, 10 Jun 2022 11:47:38 -0500
Subject: [PATCH 1/6] Require py3.10 and only run tests on py3.10

---
 .github/workflows/tox-pytest.yml | 4 +---
 setup.py                         | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/tox-pytest.yml b/.github/workflows/tox-pytest.yml
index 4e65edcafe..1522137ccc 100644
--- a/.github/workflows/tox-pytest.yml
+++ b/.github/workflows/tox-pytest.yml
@@ -6,8 +6,6 @@ jobs:
   ci-test:
     runs-on: ubuntu-latest
     strategy:
-      matrix:
-        python-version: ["3.8", "3.9", "3.10"]
       fail-fast: false
 
     steps:
@@ -21,7 +19,7 @@ jobs:
           mamba-version: "*"
           channels: conda-forge,defaults
           channel-priority: true
-          python-version: ${{ matrix.python-version }}
+          python-version: "3.10"
           activate-environment: pudl-test
           environment-file: test/test-environment.yml
       - shell: bash -l {0}
diff --git a/setup.py b/setup.py
index 6cf4bb4733..bc02127fb8 100644
--- a/setup.py
+++ b/setup.py
@@ -45,7 +45,7 @@
         "eia 861",
         "ferc 714",
     ],
-    python_requires=">=3.8,<3.11",
+    python_requires=">=3.10,<3.11",
     setup_requires=["setuptools_scm"],
     install_requires=[
         "addfips>=0.3.1,<0.4.0",

From 456f9354353d353e30aff6f077f6810b51d27af0 Mon Sep 17 00:00:00 2001
From: Zane Selvans
Date: Fri, 10 Jun 2022 13:30:59 -0500
Subject: [PATCH 2/6] Update RTD to use python 3.10 and mambaforge

---
 .readthedocs.yaml         | 28 +++++++++++++++++-----------
 docs/docs-environment.yml | 13 +++++++++++++
 2 files changed, 30 insertions(+), 11 deletions(-)
 create mode 100644 docs/docs-environment.yml

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 97f7b6bfcc..aa234771db 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -5,20 +5,26 @@
 # Required
 version: 2
 
+# Set the version of Python and other tools you might need
+build:
+  os: ubuntu-22.04
+  tools:
+    python: mambaforge-4.10
+
+# Define the python environment using conda / mamba
+conda:
+  environment: docs/docs-environment.yml
+
 # Build documentation in the docs/ directory with Sphinx
 sphinx:
-  configuration: docs/conf.py
-
-build:
-  image: testing
-  apt_packages:
-    - libsnappy-dev
+  configuration: docs/conf.py
+  builder: html
+  fail_on_warning: true
 
 # Set the version of Python and requirements required to build your docs
 python:
-  version: "3.9"
   install:
-    - method: pip
-      path: .
-      extra_requirements:
-        - doc
+    - method: pip
+      path: .
+      extra_requirements:
+        - doc
diff --git a/docs/docs-environment.yml b/docs/docs-environment.yml
new file mode 100644
index 0000000000..ebd83449cf
--- /dev/null
+++ b/docs/docs-environment.yml
@@ -0,0 +1,13 @@
+name: pudl-docs
+channels:
+  - conda-forge
+dependencies:
+  - geopandas>=0.9,<11
+  - numba>=0.55.1,<0.56
+  - pip>=22,<23
+  - pygeos>=0.10,<0.13
+  - python>=3.10,<3.11
+  - python-snappy>=0.6,<1
+  - setuptools<63
+  - sqlite>=3.36,<4
+  - tox>=3.24,<4

From 0722a0cc5a6d70a021b05760d27b27d12eeff692 Mon Sep 17 00:00:00 2001
From: Zane Selvans
Date: Fri, 10 Jun 2022 13:46:29 -0500
Subject: [PATCH 3/6] Update PyPI metadata on compatible Python versions.
--- setup.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.py b/setup.py index bc02127fb8..ed6dd5412b 100644 --- a/setup.py +++ b/setup.py @@ -128,8 +128,6 @@ "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Topic :: Scientific/Engineering", ], From faa3efc1ebe379552c231c91942e086edb43484b Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Fri, 10 Jun 2022 15:34:08 -0500 Subject: [PATCH 4/6] Update type hints with PEP585 compatible types. --- .pre-commit-config.yaml | 6 ++ .../find_unmapped_plants_utils.py | 3 +- src/pudl/analysis/allocate_net_gen.py | 5 +- src/pudl/analysis/plant_parts_eia.py | 22 ++--- src/pudl/analysis/spatial.py | 3 +- src/pudl/analysis/state_demand.py | 12 +-- src/pudl/analysis/timeseries_cleaning.py | 12 +-- src/pudl/etl.py | 26 +++--- src/pudl/extract/ferc1.py | 7 +- src/pudl/glue/eia_epacems.py | 5 +- src/pudl/glue/ferc1_eia.py | 10 +-- src/pudl/helpers.py | 22 ++--- src/pudl/load.py | 3 +- src/pudl/metadata/classes.py | 88 ++++++++----------- src/pudl/metadata/codes.py | 4 +- src/pudl/metadata/constants.py | 18 ++-- src/pudl/metadata/enums.py | 27 +++--- src/pudl/metadata/fields.py | 20 ++--- src/pudl/metadata/helpers.py | 33 +++---- src/pudl/metadata/labels.py | 15 ++-- src/pudl/metadata/resources/__init__.py | 5 +- src/pudl/metadata/resources/eia.py | 4 +- src/pudl/metadata/resources/eia860.py | 4 +- src/pudl/metadata/resources/eia861.py | 6 +- src/pudl/metadata/resources/eia923.py | 4 +- src/pudl/metadata/resources/epacems.py | 4 +- src/pudl/metadata/resources/ferc1.py | 4 +- src/pudl/metadata/resources/ferc714.py | 4 +- src/pudl/metadata/resources/glue.py | 4 +- src/pudl/metadata/resources/pudl.py | 4 +- src/pudl/metadata/sources.py | 4 +- src/pudl/output/epacems.py | 6 +- src/pudl/output/ferc714.py | 6 +- src/pudl/settings.py | 34 +++---- src/pudl/transform/eia.py | 7 +- src/pudl/transform/eia861.py | 3 +- src/pudl/transform/eia923.py | 9 +- src/pudl/transform/ferc1.py | 9 +- src/pudl/workspace/datastore.py | 24 ++--- src/pudl/workspace/resource_cache.py | 6 +- test/unit/analysis/epa_crosswalk_test.py | 4 +- test/unit/analysis/state_demand_test.py | 4 +- .../unit/analysis/timeseries_cleaning_test.py | 9 +- test/unit/harvest_test.py | 16 ++-- test/unit/workspace/datastore_test.py | 3 +- 45 files changed, 256 insertions(+), 272 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 18fe940a2c..2c84b9efcb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -54,6 +54,12 @@ repos: hooks: - id: rm-unneeded-f-str +# Use built-in types for annotations as per PEP585 +- repo: https://github.com/sondrelg/pep585-upgrade + rev: 'v1.0' + hooks: + - id: upgrade-type-hints + ######################################################################################## # Linters: hooks that check but don't alter Python and documentation files ######################################################################################## diff --git a/devtools/ferc1-eia-glue/find_unmapped_plants_utils.py b/devtools/ferc1-eia-glue/find_unmapped_plants_utils.py index 099c289071..1b96f5aa00 100755 --- a/devtools/ferc1-eia-glue/find_unmapped_plants_utils.py +++ b/devtools/ferc1-eia-glue/find_unmapped_plants_utils.py @@ -69,7 +69,6 @@ import logging import sys from pathlib import Path -from typing import Dict import coloredlogs import 
pandas as pd @@ -96,7 +95,7 @@ MAX_LOST_PLANTS_EIA: int = 50 MAX_LOST_UTILS_EIA: int = 10 -PUDL_SETTINGS: Dict[str, str] = pudl.workspace.setup.get_defaults() +PUDL_SETTINGS: dict[str, str] = pudl.workspace.setup.get_defaults() def parse_command_line(argv: str) -> argparse.Namespace: diff --git a/src/pudl/analysis/allocate_net_gen.py b/src/pudl/analysis/allocate_net_gen.py index 2645fb10dd..a1b082fd07 100644 --- a/src/pudl/analysis/allocate_net_gen.py +++ b/src/pudl/analysis/allocate_net_gen.py @@ -83,7 +83,6 @@ import logging import warnings -from typing import List # Useful high-level external modules. import numpy as np @@ -275,8 +274,8 @@ def scale_allocated_net_gen_by_ownership( def agg_by_generator( gen_pm_fuel: pd.DataFrame, - by_cols: List[str] = IDX_GENS, - sum_cols: List[str] = ["net_generation_mwh", "fuel_consumed_mmbtu"], + by_cols: list[str] = IDX_GENS, + sum_cols: list[str] = ["net_generation_mwh", "fuel_consumed_mmbtu"], ) -> pd.DataFrame: """Aggreate the allocated gen fuel data to the generator level. diff --git a/src/pudl/analysis/plant_parts_eia.py b/src/pudl/analysis/plant_parts_eia.py index 88e372c5b2..12ebd0613a 100644 --- a/src/pudl/analysis/plant_parts_eia.py +++ b/src/pudl/analysis/plant_parts_eia.py @@ -182,7 +182,7 @@ import logging import warnings from copy import deepcopy -from typing import Dict, List, Literal, Optional +from typing import Literal, Optional import numpy as np import pandas as pd @@ -200,7 +200,7 @@ pd.options.display.width = 1000 pd.options.display.max_columns = 1000 -PLANT_PARTS: Dict[str, Dict[str, List]] = { +PLANT_PARTS: dict[str, dict[str, list]] = { "plant": { "id_cols": ["plant_id_eia"], }, @@ -236,7 +236,7 @@ """ -PLANT_PARTS_ORDERED: List[str] = [ +PLANT_PARTS_ORDERED: list[str] = [ "plant", "plant_unit", "plant_prime_mover", @@ -259,7 +259,7 @@ ] -IDX_TO_ADD: List[str] = ["report_date", "operational_status_pudl"] +IDX_TO_ADD: list[str] = ["report_date", "operational_status_pudl"] """ list: list of additional columns to add to the id_cols in :py:const:`PLANT_PARTS`. The id_cols are the base columns that we need to aggregate on, but we also need @@ -268,7 +268,7 @@ non-operating plant-parts. """ -IDX_OWN_TO_ADD: List[str] = ["utility_id_eia", "ownership"] +IDX_OWN_TO_ADD: list[str] = ["utility_id_eia", "ownership"] """ list: list of additional columns beyond the :py:const:`IDX_TO_ADD` to add to the id_cols in :py:const:`PLANT_PARTS` when we are dealing with plant-part records @@ -276,7 +276,7 @@ owners. """ -SUM_COLS: List[str] = [ +SUM_COLS: list[str] = [ "total_fuel_cost", "net_generation_mwh", "capacity_mw", @@ -424,7 +424,7 @@ def execute( self, mcoe: pd.DataFrame, own_eia860: pd.DataFrame, - slice_cols: List[str] = SUM_COLS, + slice_cols: list[str] = SUM_COLS, validate_own_merge: str = "1:m", ) -> pd.DataFrame: """Make the mega generators table with ownership integrated. @@ -857,8 +857,8 @@ def __init__(self, part_name: PLANT_PARTS_LITERAL): def execute( self, gens_mega: pd.DataFrame, - sum_cols: List[str] = SUM_COLS, - wtavg_dict: Dict = WTAVG_DICT, + sum_cols: list[str] = SUM_COLS, + wtavg_dict: dict = WTAVG_DICT, ) -> pd.DataFrame: """Get a table of data aggregated by a specific plant-part. @@ -1144,7 +1144,7 @@ def __init__( self, attribute_col: str, part_name: str, - assign_col_dict: Optional[Dict[str, str]] = None, + assign_col_dict: Optional[dict[str, str]] = None, ): """Initialize a attribute adder. 
@@ -1497,7 +1497,7 @@ def match_to_single_plant_part( multi_gran_df: pd.DataFrame, ppl: pd.DataFrame, part_name: PLANT_PARTS_LITERAL = "plant_gen", - cols_to_keep: List[str] = [], + cols_to_keep: list[str] = [], ) -> pd.DataFrame: """Match data with a variety of granularities to a single plant-part. diff --git a/src/pudl/analysis/spatial.py b/src/pudl/analysis/spatial.py index a6b84ddd25..723c4982b7 100644 --- a/src/pudl/analysis/spatial.py +++ b/src/pudl/analysis/spatial.py @@ -1,7 +1,8 @@ """Spatial operations for demand allocation.""" import itertools import warnings -from typing import Callable, Iterable, Literal, Union +from collections.abc import Callable, Iterable +from typing import Literal, Union import geopandas as gpd import pandas as pd diff --git a/src/pudl/analysis/state_demand.py b/src/pudl/analysis/state_demand.py index 4c6a260066..ff56e06874 100644 --- a/src/pudl/analysis/state_demand.py +++ b/src/pudl/analysis/state_demand.py @@ -22,13 +22,13 @@ PUDL_DIR/local/state-demand/demand.csv """ - import argparse import datetime import logging import pathlib import sys -from typing import Any, Dict, Iterable, List, Tuple, Union +from collections.abc import Iterable +from typing import Any, Union import matplotlib.pyplot as plt import numpy as np @@ -45,7 +45,7 @@ # --- Constants --- # -STATES: List[Dict[str, Union[str, int]]] = [ +STATES: list[dict[str, Union[str, int]]] = [ {"name": "Alabama", "code": "AL", "fips": "01"}, {"name": "Alaska", "code": "AK", "fips": "02"}, {"name": "Arizona", "code": "AZ", "fips": "04"}, @@ -111,7 +111,7 @@ """ -STANDARD_UTC_OFFSETS: Dict[str, str] = { +STANDARD_UTC_OFFSETS: dict[str, str] = { "Pacific/Honolulu": -10, "America/Anchorage": -9, "America/Los_Angeles": -8, @@ -127,7 +127,7 @@ """ -UTC_OFFSETS: Dict[str, int] = { +UTC_OFFSETS: dict[str, int] = { "HST": -10, "AKST": -9, "AKDT": -8, @@ -324,7 +324,7 @@ def load_ventyx_hourly_state_demand(path: str) -> pd.DataFrame: def load_ferc714_hourly_demand_matrix( pudl_out: pudl.output.pudltabl.PudlTabl, -) -> Tuple[pd.DataFrame, pd.DataFrame]: +) -> tuple[pd.DataFrame, pd.DataFrame]: """Read and format FERC 714 hourly demand into matrix form. Args: diff --git a/src/pudl/analysis/timeseries_cleaning.py b/src/pudl/analysis/timeseries_cleaning.py index a70b7f6946..1fbc3dc2c0 100644 --- a/src/pudl/analysis/timeseries_cleaning.py +++ b/src/pudl/analysis/timeseries_cleaning.py @@ -29,10 +29,10 @@ * https://github.com/xinychen/tensor-learning """ - import functools import warnings -from typing import Any, Iterable, List, Sequence, Tuple, Union +from collections.abc import Iterable, Sequence +from typing import Any, Union import matplotlib.pyplot as plt import numpy as np @@ -44,7 +44,7 @@ def slice_axis( x: np.ndarray, start: int = None, end: int = None, step: int = None, axis: int = 0 -) -> Tuple[slice, ...]: +) -> tuple[slice, ...]: """Return an index that slices an array along an axis. Args: @@ -109,7 +109,7 @@ def array_diff( return dx -def encode_run_length(x: Union[Sequence, np.ndarray]) -> Tuple[np.ndarray, np.ndarray]: +def encode_run_length(x: Union[Sequence, np.ndarray]) -> tuple[np.ndarray, np.ndarray]: """Encode vector with run-length encoding. 
Args: @@ -566,7 +566,7 @@ def __init__(self, x: Union[np.ndarray, pd.DataFrame]) -> None: self.columns = pd.RangeIndex(x.shape[1]) self.x: np.ndarray = self.xi.copy() self.flags: np.ndarray = np.empty(self.x.shape, dtype=object) - self.flagged: List[str] = [] + self.flagged: list[str] = [] def to_dataframe(self, array: np.ndarray = None, copy: bool = True) -> pd.DataFrame: """Return multivariate timeseries as a :class:`pandas.DataFrame`. @@ -768,7 +768,7 @@ def flag_local_outlier( shifts: Sequence[int] = range(-240, 241, 24), long_window: int = 480, iqr_window: int = 240, - multiplier: Tuple[float, float] = (3.5, 2.5), + multiplier: tuple[float, float] = (3.5, 2.5), ) -> None: """Flag local outliers (LOCAL_OUTLIER_HIGH, LOCAL_OUTLIER_LOW). diff --git a/src/pudl/etl.py b/src/pudl/etl.py index ed986d71e6..5c9d68c5de 100644 --- a/src/pudl/etl.py +++ b/src/pudl/etl.py @@ -20,7 +20,7 @@ from concurrent.futures import ProcessPoolExecutor from functools import partial from pathlib import Path -from typing import Any, Dict, List +from typing import Any import pandas as pd import pyarrow as pa @@ -50,7 +50,7 @@ ############################################################################### -def _read_static_tables_eia() -> Dict[str, pd.DataFrame]: +def _read_static_tables_eia() -> dict[str, pd.DataFrame]: """Build dataframes of static EIA tables for use as foreign key constraints. There are many values specified within the data that are essentially @@ -75,8 +75,8 @@ def _read_static_tables_eia() -> Dict[str, pd.DataFrame]: def _etl_eia( - eia_settings: EiaSettings, ds_kwargs: Dict[str, Any] -) -> Dict[str, pd.DataFrame]: + eia_settings: EiaSettings, ds_kwargs: dict[str, Any] +) -> dict[str, pd.DataFrame]: """Extract, transform and load CSVs for the EIA datasets. Args: @@ -163,7 +163,7 @@ def _etl_eia( ############################################################################### -def _read_static_tables_ferc1() -> Dict[str, pd.DataFrame]: +def _read_static_tables_ferc1() -> dict[str, pd.DataFrame]: """Populate static PUDL tables with constants for use as foreign keys. There are many values specified within the data that are essentially @@ -186,8 +186,8 @@ def _read_static_tables_ferc1() -> Dict[str, pd.DataFrame]: def _etl_ferc1( ferc1_settings: Ferc1Settings, - pudl_settings: Dict[str, Any], -) -> Dict[str, pd.DataFrame]: + pudl_settings: dict[str, Any], +) -> dict[str, pd.DataFrame]: """Extract, transform and load CSVs for FERC Form 1. Args: @@ -221,10 +221,10 @@ def _etl_ferc1( ############################################################################### def _etl_one_year_epacems( year: int, - states: List[str], + states: list[str], pudl_db: str, out_dir: str, - ds_kwargs: Dict[str, Any], + ds_kwargs: dict[str, Any], ) -> None: """Process one year of EPA CEMS and output year-state paritioned Parquet files.""" pudl_engine = sa.create_engine(pudl_db) @@ -248,8 +248,8 @@ def _etl_one_year_epacems( def etl_epacems( epacems_settings: EpaCemsSettings, - pudl_settings: Dict[str, Any], - ds_kwargs: Dict[str, Any], + pudl_settings: dict[str, Any], + ds_kwargs: dict[str, Any], ) -> None: """Extract, transform and load CSVs for EPA CEMS. 
@@ -341,7 +341,7 @@ def etl_epacems( ############################################################################### # GLUE EXPORT FUNCTIONS ############################################################################### -def _etl_glue(glue_settings: GlueSettings) -> Dict[str, pd.DataFrame]: +def _etl_glue(glue_settings: GlueSettings) -> dict[str, pd.DataFrame]: """Extract, transform and load CSVs for the Glue tables. Args: @@ -373,7 +373,7 @@ def _etl_glue(glue_settings: GlueSettings) -> Dict[str, pd.DataFrame]: def etl( # noqa: C901 etl_settings: EtlSettings, - pudl_settings: Dict, + pudl_settings: dict, clobber: bool = False, use_local_cache: bool = True, gcs_cache_path: str = None, diff --git a/src/pudl/extract/ferc1.py b/src/pudl/extract/ferc1.py index aa06e6f705..1e3c98935a 100644 --- a/src/pudl/extract/ferc1.py +++ b/src/pudl/extract/ferc1.py @@ -53,7 +53,6 @@ import io import logging from pathlib import Path -from typing import Dict, Set import dbfread import pandas as pd @@ -145,7 +144,7 @@ def missing_respondents(reported, observed, identified): return records -def observed_respondents(ferc1_engine: sa.engine.Engine) -> Set[int]: +def observed_respondents(ferc1_engine: sa.engine.Engine) -> set[int]: """Compile the set of all observed respondent IDs found in the FERC 1 database. A significant number of FERC 1 respondent IDs appear in the data tables, but not @@ -182,8 +181,8 @@ class Ferc1Datastore: def __init__(self, datastore: Datastore): """Instantiate datastore wrapper for ferc1 resources.""" self.datastore = datastore - self._cache: Dict[int, io.BytesIO] = {} - self.dbc_path: Dict[int, Path] = {} + self._cache: dict[int, io.BytesIO] = {} + self.dbc_path: dict[int, Path] = {} with importlib.resources.open_text(self.PACKAGE_PATH, "file_map.csv") as f: for row in csv.DictReader(f): diff --git a/src/pudl/glue/eia_epacems.py b/src/pudl/glue/eia_epacems.py index 53e985d813..2f0037da5d 100644 --- a/src/pudl/glue/eia_epacems.py +++ b/src/pudl/glue/eia_epacems.py @@ -16,7 +16,6 @@ """ import importlib import logging -from typing import Dict import pandas as pd @@ -64,7 +63,7 @@ def grab_n_clean_epa_orignal(): return eia_epacems_crosswalk -def split_tables(df: pd.DataFrame) -> Dict[str, pd.DataFrame]: +def split_tables(df: pd.DataFrame) -> dict[str, pd.DataFrame]: """Split the cleaned EIA-EPA crosswalk table into three normalized tables. Args: @@ -98,7 +97,7 @@ def split_tables(df: pd.DataFrame) -> Dict[str, pd.DataFrame]: } -def grab_clean_split() -> Dict[str, pd.DataFrame]: +def grab_clean_split() -> dict[str, pd.DataFrame]: """Clean raw crosswalk data, drop nans, and return split tables. 
Returns: diff --git a/src/pudl/glue/ferc1_eia.py b/src/pudl/glue/ferc1_eia.py index 603d3fa39f..ff2a061c73 100644 --- a/src/pudl/glue/ferc1_eia.py +++ b/src/pudl/glue/ferc1_eia.py @@ -31,7 +31,7 @@ """ import importlib import logging -from typing import Dict, Iterable, List +from collections.abc import Iterable import pandas as pd import sqlalchemy as sa @@ -43,7 +43,7 @@ # Identify only those utilities assocaited with plants that reported data # at some point in the EIA 923 -- these are the ones we might need to link # to the FERC Form 1 utilities: -DATA_TABLES_EIA923: List[str] = [ +DATA_TABLES_EIA923: list[str] = [ "boiler_fuel_eia923", "fuel_receipts_costs_eia923", "generation_eia923", @@ -100,7 +100,7 @@ def get_utility_map() -> pd.DataFrame: def get_db_plants_ferc1( - pudl_settings: Dict[str, str], years: Iterable[int] + pudl_settings: dict[str, str], years: Iterable[int] ) -> pd.DataFrame: """Pull a dataframe of all plants in the FERC Form 1 DB for the given years. @@ -283,7 +283,7 @@ def get_mapped_utils_ferc1(): def get_unmapped_plants_ferc1( - pudl_settings: Dict[str, str], + pudl_settings: dict[str, str], years: Iterable[int], ) -> pd.DataFrame: """Generate a DataFrame of all unmapped FERC plants in the given years. @@ -519,7 +519,7 @@ def get_mapped_utils_eia() -> pd.DataFrame: def get_unmapped_utils_eia( pudl_engine: sa.engine.Engine, - data_tables_eia923: List[str] = DATA_TABLES_EIA923, + data_tables_eia923: list[str] = DATA_TABLES_EIA923, ) -> pd.DataFrame: """Get a list of all the EIA Utilities in the PUDL DB without PUDL IDs. diff --git a/src/pudl/helpers.py b/src/pudl/helpers.py index 70a16159ec..17b419e597 100644 --- a/src/pudl/helpers.py +++ b/src/pudl/helpers.py @@ -16,7 +16,7 @@ from functools import partial from importlib import resources from io import BytesIO -from typing import Any, DefaultDict, Dict, List, Literal, Optional, Set, Union +from typing import Any, Literal, Optional, Union import addfips import numpy as np @@ -46,7 +46,7 @@ def label_map( from_col: str = "code", to_col: str = "label", null_value: Union[str, type(pd.NA)] = pd.NA, -) -> DefaultDict[str, Union[str, type(pd.NA)]]: +) -> defaultdict[str, Union[str, type(pd.NA)]]: """Build a mapping dictionary from two columns of a labeling / coding dataframe. These dataframes document the meanings of the codes that show up in much of the @@ -78,9 +78,9 @@ def label_map( def find_new_ferc1_strings( table: str, field: str, - strdict: Dict[str, List[str]], + strdict: dict[str, list[str]], ferc1_engine: sa.engine.Engine, -) -> Set[str]: +) -> set[str]: """Identify as-of-yet uncategorized freeform strings in FERC Form 1. Args: @@ -104,7 +104,7 @@ def find_new_ferc1_strings( return all_strings.difference(old_strings) -def find_foreign_key_errors(dfs: Dict[str, pd.DataFrame]) -> List[Dict[str, Any]]: +def find_foreign_key_errors(dfs: dict[str, pd.DataFrame]) -> list[dict[str, Any]]: """Report foreign key violations from a dictionary of dataframes. 
The database schema to check against is generated based on the names of the @@ -356,11 +356,11 @@ def convert_col_to_datetime(df, date_col_name): def full_timeseries_date_merge( left: pd.DataFrame, right: pd.DataFrame, - on: List[str], + on: list[str], left_date_col: str = "report_date", right_date_col: str = "report_date", new_date_col: str = "report_date", - date_on: List[str] = ["year"], + date_on: list[str] = ["year"], how: Literal["inner", "outer", "left", "right", "cross"] = "inner", report_at_start: bool = True, freq: str = "MS", @@ -408,11 +408,11 @@ def _add_suffix_to_date_on(date_on): def date_merge( left: pd.DataFrame, right: pd.DataFrame, - on: List[str], + on: list[str], left_date_col: str = "report_date", right_date_col: str = "report_date", new_date_col: str = "report_date", - date_on: List[str] = None, + date_on: list[str] = None, how: Literal["inner", "outer", "left", "right", "cross"] = "inner", report_at_start: bool = True, **kwargs, @@ -516,7 +516,7 @@ def separate_date_cols(df, date_col_name, date_on): def expand_timeseries( df: pd.DataFrame, - key_cols: List[str], + key_cols: list[str], date_col: str = "report_date", freq: str = "MS", fill_through_freq: Literal["year", "month", "day"] = "year", @@ -1477,7 +1477,7 @@ def sum_and_weighted_average_agg( df_in: pd.DataFrame, by: list, sum_cols: list, - wtavg_dict: Dict[str, str], + wtavg_dict: dict[str, str], ) -> pd.DataFrame: """Aggregate dataframe by summing and using weighted averages. diff --git a/src/pudl/load.py b/src/pudl/load.py index d12a54f5ed..b6ab79a088 100644 --- a/src/pudl/load.py +++ b/src/pudl/load.py @@ -4,7 +4,6 @@ import sys from sqlite3 import Connection as SQLite3Connection from sqlite3 import sqlite_version -from typing import Dict import pandas as pd import sqlalchemy as sa @@ -20,7 +19,7 @@ def dfs_to_sqlite( - dfs: Dict[str, pd.DataFrame], + dfs: dict[str, pd.DataFrame], engine: sa.engine.Engine, check_foreign_keys: bool = True, check_types: bool = True, diff --git a/src/pudl/metadata/classes.py b/src/pudl/metadata/classes.py index c081c287f7..e376b0093f 100644 --- a/src/pudl/metadata/classes.py +++ b/src/pudl/metadata/classes.py @@ -4,20 +4,10 @@ import logging import re import sys +from collections.abc import Callable, Iterable from functools import lru_cache from pathlib import Path -from typing import ( - Any, - Callable, - Dict, - Iterable, - List, - Literal, - Optional, - Tuple, - Type, - Union, -) +from typing import Any, Literal, Optional, Union import jinja2 import pandas as pd @@ -157,7 +147,7 @@ class Base(pydantic.BaseModel): Examples: >>> class Class(Base): - ... fields_: List[str] = pydantic.Field(alias="fields") + ... fields_: list[str] = pydantic.Field(alias="fields") >>> m = Class(fields=['x']) >>> m Class(fields=['x']) @@ -196,7 +186,7 @@ def __setattr__(self, name, value) -> None: name = f"{name}_" super().__setattr__(name, value) - def __repr_args__(self) -> List[Tuple[str, Any]]: + def __repr_args__(self) -> list[tuple[str, Any]]: """Returns the attributes to show in __str__, __repr__, and __pretty__.""" return [ (a[:-1] if a in ("fields_", "schema_") else a, v) @@ -284,7 +274,7 @@ def validate(cls, value: Any) -> re.Pattern: return value -def StrictList(item_type: Type = Any) -> pydantic.ConstrainedList: # noqa: N802 +def StrictList(item_type: type = Any) -> pydantic.ConstrainedList: # noqa: N802 """Non-empty :class:`list`. Allows :class:`list`, :class:`tuple`, :class:`set`, :class:`frozenset`, @@ -424,14 +414,14 @@ class Encoder(Base): values. 
""" - ignored_codes: List[Union[Int, str]] = [] + ignored_codes: list[Union[Int, str]] = [] """A list of non-standard codes which appear in the data, and will be set to NA. These codes may be the result of data entry errors, and we are unable to map them to the appropriate canonical code. They are discarded from the raw input data. """ - code_fixes: Dict[Union[Int, String], Union[Int, String]] = {} + code_fixes: dict[Union[Int, String], Union[Int, String]] = {} """A dictionary mapping non-standard codes to canonical, standardized codes. The intended meanings of some non-standard codes are clear, and therefore they can @@ -512,7 +502,7 @@ def _check_fixed_codes_are_good_codes(cls, code_fixes, values): # noqa: N805 return code_fixes @property - def code_map(self) -> Dict[str, Union[str, type(pd.NA)]]: + def code_map(self) -> dict[str, Union[str, type(pd.NA)]]: """A mapping of all known codes to their standardized values, or NA.""" code_map = {code: code for code in self.df["code"]} code_map.update(self.code_fixes) @@ -795,9 +785,9 @@ class Schema(Base): """ fields_: StrictList(Field) = pydantic.Field(alias="fields") - missing_values: List[pydantic.StrictStr] = [""] + missing_values: list[pydantic.StrictStr] = [""] primary_key: StrictList(SnakeCase) = None - foreign_keys: List[ForeignKey] = [] + foreign_keys: list[ForeignKey] = [] _check_unique = _validator( "missing_values", "primary_key", "foreign_keys", fn=_check_unique @@ -908,18 +898,18 @@ class DataSource(Base): title: String = None description: String = None field_namespace: String = None - keywords: List[str] = [] + keywords: list[str] = [] path: HttpUrl = None - contributors: List[Contributor] = [] # Or should this be compiled from Resources? + contributors: list[Contributor] = [] # Or should this be compiled from Resources? license_raw: License license_pudl: License # concept_doi: Doi = None # Need to define a Doi type? 
- working_partitions: Dict[SnakeCase, Any] = {} - source_file_dict: Dict[SnakeCase, Any] = {} + working_partitions: dict[SnakeCase, Any] = {} + source_file_dict: dict[SnakeCase, Any] = {} # agency: Agency # needs to be defined email: Email = None - def get_resource_ids(self) -> List[str]: + def get_resource_ids(self) -> list[str]: """Compile list of resoruce IDs associated with this data source.""" # Temporary check to use eia861.RESOURCE_METADATA directly # eia861 is not currently included in the general RESOURCE_METADATA dict @@ -935,7 +925,7 @@ def get_resource_ids(self) -> List[str]: ] ) - def get_temporal_coverage(self, partitions: Dict = None) -> str: + def get_temporal_coverage(self, partitions: dict = None) -> str: """Return a string describing the time span covered by the data source.""" if partitions is None: partitions = self.working_partitions @@ -960,8 +950,8 @@ def add_datastore_metadata(self) -> None: def to_rst( self, docs_dir: DirectoryPath, - source_resources: List, - extra_resources: List, + source_resources: list, + extra_resources: list, output_path: str = None, ) -> None: """Output a representation of the data source in RST for documentation.""" @@ -988,7 +978,7 @@ def to_rst( sys.stdout.write(rendered) @classmethod - def from_field_namespace(cls, x: str) -> List["DataSource"]: + def from_field_namespace(cls, x: str) -> list["DataSource"]: """Return list of DataSource objects by field namespace.""" return [ cls(**cls.dict_from_id(name)) @@ -1148,10 +1138,10 @@ class Resource(Base): description: String = None harvest: ResourceHarvest = {} schema_: Schema = pydantic.Field(alias="schema") - contributors: List[Contributor] = [] - licenses: List[License] = [] - sources: List[DataSource] = [] - keywords: List[String] = [] + contributors: list[Contributor] = [] + licenses: list[License] = [] + sources: list[DataSource] = [] + keywords: list[String] = [] encoder: Encoder = None field_namespace: Literal[ "eia", "epacems", "ferc1", "ferc714", "glue", "pudl" @@ -1319,7 +1309,7 @@ def to_pyarrow(self) -> pa.Schema: def to_pandas_dtypes( self, **kwargs: Any - ) -> Dict[str, Union[str, pd.CategoricalDtype]]: + ) -> dict[str, Union[str, pd.CategoricalDtype]]: """Return Pandas data type of each field by field name. Args: @@ -1327,7 +1317,7 @@ def to_pandas_dtypes( """ return {f.name: f.to_pandas_dtype(**kwargs) for f in self.schema.fields} - def match_primary_key(self, names: Iterable[str]) -> Optional[Dict[str, str]]: + def match_primary_key(self, names: Iterable[str]) -> Optional[dict[str, str]]: """Match primary key fields to input field names. An exact match is required unless :attr:`harvest` .`harvest=True`, @@ -1447,7 +1437,7 @@ def format_df(self, df: pd.DataFrame = None, **kwargs: Any) -> pd.DataFrame: def aggregate_df( self, df: pd.DataFrame, raised: bool = False, error: Callable = None - ) -> Tuple[pd.DataFrame, dict]: + ) -> tuple[pd.DataFrame, dict]: """Aggregate dataframe by primary key. The dataframe is grouped by primary key fields @@ -1555,11 +1545,11 @@ def _build_aggregation_report(self, df: pd.DataFrame, errors: dict) -> dict: def harvest_dfs( self, - dfs: Dict[str, pd.DataFrame], + dfs: dict[str, pd.DataFrame], aggregate: bool = None, - aggregate_kwargs: Dict[str, Any] = {}, - format_kwargs: Dict[str, Any] = {}, - ) -> Tuple[pd.DataFrame, dict]: + aggregate_kwargs: dict[str, Any] = {}, + format_kwargs: dict[str, Any] = {}, + ) -> tuple[pd.DataFrame, dict]: """Harvest from named dataframes. For standard resources (:attr:`harvest`. 
`harvest=False`), the columns @@ -1661,12 +1651,12 @@ class Package(Base): name: String title: String = None description: String = None - keywords: List[String] = [] + keywords: list[String] = [] homepage: HttpUrl = "https://catalyst.coop/pudl" created: Datetime = datetime.datetime.utcnow() - contributors: List[Contributor] = [] - sources: List[DataSource] = [] - licenses: List[License] = [] + contributors: list[Contributor] = [] + sources: list[DataSource] = [] + licenses: list[License] = [] resources: StrictList(Resource) @pydantic.validator("resources") @@ -1709,7 +1699,7 @@ def _populate_from_resources(cls, values): # noqa: N805 @lru_cache def from_resource_ids( # noqa: C901 cls, - resource_ids: Tuple[str] = tuple(sorted(RESOURCE_METADATA)), + resource_ids: tuple[str] = tuple(sorted(RESOURCE_METADATA)), resolve_foreign_keys: bool = False, ) -> "Package": """Construct a collection of Resources from PUDL identifiers (`resource.name`). @@ -1782,7 +1772,7 @@ class CodeMetadata(Base): Used to export to documentation. """ - encoder_list: List[Encoder] = [] + encoder_list: list[Encoder] = [] @classmethod def from_code_ids(cls, code_ids: Iterable[str]) -> "CodeMetadata": @@ -1817,9 +1807,9 @@ class DatasetteMetadata(Base): Used to create metadata YAML file to accompany Datasette. """ - data_sources: List[DataSource] - resources: List[Resource] = Package.from_resource_ids().resources - label_columns: Dict[str, str] = { + data_sources: list[DataSource] + resources: list[Resource] = Package.from_resource_ids().resources + label_columns: dict[str, str] = { "plants_entity_eia": "plant_name_eia", "plants_ferc1": "plant_name_ferc1", "plants_pudl": "plant_name_pudl", diff --git a/src/pudl/metadata/codes.py b/src/pudl/metadata/codes.py index 49cfcc5a85..76a1254c24 100644 --- a/src/pudl/metadata/codes.py +++ b/src/pudl/metadata/codes.py @@ -11,12 +11,12 @@ be set to NA. """ -from typing import Any, Dict +from typing import Any import numpy as np import pandas as pd -CODE_METADATA: Dict[str, Dict[str, Any]] = { +CODE_METADATA: dict[str, dict[str, Any]] = { "coalmine_types_eia": { "df": pd.DataFrame( columns=["code", "label", "description"], diff --git a/src/pudl/metadata/constants.py b/src/pudl/metadata/constants.py index a5828d54dc..33dd272b9f 100644 --- a/src/pudl/metadata/constants.py +++ b/src/pudl/metadata/constants.py @@ -1,12 +1,12 @@ """Metadata and operational constants.""" import datetime -from typing import Callable, Dict, List, Type +from collections.abc import Callable import pandas as pd import pyarrow as pa import sqlalchemy as sa -FIELD_DTYPES_PANDAS: Dict[str, str] = { +FIELD_DTYPES_PANDAS: dict[str, str] = { "string": "string", "number": "float64", "integer": "Int64", @@ -19,7 +19,7 @@ Pandas data type by PUDL field type (Data Package `field.type`). """ -FIELD_DTYPES_PYARROW: Dict[str, pa.lib.DataType] = { +FIELD_DTYPES_PYARROW: dict[str, pa.lib.DataType] = { "boolean": pa.bool_(), "date": pa.date32(), "datetime": pa.timestamp("ms", tz="UTC"), @@ -29,7 +29,7 @@ "year": pa.int32(), } -FIELD_DTYPES_SQL: Dict[str, sa.sql.visitors.VisitableType] = { +FIELD_DTYPES_SQL: dict[str, sa.sql.visitors.VisitableType] = { "boolean": sa.Boolean, "date": sa.Date, "datetime": sa.DateTime, @@ -42,7 +42,7 @@ SQLAlchemy column types by PUDL field type (Data Package `field.type`). """ -CONSTRAINT_DTYPES: Dict[str, Type] = { +CONSTRAINT_DTYPES: dict[str, type] = { "string": str, "integer": int, "year": int, @@ -55,7 +55,7 @@ Python types for field constraints by PUDL field type (Data Package `field.type`). 
""" -LICENSES: Dict[str, Dict[str, str]] = { +LICENSES: dict[str, dict[str, str]] = { "cc-by-4.0": { "name": "CC-BY-4.0", "title": "Creative Commons Attribution 4.0", @@ -71,7 +71,7 @@ License attributes. """ -PERIODS: Dict[str, Callable[[pd.Series], pd.Series]] = { +PERIODS: dict[str, Callable[[pd.Series], pd.Series]] = { "year": lambda x: x.astype("datetime64[Y]"), "quarter": lambda x: x.apply( pd.tseries.offsets.QuarterBegin(startingMonth=1).rollback @@ -83,7 +83,7 @@ Functions converting datetimes to period start times, by time period. """ -CONTRIBUTORS: Dict[str, Dict[str, str]] = { +CONTRIBUTORS: dict[str, dict[str, str]] = { "catalyst-cooperative": { "title": "Catalyst Cooperative", "email": "pudl@catalyst.coop", @@ -165,7 +165,7 @@ PUDL Contributors for attribution. """ -KEYWORDS: Dict[str, List[str]] = { +KEYWORDS: dict[str, list[str]] = { "electricity": [ "electricity", "electric", diff --git a/src/pudl/metadata/enums.py b/src/pudl/metadata/enums.py index 83815e5a2a..877c98b37b 100644 --- a/src/pudl/metadata/enums.py +++ b/src/pudl/metadata/enums.py @@ -1,7 +1,6 @@ """Enumerations of valid field values.""" -from typing import Dict, List -US_STATES: Dict[str, str] = { +US_STATES: dict[str, str] = { "AK": "Alaska", "AL": "Alabama", "AR": "Arkansas", @@ -55,7 +54,7 @@ } """Mapping of US state abbreviations to their full names.""" -US_TERRITORIES: Dict[str, str] = { +US_TERRITORIES: dict[str, str] = { "AS": "American Samoa", "DC": "District of Columbia", "GU": "Guam", @@ -65,9 +64,9 @@ } """Mapping of US territory abbreviations to their full names.""" -US_STATES_TERRITORIES: Dict[str, str] = {**US_STATES, **US_TERRITORIES} +US_STATES_TERRITORIES: dict[str, str] = {**US_STATES, **US_TERRITORIES} -EPACEMS_STATES: List[str] = [ +EPACEMS_STATES: list[str] = [ state for state in US_STATES_TERRITORIES # AK and PR have data but only a few years, and that breaks the Datastore. @@ -76,7 +75,7 @@ ] """The US states and territories that are present in the EPA CEMS dataset.""" -CANADA_PROVINCES_TERRITORIES: Dict[str, str] = { +CANADA_PROVINCES_TERRITORIES: dict[str, str] = { "AB": "Alberta", "BC": "British Columbia", "CN": "Canada", @@ -94,7 +93,7 @@ } """Mapping of Canadian province and territory abbreviations to their full names""" -NERC_REGIONS: List[str] = [ +NERC_REGIONS: list[str] = [ "BASN", # ASSESSMENT AREA Basin (WECC) "CALN", # ASSESSMENT AREA California (WECC) "CALS", # ASSESSMENT AREA California (WECC) @@ -142,7 +141,7 @@ See https://www.eia.gov/electricity/data/eia411/#tabs_NERC-3. """ -CUSTOMER_CLASSES: List[str] = [ +CUSTOMER_CLASSES: list[str] = [ "commercial", "industrial", "direct_connection", @@ -152,7 +151,7 @@ "transportation", ] -TECH_CLASSES: List[str] = [ +TECH_CLASSES: list[str] = [ "backup", # WHERE Is this used? 
because removed from DG table b/c not a real component "chp_cogen", "combustion_turbine", @@ -169,7 +168,7 @@ "wind", ] -REVENUE_CLASSES: List[str] = [ +REVENUE_CLASSES: list[str] = [ "credits_or_adjustments", "delivery_customers", "other", @@ -180,9 +179,9 @@ "unbundled", ] -RELIABILITY_STANDARDS: List[str] = ["ieee_standard", "other_standard"] +RELIABILITY_STANDARDS: list[str] = ["ieee_standard", "other_standard"] -FUEL_CLASSES: List[str] = [ +FUEL_CLASSES: list[str] = [ "gas", "oil", "other", @@ -192,7 +191,7 @@ "wood", ] -RTO_CLASSES: List[str] = [ +RTO_CLASSES: list[str] = [ "caiso", "ercot", "isone", @@ -203,7 +202,7 @@ "spp", ] -EPACEMS_MEASUREMENT_CODES: List[str] = [ +EPACEMS_MEASUREMENT_CODES: list[str] = [ "Calculated", "LME", "Measured", diff --git a/src/pudl/metadata/fields.py b/src/pudl/metadata/fields.py index 2d72d8de74..782774cba3 100644 --- a/src/pudl/metadata/fields.py +++ b/src/pudl/metadata/fields.py @@ -1,6 +1,6 @@ """Field metadata.""" from copy import deepcopy -from typing import Any, Dict, Optional +from typing import Any, Optional import pandas as pd from pytz import all_timezones @@ -27,7 +27,7 @@ ) from pudl.metadata.sources import SOURCES -FIELD_METADATA: Dict[str, Dict[str, Any]] = { +FIELD_METADATA: dict[str, dict[str, Any]] = { "active": { "type": "boolean", "description": "Indicates whether or not the dataset has been pulled into PUDL by the extract transform load process.", @@ -2016,7 +2016,7 @@ Keys are in alphabetical order. """ -FIELD_METADATA_BY_GROUP: Dict[str, Dict[str, Any]] = { +FIELD_METADATA_BY_GROUP: dict[str, dict[str, Any]] = { "epacems": { "state": {"constraints": {"enum": EPACEMS_STATES}}, "gross_load_mw": { @@ -2079,7 +2079,7 @@ override. Only those elements which should be overridden need to be specified. """ -FIELD_METADATA_BY_RESOURCE: Dict[str, Dict[str, Any]] = { +FIELD_METADATA_BY_RESOURCE: dict[str, dict[str, Any]] = { "sector_consolidated_eia": {"code": {"type": "integer"}}, "plants_steam_ferc1": { "plant_type": { @@ -2104,10 +2104,10 @@ def get_pudl_dtypes( group: Optional[str] = None, - field_meta: Optional[Dict[str, Any]] = FIELD_METADATA, - field_meta_by_group: Optional[Dict[str, Any]] = FIELD_METADATA_BY_GROUP, - dtype_map: Optional[Dict[str, Any]] = FIELD_DTYPES_PANDAS, -) -> Dict[str, Any]: + field_meta: Optional[dict[str, Any]] = FIELD_METADATA, + field_meta_by_group: Optional[dict[str, Any]] = FIELD_METADATA_BY_GROUP, + dtype_map: Optional[dict[str, Any]] = FIELD_DTYPES_PANDAS, +) -> dict[str, Any]: """Compile a dictionary of field dtypes, applying group overrides. Args: @@ -2137,8 +2137,8 @@ def get_pudl_dtypes( def apply_pudl_dtypes( df: pd.DataFrame, group: Optional[str] = None, - field_meta: Optional[Dict[str, Any]] = FIELD_METADATA, - field_meta_by_group: Optional[Dict[str, Any]] = FIELD_METADATA_BY_GROUP, + field_meta: Optional[dict[str, Any]] = FIELD_METADATA, + field_meta_by_group: Optional[dict[str, Any]] = FIELD_METADATA_BY_GROUP, ) -> pd.DataFrame: """Apply dtypes to those columns in a dataframe that have PUDL types defined. 
diff --git a/src/pudl/metadata/helpers.py b/src/pudl/metadata/helpers.py index e05dc689d6..fad60f4aab 100644 --- a/src/pudl/metadata/helpers.py +++ b/src/pudl/metadata/helpers.py @@ -1,6 +1,7 @@ """Functions for manipulating metadata constants.""" from collections import defaultdict -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union +from collections.abc import Callable, Iterable +from typing import Any, Optional, Union import numpy as np import pandas as pd @@ -44,7 +45,7 @@ def format_errors(*errors: str, title: str = None, pydantic: bool = False) -> st # --- Foreign keys --- # -def _parse_field_names(fields: List[Union[str, dict]]) -> List[str]: +def _parse_field_names(fields: list[Union[str, dict]]) -> list[str]: """Parse field names. Args: @@ -56,7 +57,7 @@ def _parse_field_names(fields: List[Union[str, dict]]) -> List[str]: return [field if isinstance(field, str) else field["name"] for field in fields] -def _parse_foreign_key_rule(rule: dict, name: str, key: List[str]) -> List[dict]: +def _parse_foreign_key_rule(rule: dict, name: str, key: list[str]) -> list[dict]: """Parse foreign key rule from resource descriptor. Args: @@ -86,8 +87,8 @@ def _parse_foreign_key_rule(rule: dict, name: str, key: List[str]) -> List[dict] def _build_foreign_key_tree( - resources: Dict[str, dict] -) -> Dict[str, Dict[Tuple[str, ...], dict]]: + resources: dict[str, dict] +) -> dict[str, dict[tuple[str, ...], dict]]: """Build foreign key tree. Args: @@ -125,8 +126,8 @@ def _build_foreign_key_tree( def _traverse_foreign_key_tree( - tree: Dict[str, Dict[Tuple[str, ...], dict]], name: str, fields: Tuple[str, ...] -) -> List[Dict[str, Any]]: + tree: dict[str, dict[tuple[str, ...], dict]], name: str, fields: tuple[str, ...] +) -> list[dict[str, Any]]: """Traverse foreign key tree. Args: @@ -159,9 +160,9 @@ def _traverse_foreign_key_tree( def build_foreign_keys( - resources: Dict[str, dict], + resources: dict[str, dict], prune: bool = True, -) -> Dict[str, List[dict]]: +) -> dict[str, list[dict]]: """Build foreign keys for each resource. A resource's `foreign_key_rules` (if present) determines which other resources will @@ -228,7 +229,7 @@ def build_foreign_keys( # --- Harvest --- # -def split_period(name: str) -> Tuple[str, Optional[str]]: +def split_period(name: str) -> tuple[str, Optional[str]]: """Split the time period from a column name. Args: @@ -251,7 +252,7 @@ def split_period(name: str) -> Tuple[str, Optional[str]]: return parts[0], parts[1] -def expand_periodic_column_names(names: Iterable[str]) -> List[str]: +def expand_periodic_column_names(names: Iterable[str]) -> list[str]: """Add smaller periods to a list of column names. Args: @@ -330,7 +331,7 @@ def unique(x: pd.Series) -> Any: raise AggregationError("Not unique.") -def as_dict(x: pd.Series) -> Dict[Any, list]: +def as_dict(x: pd.Series) -> dict[Any, list]: """Return dictionary of values, listed by index.""" result = {} x = x.dropna() @@ -414,10 +415,10 @@ def wrapped(x): def groupby_apply( # noqa: C901 df: pd.DataFrame, by: Iterable, - aggfuncs: Dict[Any, Callable], + aggfuncs: dict[Any, Callable], raised: bool = True, error: Callable = None, -) -> Tuple[pd.DataFrame, Dict[Any, pd.Series]]: +) -> tuple[pd.DataFrame, dict[Any, pd.Series]]: """Aggregate dataframe and capture errors (using apply). 
Args: @@ -500,10 +501,10 @@ def wrapper(x): def groupby_aggregate( # noqa: C901 df: pd.DataFrame, by: Iterable, - aggfuncs: Dict[Any, Callable], + aggfuncs: dict[Any, Callable], raised: bool = True, error: Callable = None, -) -> Tuple[pd.DataFrame, Dict[Any, pd.Series]]: +) -> tuple[pd.DataFrame, dict[Any, pd.Series]]: """Aggregate dataframe and capture errors (using aggregate). Although faster than :func:`groupby_apply`, it has some limitations: diff --git a/src/pudl/metadata/labels.py b/src/pudl/metadata/labels.py index bd855df306..ef6693f1fd 100644 --- a/src/pudl/metadata/labels.py +++ b/src/pudl/metadata/labels.py @@ -1,19 +1,18 @@ """Descriptive labels for coded field values.""" -from typing import Dict -ESTIMATED_OR_ACTUAL: Dict[str, str] = {"E": "estimated", "A": "actual"} +ESTIMATED_OR_ACTUAL: dict[str, str] = {"E": "estimated", "A": "actual"} """ Descriptive labels for EIA estimated or actual codes. """ -MOMENTARY_INTERRUPTIONS: Dict[str, str] = { +MOMENTARY_INTERRUPTIONS: dict[str, str] = { "L": "less_than_1_minute", "F": "less_than_5_minutes", "O": "other", } """Descriptive labels for EIA momentary interruption codes.""" -POWER_PURCHASE_TYPES_FERC1: Dict[str, str] = { +POWER_PURCHASE_TYPES_FERC1: dict[str, str] = { "RQ": "requirement", "LF": "long_firm", "IF": "intermediate_firm", @@ -26,7 +25,7 @@ } """Descriptive labels for FERC 1 power purchase type codes.""" -COALMINE_TYPES_EIA: Dict[str, str] = { +COALMINE_TYPES_EIA: dict[str, str] = { "P": "preparation_plant", "S": "surface", "U": "underground", @@ -39,7 +38,7 @@ These codes and descriptions come from Page 7 of the EIA 923. """ -CENSUS_REGIONS: Dict[str, str] = { +CENSUS_REGIONS: dict[str, str] = { "NEW": "New England", "MAT": "Middle Atlantic", "SAT": "South Atlantic", @@ -57,7 +56,7 @@ Not currently being used. """ -RTO_ISO: Dict[str, str] = { +RTO_ISO: dict[str, str] = { "CAISO": "California ISO", "ERCOT": "Electric Reliability Council of Texas", "ISONE": "ISO New England", @@ -72,7 +71,7 @@ Not currently being used. """ -FUEL_UNITS_EIA: Dict[str, str] = { +FUEL_UNITS_EIA: dict[str, str] = { "mcf": "Thousands of cubic feet (for gases)", "short_tons": "Short tons (for solids)", "barrels": "Barrels (for liquids)", diff --git a/src/pudl/metadata/resources/__init__.py b/src/pudl/metadata/resources/__init__.py index e1fd65881e..2547c2a64b 100644 --- a/src/pudl/metadata/resources/__init__.py +++ b/src/pudl/metadata/resources/__init__.py @@ -2,7 +2,6 @@ import importlib import pkgutil -from typing import Dict, List from pudl.metadata.helpers import build_foreign_keys @@ -14,14 +13,14 @@ resources = module.RESOURCE_METADATA RESOURCE_METADATA.update(resources) -FOREIGN_KEYS: Dict[str, List[dict]] = build_foreign_keys(RESOURCE_METADATA) +FOREIGN_KEYS: dict[str, list[dict]] = build_foreign_keys(RESOURCE_METADATA) """ Generated foreign key constraints by resource name. See :func:`pudl.metadata.helpers.build_foreign_keys`. 
""" -ENTITIES: Dict[str, Dict[str, List[str]]] = { +ENTITIES: dict[str, dict[str, list[str]]] = { "plants": { "id_cols": ["plant_id_eia"], "static_cols": [ diff --git a/src/pudl/metadata/resources/eia.py b/src/pudl/metadata/resources/eia.py index 7aae52128e..e03cdfd7eb 100644 --- a/src/pudl/metadata/resources/eia.py +++ b/src/pudl/metadata/resources/eia.py @@ -1,9 +1,9 @@ """Definitions of data tables primarily coming from EIA 860/861/923.""" -from typing import Any, Dict +from typing import Any from pudl.metadata.codes import CODE_METADATA -RESOURCE_METADATA: Dict[str, Dict[str, Any]] = { +RESOURCE_METADATA: dict[str, dict[str, Any]] = { "boilers_entity_eia": { "description": "Static boiler attributes compiled from the EIA-860 and EIA-923 data.", "schema": { diff --git a/src/pudl/metadata/resources/eia860.py b/src/pudl/metadata/resources/eia860.py index 36f337a38b..5e42005685 100644 --- a/src/pudl/metadata/resources/eia860.py +++ b/src/pudl/metadata/resources/eia860.py @@ -1,7 +1,7 @@ """Definitions of data tables primarily coming from EIA-860.""" -from typing import Any, Dict +from typing import Any -RESOURCE_METADATA: Dict[str, Dict[str, Any]] = { +RESOURCE_METADATA: dict[str, dict[str, Any]] = { "boiler_generator_assn_eia860": { "description": "Associations between boilers and generators as reported in EIA-860 Schedule 6, Part A. Augmented with various heuristics within PUDL.", "schema": { diff --git a/src/pudl/metadata/resources/eia861.py b/src/pudl/metadata/resources/eia861.py index d485ec13ac..3027036c52 100644 --- a/src/pudl/metadata/resources/eia861.py +++ b/src/pudl/metadata/resources/eia861.py @@ -1,7 +1,7 @@ """Definitions of data tables primarily coming from EIA-861.""" -from typing import Any, Dict, List +from typing import Any -RESOURCE_METADATA: Dict[str, Dict[str, Any]] = { +RESOURCE_METADATA: dict[str, dict[str, Any]] = { "advanced_metering_infrastructure_eia861": { "description": "The data contain number of meters from automated meter readings (AMR) and advanced metering infrastructure (AMI) by state, sector, and balancing authority. The energy served (in megawatthours) for AMI systems is provided. Form EIA-861 respondents also report the number of standard meters (non AMR/AMI) in their system. Historical Changes: We started collecting the number of standard meters in 2013. The monthly survey collected these data from January 2011 to January 2017.", "schema": { @@ -635,7 +635,7 @@ # Association tables that are always generated from the other tables: # - 'balancing_authority_assn_eia861', # - 'utility_assn_eia861', -TABLE_DEPENDENCIES: Dict[str, List[str]] = { +TABLE_DEPENDENCIES: dict[str, list[str]] = { "advanced_metering_infrastructure_eia861": [ "advanced_metering_infrastructure_eia861" ], diff --git a/src/pudl/metadata/resources/eia923.py b/src/pudl/metadata/resources/eia923.py index c45c8929cc..0f2e6beca9 100644 --- a/src/pudl/metadata/resources/eia923.py +++ b/src/pudl/metadata/resources/eia923.py @@ -1,7 +1,7 @@ """Definitions of data tables primarily coming from EIA-923.""" -from typing import Any, Dict +from typing import Any -RESOURCE_METADATA: Dict[str, Dict[str, Any]] = { +RESOURCE_METADATA: dict[str, dict[str, Any]] = { "boiler_fuel_eia923": { "description": "EIA-923 Monthly Boiler Fuel Consumption and Emissions Time Series. 
From EIA-923 Schedule 3.", "schema": { diff --git a/src/pudl/metadata/resources/epacems.py b/src/pudl/metadata/resources/epacems.py index 637ed8671c..1c7105dfec 100644 --- a/src/pudl/metadata/resources/epacems.py +++ b/src/pudl/metadata/resources/epacems.py @@ -1,7 +1,7 @@ """Table definitions for the EPA CEMS data group.""" -from typing import Any, Dict +from typing import Any -RESOURCE_METADATA: Dict[str, Dict[str, Any]] = { +RESOURCE_METADATA: dict[str, dict[str, Any]] = { "hourly_emissions_epacems": { "description": "Hourly emissions and plant operational data reported via Continuous Emissions Monitoring Systems as required by 40 CFR Part 75.", "schema": { diff --git a/src/pudl/metadata/resources/ferc1.py b/src/pudl/metadata/resources/ferc1.py index 41dab742e8..78cf16b7e4 100644 --- a/src/pudl/metadata/resources/ferc1.py +++ b/src/pudl/metadata/resources/ferc1.py @@ -1,9 +1,9 @@ """Table definitions for the FERC Form 1 data group.""" -from typing import Any, Dict +from typing import Any from pudl.metadata.codes import CODE_METADATA -RESOURCE_METADATA: Dict[str, Dict[str, Any]] = { +RESOURCE_METADATA: dict[str, dict[str, Any]] = { "accumulated_depreciation_ferc1": { "description": "Balances and changes to FERC Accumulated Provision for Depreciation.", "schema": { diff --git a/src/pudl/metadata/resources/ferc714.py b/src/pudl/metadata/resources/ferc714.py index fabddb5e29..5ab542fa21 100644 --- a/src/pudl/metadata/resources/ferc714.py +++ b/src/pudl/metadata/resources/ferc714.py @@ -1,8 +1,8 @@ """Tables definitions for data coming from the FERC Form 714.""" -from typing import Any, Dict +from typing import Any -RESOURCE_METADATA: Dict[str, Dict[str, Any]] = { +RESOURCE_METADATA: dict[str, dict[str, Any]] = { "respondent_id_ferc714": { "description": "Respondent identification. FERC Form 714, Part I, Schedule 1.", "schema": { diff --git a/src/pudl/metadata/resources/glue.py b/src/pudl/metadata/resources/glue.py index 6ae655aa14..8d6eb47861 100644 --- a/src/pudl/metadata/resources/glue.py +++ b/src/pudl/metadata/resources/glue.py @@ -1,7 +1,7 @@ """Definitions for the glue/crosswalk tables that connect data groups.""" -from typing import Any, Dict +from typing import Any -RESOURCE_METADATA: Dict[str, Dict[str, Any]] = { +RESOURCE_METADATA: dict[str, dict[str, Any]] = { "assn_gen_eia_unit_epa": { "schema": { "fields": [ diff --git a/src/pudl/metadata/resources/pudl.py b/src/pudl/metadata/resources/pudl.py index 15fb7f897a..17ea412764 100644 --- a/src/pudl/metadata/resources/pudl.py +++ b/src/pudl/metadata/resources/pudl.py @@ -1,7 +1,7 @@ """Definitions for the glue/crosswalk tables that connect data groups.""" -from typing import Any, Dict +from typing import Any -RESOURCE_METADATA: Dict[str, Dict[str, Any]] = { +RESOURCE_METADATA: dict[str, dict[str, Any]] = { "plants_pudl": { "title": "PUDL Plants", "description": "Home table for PUDL assigned plant IDs. These IDs are manually generated each year when new FERC and EIA reporting is integrated, and any newly identified plants are added to the list with a new ID. Each ID maps to a power plant which is reported in at least one FERC or EIA data set. 
This table is read in from a spreadsheet stored in the PUDL repository: src/pudl/package_data/glue/pudl_id_mapping.xlsx", diff --git a/src/pudl/metadata/sources.py b/src/pudl/metadata/sources.py index ee7ade8151..55a397d680 100644 --- a/src/pudl/metadata/sources.py +++ b/src/pudl/metadata/sources.py @@ -1,10 +1,10 @@ """Metadata and operational constants.""" -from typing import Any, Dict +from typing import Any from pudl.metadata.constants import CONTRIBUTORS, KEYWORDS, LICENSES from pudl.metadata.enums import EPACEMS_STATES -SOURCES: Dict[str, Any] = { +SOURCES: dict[str, Any] = { "censusdp1tract": { "title": "Census DP1", "path": "https://www.census.gov/geographies/mapping-files/2010/geo/tiger-data.html", diff --git a/src/pudl/output/epacems.py b/src/pudl/output/epacems.py index b585ad5991..ab5c5e68e4 100644 --- a/src/pudl/output/epacems.py +++ b/src/pudl/output/epacems.py @@ -1,8 +1,8 @@ """Routines that provide user-friendly access to the partitioned EPA CEMS dataset.""" - +from collections.abc import Iterable, Sequence from itertools import product from pathlib import Path -from typing import Iterable, List, Optional, Sequence, Tuple, Union +from typing import Optional, Union import dask.dataframe as dd import pandas as pd @@ -31,7 +31,7 @@ def epa_crosswalk() -> pd.DataFrame: def year_state_filter( years: Iterable[int] = None, states: Iterable[str] = None -) -> List[List[Tuple[Union[str, int]]]]: +) -> list[list[tuple[Union[str, int]]]]: """Create filters to read given years and states from partitioned parquet dataset. A subset of an Apache Parquet dataset can be read in more efficiently if files which diff --git a/src/pudl/output/ferc714.py b/src/pudl/output/ferc714.py index 995c135c6f..918f934e15 100644 --- a/src/pudl/output/ferc714.py +++ b/src/pudl/output/ferc714.py @@ -1,6 +1,6 @@ """Functions & classes for compiling derived aspects of the FERC Form 714 data.""" from functools import cached_property -from typing import Any, Dict, List +from typing import Any import numpy as np import pandas as pd @@ -8,7 +8,7 @@ import pudl from pudl.metadata.fields import apply_pudl_dtypes -ASSOCIATIONS: List[Dict[str, Any]] = [ +ASSOCIATIONS: list[dict[str, Any]] = [ # MISO: Midwest Indep System Operator {"id": 56669, "from": 2011, "to": [2009, 2010]}, # SWPP: Southwest Power Pool @@ -47,7 +47,7 @@ Rows are excluded from `balancing_authority_assn_eia861` with target year and state. """ -UTILITIES: List[Dict[str, Any]] = [ +UTILITIES: list[dict[str, Any]] = [ # (no code): Pacific Gas & Electric Co {"id": 14328, "reassign": True}, # (no code): San Diego Gas & Electric Co diff --git a/src/pudl/settings.py b/src/pudl/settings.py index b6c453efc2..47ee929451 100644 --- a/src/pudl/settings.py +++ b/src/pudl/settings.py @@ -1,6 +1,6 @@ """Module for validating pudl etl settings.""" import pathlib -from typing import ClassVar, List +from typing import ClassVar import pandas as pd import yaml @@ -31,7 +31,7 @@ class GenericDatasetSettings(BaseModel): A dataset can have an arbitrary number of partitions. 
""" - tables: List[str] + tables: list[str] @root_validator def validate_partitions(cls, partitions): # noqa: N805 @@ -77,8 +77,8 @@ class Ferc1Settings(GenericDatasetSettings): data_source: ClassVar[DataSource] = DataSource.from_id("ferc1") - years: List[int] = data_source.working_partitions["years"] - tables: List[str] = data_source.get_resource_ids() + years: list[int] = data_source.working_partitions["years"] + tables: list[str] = data_source.get_resource_ids() class Ferc714Settings(GenericDatasetSettings): @@ -91,7 +91,7 @@ class Ferc714Settings(GenericDatasetSettings): data_source: ClassVar[DataSource] = DataSource.from_id("ferc714") - tables: List[str] = data_source.get_resource_ids() + tables: list[str] = data_source.get_resource_ids() class EpaCemsSettings(GenericDatasetSettings): @@ -108,9 +108,9 @@ class EpaCemsSettings(GenericDatasetSettings): data_source: ClassVar[DataSource] = DataSource.from_id("epacems") - years: List[int] = data_source.working_partitions["years"] - states: List[str] = data_source.working_partitions["states"] - tables: List[str] = data_source.get_resource_ids() + years: list[int] = data_source.working_partitions["years"] + states: list[str] = data_source.working_partitions["states"] + tables: list[str] = data_source.get_resource_ids() partition: bool = False @validator("states") @@ -132,8 +132,8 @@ class Eia923Settings(GenericDatasetSettings): data_source: ClassVar[DataSource] = DataSource.from_id("eia923") - years: List[int] = data_source.working_partitions["years"] - tables: List[str] = data_source.get_resource_ids() + years: list[int] = data_source.working_partitions["years"] + tables: list[str] = data_source.get_resource_ids() class Eia861Settings(GenericDatasetSettings): @@ -148,9 +148,9 @@ class Eia861Settings(GenericDatasetSettings): data_source: ClassVar[DataSource] = DataSource.from_id("eia861") - years: List[int] = data_source.working_partitions["years"] - tables: List[str] = data_source.get_resource_ids() - transform_functions: List[str] + years: list[int] = data_source.working_partitions["years"] + tables: list[str] = data_source.get_resource_ids() + transform_functions: list[str] @root_validator(pre=True) def generate_transform_functions(cls, values): # noqa: N805 @@ -200,8 +200,8 @@ class Eia860Settings(GenericDatasetSettings): eia860m_data_source: ClassVar[DataSource] = DataSource.from_id("eia860m") eia860m_date: ClassVar[str] = eia860m_data_source.working_partitions["year_month"] - years: List[int] = data_source.working_partitions["years"] - tables: List[str] = data_source.get_resource_ids() + years: list[int] = data_source.working_partitions["years"] + tables: list[str] = data_source.get_resource_ids() eia860m: bool = True @validator("eia860m") @@ -358,8 +358,8 @@ class Ferc1ToSqliteSettings(GenericDatasetSettings): """ data_source: ClassVar[DataSource] = DataSource.from_id("ferc1") - years: List[int] = data_source.working_partitions["years"] - tables: List[str] = sorted(list(DBF_TABLES_FILENAMES.keys())) + years: list[int] = data_source.working_partitions["years"] + tables: list[str] = sorted(list(DBF_TABLES_FILENAMES.keys())) refyear: ClassVar[int] = max(years) bad_cols: tuple = () diff --git a/src/pudl/transform/eia.py b/src/pudl/transform/eia.py index 8e14b92833..1fd1043ef1 100644 --- a/src/pudl/transform/eia.py +++ b/src/pudl/transform/eia.py @@ -19,7 +19,6 @@ import importlib.resources import logging -from typing import Dict import networkx as nx import numpy as np @@ -37,7 +36,7 @@ TZ_FINDER = timezonefinder.TimezoneFinder() """A 
global TimezoneFinder to cache geographies in memory for faster access.""" -APPROXIMATE_TIMEZONES: Dict[str, str] = { +APPROXIMATE_TIMEZONES: dict[str, str] = { "AK": "America/Anchorage", # Alaska "AL": "America/Chicago", # Alabama "AR": "America/Chicago", # Arkansas @@ -443,8 +442,8 @@ def _manage_strictness(col, eia860m): def harvesting( # noqa: C901 entity: str, - eia_transformed_dfs: Dict[str, pd.DataFrame], - entities_dfs: Dict[str, pd.DataFrame], + eia_transformed_dfs: dict[str, pd.DataFrame], + entities_dfs: dict[str, pd.DataFrame], eia860m: bool = False, debug: bool = False, ) -> tuple: diff --git a/src/pudl/transform/eia861.py b/src/pudl/transform/eia861.py index 26db902884..c9b6d7f519 100644 --- a/src/pudl/transform/eia861.py +++ b/src/pudl/transform/eia861.py @@ -6,7 +6,6 @@ """ import logging -from typing import Dict import pandas as pd @@ -408,7 +407,7 @@ ], ) -NERC_SPELLCHECK: Dict[str, str] = { +NERC_SPELLCHECK: dict[str, str] = { "GUSTAVUSAK": "ASCC", "AK": "ASCC", "HI": "HICC", diff --git a/src/pudl/transform/eia923.py b/src/pudl/transform/eia923.py index 323874b423..d813eb2758 100644 --- a/src/pudl/transform/eia923.py +++ b/src/pudl/transform/eia923.py @@ -1,6 +1,5 @@ """Module to perform data cleaning functions on EIA923 data tables.""" import logging -from typing import Dict import numpy as np import pandas as pd @@ -11,7 +10,7 @@ logger = logging.getLogger(__name__) -COALMINE_COUNTRY_CODES: Dict[str, str] = { +COALMINE_COUNTRY_CODES: dict[str, str] = { "AU": "AUS", # Australia "CL": "COL", # Colombia "CN": "CAN", # Canada @@ -42,7 +41,7 @@ ############################################################################### -def _get_plant_nuclear_unit_id_map(nuc_fuel: pd.DataFrame) -> Dict[int, str]: +def _get_plant_nuclear_unit_id_map(nuc_fuel: pd.DataFrame) -> dict[int, str]: """Get a plant_id -> nuclear_unit_id mapping for all plants with one nuclear unit. Args: @@ -115,7 +114,7 @@ def _backfill_nuclear_unit_id(nuc_fuel: pd.DataFrame) -> pd.DataFrame: return nuc_fuel -def _get_plant_prime_mover_map(gen_fuel: pd.DataFrame) -> Dict[int, str]: +def _get_plant_prime_mover_map(gen_fuel: pd.DataFrame) -> dict[int, str]: """Get a plant_id -> prime_mover_code mapping for all plants with one prime mover. Args: @@ -198,7 +197,7 @@ def _backfill_prime_mover_code(gen_fuel: pd.DataFrame) -> pd.DataFrame: return gen_fuel -def _get_most_frequent_energy_source_map(gen_fuel: pd.DataFrame) -> Dict[str, str]: +def _get_most_frequent_energy_source_map(gen_fuel: pd.DataFrame) -> dict[str, str]: """Get the a mapping of the most common energy_source for each fuel_type_code_aer. Args: diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index 06183d90f0..2499461fcb 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -11,7 +11,6 @@ import logging import re from difflib import SequenceMatcher -from typing import Dict, List # NetworkX is used to knit incomplete ferc plant time series together. import networkx as nx @@ -37,7 +36,7 @@ ############################################################################## # Dicts for categorizing freeform strings #################################### ############################################################################## -FUEL_STRINGS: Dict[str, List[str]] = { +FUEL_STRINGS: dict[str, list[str]] = { "coal": [ "coal", "coal-subbit", @@ -319,7 +318,7 @@ a lower case in the data set. 
""" -FUEL_UNIT_STRINGS: Dict[str, List[str]] = { +FUEL_UNIT_STRINGS: dict[str, list[str]] = { "ton": [ "toms", "taons", @@ -631,7 +630,7 @@ fuel units (values) """ -PLANT_KIND_STRINGS: Dict[str, List[str]] = { +PLANT_KIND_STRINGS: dict[str, list[str]] = { "steam": [ "coal", "steam", @@ -1022,7 +1021,7 @@ research of the plants on the Internet. """ -CONSTRUCTION_TYPE_STRINGS: Dict[str, List[str]] = { +CONSTRUCTION_TYPE_STRINGS: dict[str, list[str]] = { "outdoor": [ "outdoor", "outdoor boiler", diff --git a/src/pudl/workspace/datastore.py b/src/pudl/workspace/datastore.py index ec71033dc5..540b6831c4 100644 --- a/src/pudl/workspace/datastore.py +++ b/src/pudl/workspace/datastore.py @@ -1,5 +1,4 @@ """Datastore manages file retrieval for PUDL datasets.""" - import argparse import hashlib import io @@ -9,8 +8,9 @@ import sys import zipfile from collections import defaultdict +from collections.abc import Iterator from pathlib import Path -from typing import Any, Dict, Iterator, List, Optional, Set, Tuple +from typing import Any, Optional import coloredlogs import datapackage @@ -108,9 +108,9 @@ def get_resources( dataset=self.dataset, doi=self.doi, name=res["name"] ) - def get_partitions(self, name: str = None) -> Dict[str, Set[str]]: + def get_partitions(self, name: str = None) -> dict[str, set[str]]: """Returns mapping of all known partition keys to the set of its known values.""" - partitions: Dict[str, Set[str]] = defaultdict(set) + partitions: dict[str, set[str]] = defaultdict(set) for res in self.datapackage_json["resources"]: if name and res["name"] != name: continue @@ -183,7 +183,7 @@ def __init__(self, sandbox: bool = False, timeout: float = 15.0): self._api_root = self.API_ROOT[backend] self._token = self.TOKEN[backend] self._dataset_to_doi = self.DOI[backend] - self._descriptor_cache: Dict[str, DatapackageDescriptor] = {} + self._descriptor_cache: dict[str, DatapackageDescriptor] = {} self.timeout = timeout retries = Retry( @@ -251,7 +251,7 @@ def get_resource(self, res: PudlResourceKey) -> bytes: desc.validate_checksum(res.name, content) return content - def get_known_datasets(self) -> List[str]: + def get_known_datasets(self) -> list[str]: """Returns list of supported datasets.""" return sorted(self._dataset_to_doi) @@ -283,7 +283,7 @@ def __init__( """ self._cache = resource_cache.LayeredCache() - self._datapackage_descriptors: Dict[str, DatapackageDescriptor] = {} + self._datapackage_descriptors: dict[str, DatapackageDescriptor] = {} if local_cache_path: self._cache.add_cache_layer(resource_cache.LocalFileCache(local_cache_path)) @@ -294,7 +294,7 @@ def __init__( self._zenodo_fetcher = ZenodoFetcher(sandbox=sandbox, timeout=timeout) - def get_known_datasets(self) -> List[str]: + def get_known_datasets(self) -> list[str]: """Returns list of supported datasets.""" return self._zenodo_fetcher.get_known_datasets() @@ -321,7 +321,7 @@ def get_resources( cached_only: bool = False, skip_optimally_cached: bool = False, **filters: Any, - ) -> Iterator[Tuple[PudlResourceKey, bytes]]: + ) -> Iterator[tuple[PudlResourceKey, bytes]]: """Return content of the matching resources. 
Args: @@ -482,7 +482,7 @@ def _create_datastore(args: dict) -> Datastore: ) -def print_partitions(dstore: Datastore, datasets: List[str]) -> None: +def print_partitions(dstore: Datastore, datasets: list[str]) -> None: """Prints known partition keys and its values for each of the datasets.""" for single_ds in datasets: parts = dstore.get_datapackage_descriptor(single_ds).get_partitions() @@ -495,7 +495,7 @@ def print_partitions(dstore: Datastore, datasets: List[str]) -> None: def validate_cache( - dstore: Datastore, datasets: List[str], args: argparse.Namespace + dstore: Datastore, datasets: list[str], args: argparse.Namespace ) -> None: """Validate elements in the datastore cache. Delete invalid entires from cache.""" for single_ds in datasets: @@ -520,7 +520,7 @@ def validate_cache( def fetch_resources( - dstore: Datastore, datasets: List[str], args: argparse.Namespace + dstore: Datastore, datasets: list[str], args: argparse.Namespace ) -> None: """Retrieve all matching resources and store them in the cache.""" for single_ds in datasets: diff --git a/src/pudl/workspace/resource_cache.py b/src/pudl/workspace/resource_cache.py index ff8dd55e74..fafbe2323a 100644 --- a/src/pudl/workspace/resource_cache.py +++ b/src/pudl/workspace/resource_cache.py @@ -3,7 +3,7 @@ import logging from abc import ABC, abstractmethod from pathlib import Path -from typing import Any, List, NamedTuple +from typing import Any, NamedTuple from urllib.parse import urlparse from google.cloud import storage @@ -147,7 +147,7 @@ class LayeredCache(AbstractCache): layers are read-only (get). """ - def __init__(self, *caches: List[AbstractCache], **kwargs: Any): + def __init__(self, *caches: list[AbstractCache], **kwargs: Any): """Creates layered cache consisting of given cache layers. Args: @@ -155,7 +155,7 @@ def __init__(self, *caches: List[AbstractCache], **kwargs: Any): of decreasing priority. """ super().__init__(**kwargs) - self._caches: List[AbstractCache] = list(caches) + self._caches: list[AbstractCache] = list(caches) def add_cache_layer(self, cache: AbstractCache): """Adds caching layer. The priority is below all other.""" diff --git a/test/unit/analysis/epa_crosswalk_test.py b/test/unit/analysis/epa_crosswalk_test.py index 2aa069ab58..7f6107457e 100644 --- a/test/unit/analysis/epa_crosswalk_test.py +++ b/test/unit/analysis/epa_crosswalk_test.py @@ -1,5 +1,5 @@ """Unit tests for the :mod:`pudl.analysis.epa_crosswalk` module.""" -from typing import Dict, Sequence +from collections.abc import Sequence import dask.dataframe as dd import pandas as pd @@ -9,7 +9,7 @@ import pudl.analysis.epa_crosswalk as cw -def df_from_product(inputs: Dict[str, Sequence], as_index=True) -> pd.DataFrame: +def df_from_product(inputs: dict[str, Sequence], as_index=True) -> pd.DataFrame: """Make a dataframe from cartesian product of input sequences. 
Args: diff --git a/test/unit/analysis/state_demand_test.py b/test/unit/analysis/state_demand_test.py index 769911d01b..d238eb5547 100644 --- a/test/unit/analysis/state_demand_test.py +++ b/test/unit/analysis/state_demand_test.py @@ -1,6 +1,6 @@ """Tests for timeseries anomalies detection and imputation.""" -from typing import Dict, Union +from typing import Union import numpy as np import pandas as pd @@ -34,7 +34,7 @@ ], ) def test_lookup_state( - state: Union[str, int], expected: Dict[str, Union[str, int]] + state: Union[str, int], expected: dict[str, Union[str, int]] ) -> None: """Check that various kinds of state lookups work.""" assert lookup_state(state) == expected diff --git a/test/unit/analysis/timeseries_cleaning_test.py b/test/unit/analysis/timeseries_cleaning_test.py index 90e2c1ba3f..430ee85b18 100644 --- a/test/unit/analysis/timeseries_cleaning_test.py +++ b/test/unit/analysis/timeseries_cleaning_test.py @@ -1,5 +1,4 @@ """Tests for timeseries anomalies detection and imputation.""" -from typing import Tuple import numpy as np import pytest @@ -11,9 +10,9 @@ def simulate_series( n: int = 10, periods: int = 20, frequency: int = 24, - amplitude_range: Tuple[float, float] = (0.0, 1.0), - offset_range: Tuple[float, float] = (1.0, 2.0), - shift_range: Tuple[int, int] = (-3, 3), + amplitude_range: tuple[float, float] = (0.0, 1.0), + offset_range: tuple[float, float] = (1.0, 2.0), + shift_range: tuple[int, int] = (-3, 3), seed=None, ) -> np.ndarray: """Generate synthetic multivariate series from sin functions. @@ -47,7 +46,7 @@ def simulate_anomalies( n: int = 100, sigma: float = 1, seed=None, -) -> Tuple[np.ndarray, np.ndarray]: +) -> tuple[np.ndarray, np.ndarray]: """Simulate anomalies in series. Args: diff --git a/test/unit/harvest_test.py b/test/unit/harvest_test.py index 80237e234f..2222df978f 100644 --- a/test/unit/harvest_test.py +++ b/test/unit/harvest_test.py @@ -1,5 +1,5 @@ """Tests for Resource harvesting methods.""" -from typing import Any, Dict, List +from typing import Any import numpy as np import pandas as pd @@ -23,7 +23,7 @@ def _assert_frame_equal(a: pd.DataFrame, b: pd.DataFrame, **kwargs: Any) -> None # ---- Unit tests ---- # -STANDARD: Dict[str, Any] = { +STANDARD: dict[str, Any] = { "name": "r", "harvest": {"harvest": False}, "schema": { @@ -37,7 +37,7 @@ def _assert_frame_equal(a: pd.DataFrame, b: pd.DataFrame, **kwargs: Any) -> None }, } -HARVEST: Dict[str, Any] = {**STANDARD, "harvest": {"harvest": True}} +HARVEST: dict[str, Any] = {**STANDARD, "harvest": {"harvest": True}} def test_resource_ignores_input_with_different_name() -> None: @@ -83,7 +83,7 @@ def test_resource_harvests_input_with_only_key_fields() -> None: }, ], ) -def test_resource_harvests_inputs(dfs: Dict[Any, pd.DataFrame]) -> None: +def test_resource_harvests_inputs(dfs: dict[Any, pd.DataFrame]) -> None: """Resource harvests inputs.""" resource = Resource(**HARVEST) expected = ( @@ -111,7 +111,7 @@ def test_resource_with_only_key_fields_harvests() -> None: # ---- EIA example ---- # -INPUT_DFS: Dict[str, pd.DataFrame] = dict( +INPUT_DFS: dict[str, pd.DataFrame] = dict( service_territory_eia861=pd.DataFrame( columns=[ "utility_id_eia", @@ -238,7 +238,7 @@ def test_resource_with_only_key_fields_harvests() -> None: ), ) -FIELD_DTYPES: Dict[str, str] = { +FIELD_DTYPES: dict[str, str] = { "balancing_authority_code_eia": "string", "utility_id_eia": "integer", "plant_id_eia": "integer", @@ -256,7 +256,7 @@ def test_resource_with_only_key_fields_harvests() -> None: "net_generation_mwh": "number", } 
-RESOURCES: List[Dict[str, Any]] = [ +RESOURCES: list[dict[str, Any]] = [ { "name": "plant_entity_eia860", "harvest": {"harvest": True}, @@ -340,7 +340,7 @@ def test_resource_with_only_key_fields_harvests() -> None: ] RESOURCES[i] = Resource(**d) -EXPECTED_DFS: Dict[str, pd.DataFrame] = dict( +EXPECTED_DFS: dict[str, pd.DataFrame] = dict( plant_entity_eia860=pd.DataFrame( columns=["plant_id_eia", "state", "balancing_authority_code_eia"], data=[(3, "AL", "SOCO"), (4, np.nan, np.nan)], diff --git a/test/unit/workspace/datastore_test.py b/test/unit/workspace/datastore_test.py index f5bc6d2f71..b91ef6667b 100644 --- a/test/unit/workspace/datastore_test.py +++ b/test/unit/workspace/datastore_test.py @@ -3,7 +3,6 @@ import json import re import unittest -from typing import Dict import responses @@ -86,7 +85,7 @@ class MockableZenodoFetcher(datastore.ZenodoFetcher): """ def __init__( - self, descriptors: Dict[str, datastore.DatapackageDescriptor], **kwargs + self, descriptors: dict[str, datastore.DatapackageDescriptor], **kwargs ): """Constructs test-friendly ZenodoFetcher that has given descriptors pre-loaded.""" super().__init__(**kwargs) From 0e1478212f59e3a7012c161b3413141d3ae3914d Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Fri, 10 Jun 2022 15:41:19 -0500 Subject: [PATCH 5/6] Update Python syntax to use 3.10 norms. --- .pre-commit-config.yaml | 6 ++++++ src/pudl/analysis/plant_parts_eia.py | 6 +++--- src/pudl/analysis/spatial.py | 4 ++-- src/pudl/extract/ferc1.py | 2 +- src/pudl/helpers.py | 4 ++-- src/pudl/metadata/classes.py | 10 ++++------ src/pudl/metadata/sources.py | 10 ++++------ src/pudl/transform/eia861.py | 4 ++-- test/integration/datasette_metadata_test.py | 2 +- 9 files changed, 25 insertions(+), 23 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2c84b9efcb..1bd7443f91 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -60,6 +60,12 @@ repos: hooks: - id: upgrade-type-hints +# Update Python language constructs to modern standards +- repo: https://github.com/asottile/pyupgrade + rev: v2.32.1 + hooks: + - id: pyupgrade + ######################################################################################## # Linters: hooks that check but don't alter Python and documentation files ######################################################################################## diff --git a/src/pudl/analysis/plant_parts_eia.py b/src/pudl/analysis/plant_parts_eia.py index 12ebd0613a..185b9b5dcd 100644 --- a/src/pudl/analysis/plant_parts_eia.py +++ b/src/pudl/analysis/plant_parts_eia.py @@ -675,9 +675,9 @@ def execute(self, gens_mega): keep=MAX_MIN_ATTRIBUTES_DICT[attribute_col]["keep"], ) # assert that all the plant part ID columns are now in part_df - assert set( - [col for part in PLANT_PARTS for col in PLANT_PARTS[part]["id_cols"]] - ).issubset(part_df.columns) + assert { + col for part in PLANT_PARTS for col in PLANT_PARTS[part]["id_cols"] + }.issubset(part_df.columns) part_dfs.append(part_df) plant_parts_eia = pd.concat(part_dfs) plant_parts_eia = TrueGranLabeler().execute(plant_parts_eia) diff --git a/src/pudl/analysis/spatial.py b/src/pudl/analysis/spatial.py index 723c4982b7..643b77fe50 100644 --- a/src/pudl/analysis/spatial.py +++ b/src/pudl/analysis/spatial.py @@ -247,9 +247,9 @@ def overlay( ratios = [] # Check for duplicate non-geometry column names seen = set() - duplicates = set( + duplicates = { c for df in gdfs for c in get_data_columns(df) if c in seen or seen.add(c) - ) + } if duplicates: raise 
ValueError(f"Duplicate column names in layers: {duplicates}") # Drop index columns and replace with default index of known name diff --git a/src/pudl/extract/ferc1.py b/src/pudl/extract/ferc1.py index 1e3c98935a..2e8c0fea56 100644 --- a/src/pudl/extract/ferc1.py +++ b/src/pudl/extract/ferc1.py @@ -160,7 +160,7 @@ def observed_respondents(ferc1_engine: sa.engine.Engine) -> set[int]: """ f1_table_meta = pudl.output.pudltabl.get_table_meta(ferc1_engine) - observed = set([]) + observed = set() for table in f1_table_meta.values(): if "respondent_id" in table.columns: observed = observed.union( diff --git a/src/pudl/helpers.py b/src/pudl/helpers.py index 17b419e597..505df10459 100644 --- a/src/pudl/helpers.py +++ b/src/pudl/helpers.py @@ -618,7 +618,7 @@ def organize_cols(df, cols): """ # Generate a list of all the columns in the dataframe that are not # included in cols - data_cols = sorted([c for c in df.columns.tolist() if c not in cols]) + data_cols = sorted(c for c in df.columns.tolist() if c not in cols) organized_cols = cols + data_cols return df[organized_cols] @@ -1534,7 +1534,7 @@ def get_eia_ferc_acct_map(): def dedupe_n_flatten_list_of_lists(mega_list): """Flatten a list of lists and remove duplicates.""" - return list(set([item for sublist in mega_list for item in sublist])) + return list({item for sublist in mega_list for item in sublist}) def convert_df_to_excel_file(df: pd.DataFrame, **kwargs) -> pd.ExcelFile: diff --git a/src/pudl/metadata/classes.py b/src/pudl/metadata/classes.py index e376b0093f..1901689591 100644 --- a/src/pudl/metadata/classes.py +++ b/src/pudl/metadata/classes.py @@ -918,11 +918,9 @@ def get_resource_ids(self) -> list[str]: resources = eia861.RESOURCE_METADATA return sorted( - [ - name - for name, value in resources.items() - if value.get("etl_group") == self.name - ] + name + for name, value in resources.items() + if value.get("etl_group") == self.name ) def get_temporal_coverage(self, partitions: dict = None) -> str: @@ -1530,7 +1528,7 @@ def _build_aggregation_report(self, df: pd.DataFrame, errors: dict) -> dict: "stats": stats, "errors": errors.get(field.name, None), } - nerrors = sum([not f["valid"] for f in freports.values()]) + nerrors = sum(not f["valid"] for f in freports.values()) stats = { "all": ncols, "invalid": nerrors, diff --git a/src/pudl/metadata/sources.py b/src/pudl/metadata/sources.py index 55a397d680..00748a0ca0 100644 --- a/src/pudl/metadata/sources.py +++ b/src/pudl/metadata/sources.py @@ -13,12 +13,10 @@ ), "working_partitions": {}, # Census DP1 is monolithic. 
"keywords": sorted( - set( - [ - "censusdp1tract", - "census", - ] - ) + { + "censusdp1tract", + "census", + } ), "license_raw": LICENSES["us-govt"], "license_pudl": LICENSES["cc-by-4.0"], diff --git a/src/pudl/transform/eia861.py b/src/pudl/transform/eia861.py index c9b6d7f519..e79af0ae56 100644 --- a/src/pudl/transform/eia861.py +++ b/src/pudl/transform/eia861.py @@ -659,7 +659,7 @@ def _clean_nerc(df, idx_cols): # Record a list of the reported nerc regions not included in the recognized regions list (these eventually become UNK) nerc_col = nerc_df["nerc_region"].tolist() - nerc_list = list(set([item for sublist in nerc_col for item in sublist])) + nerc_list = list({item for sublist in nerc_col for item in sublist}) non_nerc_list = [ nerc_entity for nerc_entity in nerc_list @@ -692,7 +692,7 @@ def _remove_nerc_duplicates(entity_list): ] ) ) - .apply(lambda x: sorted([i if i in NERC_REGIONS else "UNK" for i in x])) + .apply(lambda x: sorted(i if i in NERC_REGIONS else "UNK" for i in x)) .apply(lambda x: _remove_nerc_duplicates(x)) .str.join("_") ) diff --git a/test/integration/datasette_metadata_test.py b/test/integration/datasette_metadata_test.py index 15759c4ff9..b9e988f3e6 100644 --- a/test/integration/datasette_metadata_test.py +++ b/test/integration/datasette_metadata_test.py @@ -28,7 +28,7 @@ def test_datasette_metadata_script(script_runner, pudl_settings_fixture): logger.info("Parsing generated metadata using datasette utils.") metadata_json = json.dumps(yaml.safe_load(metadata_yml.open())) parsed_metadata = datasette.utils.parse_metadata(metadata_json) - assert set(parsed_metadata["databases"]) == set(["pudl", "ferc1"]) + assert set(parsed_metadata["databases"]) == {"pudl", "ferc1"} assert parsed_metadata["license"] == "CC-BY-4.0" assert ( parsed_metadata["databases"]["pudl"]["source_url"] From 4e053b50dc93fcc0d242bd2ac058f81d5d91d2c5 Mon Sep 17 00:00:00 2001 From: Zane Selvans Date: Fri, 10 Jun 2022 17:42:56 -0500 Subject: [PATCH 6/6] Use only Python 3.10+ compatible syntax. --- .pre-commit-config.yaml | 1 + src/pudl/analysis/epa_crosswalk.py | 100 +++++++++++++---------- src/pudl/analysis/plant_parts_eia.py | 12 +-- src/pudl/analysis/spatial.py | 12 +-- src/pudl/analysis/state_demand.py | 6 +- src/pudl/analysis/timeseries_cleaning.py | 10 +-- src/pudl/extract/excel.py | 4 +- src/pudl/extract/ferc1.py | 2 +- src/pudl/helpers.py | 8 +- src/pudl/metadata/classes.py | 28 +++---- src/pudl/metadata/fields.py | 16 ++-- src/pudl/metadata/helpers.py | 16 ++-- src/pudl/output/eia923.py | 10 +-- src/pudl/output/epacems.py | 11 ++- src/pudl/output/ferc714.py | 2 +- src/pudl/output/pudltabl.py | 12 +-- src/pudl/workspace/datastore.py | 6 +- test/conftest.py | 2 +- test/unit/analysis/state_demand_test.py | 7 +- 19 files changed, 134 insertions(+), 131 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1bd7443f91..974247c6c7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -65,6 +65,7 @@ repos: rev: v2.32.1 hooks: - id: pyupgrade + args: ['--py310-plus'] ######################################################################################## # Linters: hooks that check but don't alter Python and documentation files diff --git a/src/pudl/analysis/epa_crosswalk.py b/src/pudl/analysis/epa_crosswalk.py index 04dff444c1..dbb89a59be 100644 --- a/src/pudl/analysis/epa_crosswalk.py +++ b/src/pudl/analysis/epa_crosswalk.py @@ -1,23 +1,24 @@ """Use the EPA crosswalk to connect EPA units to EIA generators and other data. 
-A major use case for this dataset is to identify subplants within plant_ids, -which are the smallest coherent units for aggregation. -Despite the name, plant_id refers to a legal entity that often contains -multiple distinct power plants, even of different technology or fuel types. +A major use case for this dataset is to identify subplants within plant_ids, which are +the smallest coherent units for aggregation. Despite the name, plant_id refers to a +legal entity that often contains multiple distinct power plants, even of different +technology or fuel types. EPA CEMS data combines information from several parts of a power plant: + * emissions from smokestacks * fuel use from combustors * electricty production from generators -But smokestacks, combustors, and generators can be connected in -complex, many-to-many relationships. This complexity makes attribution difficult for, -as an example, allocating pollution to energy producers. -Furthermore, heterogeneity within plant_ids make aggregation -to the parent entity difficult or inappropriate. -But by analyzing the relationships between combustors and generators, -as provided in the EPA/EIA crosswalk, we can identify distinct power plants. -These are the smallest coherent units of aggregation. +But smokestacks, combustors, and generators can be connected in complex, many-to-many +relationships. This complexity makes attribution difficult for, as an example, +allocating pollution to energy producers. Furthermore, heterogeneity within plant_ids +make aggregation to the parent entity difficult or inappropriate. + +But by analyzing the relationships between combustors and generators, as provided in the +EPA/EIA crosswalk, we can identify distinct power plants. These are the smallest +coherent units of aggregation. In graph analysis terminology, the crosswalk is a list of edges between nodes (combustors and generators) in a bipartite graph. The networkx python package provides @@ -38,21 +39,19 @@ filtered_crosswalk = filter_crosswalk(epa_crosswalk_df, epacems) crosswalk_with_subplant_ids = make_subplant_ids(filtered_crosswalk) """ -from typing import Union - import dask.dataframe as dd import networkx as nx import pandas as pd -def _get_unique_keys(epacems: Union[pd.DataFrame, dd.DataFrame]) -> pd.DataFrame: +def _get_unique_keys(epacems: pd.DataFrame | dd.DataFrame) -> pd.DataFrame: """Get unique unit IDs from CEMS data. Args: - epacems (Union[pd.DataFrame, dd.DataFrame]): epacems dataset from pudl.output.epacems.epacems + epacems: dataset from :func:`pudl.output.epacems.epacems` Returns: - pd.DataFrame: unique keys from the epacems dataset + Unique keys from the epacems dataset. """ # The purpose of this function is mostly to resolve the @@ -64,7 +63,7 @@ def _get_unique_keys(epacems: Union[pd.DataFrame, dd.DataFrame]) -> pd.DataFrame def filter_crosswalk_by_epacems( - crosswalk: pd.DataFrame, epacems: Union[pd.DataFrame, dd.DataFrame] + crosswalk: pd.DataFrame, epacems: pd.DataFrame | dd.DataFrame ) -> pd.DataFrame: """Inner join unique CEMS units with the EPA crosswalk. @@ -74,11 +73,11 @@ def filter_crosswalk_by_epacems( Args: crosswalk: the EPA crosswalk, as from pudl.output.epacems.epa_crosswalk() - unique_epacems_ids (pd.DataFrame): unique ids from _get_unique_keys + unique_epacems_ids: unique ids from _get_unique_keys Returns: - The inner join of the EPA crosswalk and unique epacems units. Adds - the global ID column unit_id_epa. + The inner join of the EPA crosswalk and unique epacems units. Adds the global ID + column unit_id_epa. 
""" unique_epacems_ids = _get_unique_keys(epacems) @@ -94,13 +93,14 @@ def filter_crosswalk_by_epacems( def filter_out_unmatched(crosswalk: pd.DataFrame) -> pd.DataFrame: """Remove unmatched or excluded (non-exporting) units. - Unmatched rows are limitations of the completeness of the EPA crosswalk itself, not of PUDL. + Unmatched rows are limitations of the completeness of the EPA crosswalk itself, not + of PUDL. Args: - crosswalk (pd.DataFrame): the EPA crosswalk, as from pudl.output.epacems.epa_crosswalk() + crosswalk: the EPA crosswalk, as from :func:`pudl.output.epacems.epa_crosswalk` Returns: - pd.DataFrame: the EPA crosswalk with unmatched units removed + The EPA crosswalk with unmatched units removed. """ bad = crosswalk["MATCH_TYPE_GEN"].isin({"CAMD Unmatched", "Manual CAMD Excluded"}) return crosswalk.loc[~bad].copy() @@ -110,10 +110,10 @@ def filter_out_boiler_rows(crosswalk: pd.DataFrame) -> pd.DataFrame: """Remove rows that represent graph edges between generators and boilers. Args: - crosswalk (pd.DataFrame): the EPA crosswalk, as from pudl.output.epacems.epa_crosswalk() + crosswalk: the EPA crosswalk, as from :func:`pudl.output.epacems.epa_crosswalk` Returns: - pd.DataFrame: the EPA crosswalk with boiler rows (many/one-to-many) removed + The EPA crosswalk with boiler rows (many/one-to-many) removed """ crosswalk = crosswalk.drop_duplicates( subset=["CAMD_PLANT_ID", "CAMD_UNIT_ID", "EIA_GENERATOR_ID"] @@ -125,10 +125,11 @@ def _prep_for_networkx(crosswalk: pd.DataFrame) -> pd.DataFrame: """Make surrogate keys for combustors and generators. Args: - crosswalk (pd.DataFrame): EPA crosswalk, as from pudl.output.epacems.epa_crosswalk() + crosswalk: EPA crosswalk, as from :func:`pudl.output.epacems.epa_crosswalk` Returns: - pd.DataFrame: copy of EPA crosswalk with new surrogate ID columns 'combustor_id' and 'generator_id' + A copy of EPA crosswalk with new surrogate ID columns 'combustor_id' and + 'generator_id' """ prepped = crosswalk.copy() # networkx can't handle composite keys, so make surrogates @@ -145,13 +146,13 @@ def _prep_for_networkx(crosswalk: pd.DataFrame) -> pd.DataFrame: def _subplant_ids_from_prepped_crosswalk(prepped: pd.DataFrame) -> pd.DataFrame: - """Use networkx graph analysis to create global subplant IDs from a preprocessed crosswalk edge list. + """Use graph analysis to create global subplant IDs from a crosswalk edge list. Args: - prepped (pd.DataFrame): an EPA crosswalk that has passed through _prep_for_networkx() + prepped: an EPA crosswalk that has passed through :func:`_prep_for_networkx` Returns: - pd.DataFrame: copy of EPA crosswalk plus new column 'global_subplant_id' + A copy of EPA crosswalk plus new column 'global_subplant_id' """ graph = nx.from_pandas_edgelist( prepped, @@ -171,27 +172,29 @@ def _subplant_ids_from_prepped_crosswalk(prepped: pd.DataFrame) -> pd.DataFrame: def _convert_global_id_to_composite_id( crosswalk_with_ids: pd.DataFrame, ) -> pd.DataFrame: - """Convert global_subplant_id to an equivalent composite key (CAMD_PLANT_ID, subplant_id). + """Convert global_subplant_id to a composite key (CAMD_PLANT_ID, subplant_id). The composite key will be much more stable (though not fully stable!) in time. The global ID changes if ANY unit or generator changes, whereas the compound key only changes if units/generators change within that specific plant. - A global ID could also tempt users into using it as a crutch, even though it isn't stable. - A compound key should discourage that behavior. 
+ A global ID could also tempt users into using it as a crutch, even though it isn't + stable. A compound key should discourage that behavior. Args: - crosswalk_with_ids (pd.DataFrame): crosswalk with global_subplant_id, as from _subplant_ids_from_prepped_crosswalk() + crosswalk_with_ids: crosswalk with global_subplant_id, as from + :func:`_subplant_ids_from_prepped_crosswalk` Raises: ValueError: if crosswalk_with_ids has a MultiIndex Returns: - pd.DataFrame: copy of crosswalk_with_ids with an added column: 'subplant_id' + A copy of crosswalk_with_ids with an added column: 'subplant_id' """ if isinstance(crosswalk_with_ids.index, pd.MultiIndex): raise ValueError( - f"Input crosswalk must have single level index. Given levels: {crosswalk_with_ids.index.names}" + "Input crosswalk must have single level index. " + f"Given levels: {crosswalk_with_ids.index.names}" ) reindexed = crosswalk_with_ids.reset_index() # copy @@ -223,16 +226,20 @@ def _convert_global_id_to_composite_id( def filter_crosswalk( - crosswalk: pd.DataFrame, epacems: Union[pd.DataFrame, dd.DataFrame] + crosswalk: pd.DataFrame, epacems: pd.DataFrame | dd.DataFrame ) -> pd.DataFrame: - """Remove crosswalk rows that do not correspond to an EIA facility or are duplicated due to many-to-many boiler relationships. + """Remove irrelevant or duplicated rows from the crosswalk. + + Remove crosswalk rows that do not correspond to an EIA facility or are duplicated + due to many-to-many boiler relationships. Args: - crosswalk (pd.DataFrame): The EPA/EIA crosswalk, as from pudl.output.epacems.epa_crosswalk() - epacems (Union[pd.DataFrame, dd.DataFrame]): Emissions data. Must contain columns named ["plant_id_eia", "unitid", "unit_id_epa"] + crosswalk: The EPA/EIA crosswalk from :func:`pudl.output.epacems.epa_crosswalk` + epacems: Emissions data. Must contain columns named + ["plant_id_eia", "unitid", "unit_id_epa"] Returns: - pd.DataFrame: A filtered copy of EPA crosswalk + A filtered copy of EPA crosswalk. """ filtered_crosswalk = filter_out_unmatched(crosswalk) filtered_crosswalk = filter_out_boiler_rows(filtered_crosswalk) @@ -241,7 +248,9 @@ def filter_crosswalk( def make_subplant_ids(crosswalk: pd.DataFrame) -> pd.DataFrame: - """Identify sub-plants in the EPA/EIA crosswalk graph. Any row filtering should be done before this step. + """Identify sub-plants in the EPA/EIA crosswalk graph. + + Any row filtering should be done before this step. 
Usage Example: @@ -251,10 +260,11 @@ def make_subplant_ids(crosswalk: pd.DataFrame) -> pd.DataFrame: crosswalk_with_subplant_ids = make_subplant_ids(filtered_crosswalk) Args: - crosswalk (pd.DataFrame): The EPA/EIA crosswalk, as from pudl.output.epacems.epa_crosswalk() + crosswalk: The EPA/EIA crosswalk, from :func:`pudl.output.epacems.epa_crosswalk` Returns: - pd.DataFrame: An edge list connecting EPA units to EIA generators, with connected pieces issued a subplant_id + An edge list connecting EPA units to EIA generators, with connected pieces + issued a subplant_id """ edge_list = _prep_for_networkx(crosswalk) edge_list = _subplant_ids_from_prepped_crosswalk(edge_list) diff --git a/src/pudl/analysis/plant_parts_eia.py b/src/pudl/analysis/plant_parts_eia.py index 185b9b5dcd..ab9a2f1d71 100644 --- a/src/pudl/analysis/plant_parts_eia.py +++ b/src/pudl/analysis/plant_parts_eia.py @@ -182,7 +182,7 @@ import logging import warnings from copy import deepcopy -from typing import Literal, Optional +from typing import Literal import numpy as np import pandas as pd @@ -346,7 +346,7 @@ ] -class MakeMegaGenTbl(object): +class MakeMegaGenTbl: """Compiler for a MEGA generator table with ownership integrated. Examples @@ -605,7 +605,7 @@ def scale_by_ownership( return gens_mega -class MakePlantParts(object): +class MakePlantParts: """Compile the plant parts for the master unit list. This object generates a master list of different "plant-parts", which @@ -807,7 +807,7 @@ def validate_ownership_for_owned_records(self, plant_parts_eia): ) -class PlantPart(object): +class PlantPart: """Plant-part table maker. The coordinating method here is :meth:`execute`. @@ -1137,14 +1137,14 @@ def execute(self, ppl): return ppl_true_gran -class AddAttribute(object): +class AddAttribute: """Base class for adding attributes to plant-part tables.""" def __init__( self, attribute_col: str, part_name: str, - assign_col_dict: Optional[dict[str, str]] = None, + assign_col_dict: dict[str, str] | None = None, ): """Initialize a attribute adder. diff --git a/src/pudl/analysis/spatial.py b/src/pudl/analysis/spatial.py index 643b77fe50..c4787d2432 100644 --- a/src/pudl/analysis/spatial.py +++ b/src/pudl/analysis/spatial.py @@ -2,7 +2,7 @@ import itertools import warnings from collections.abc import Callable, Iterable -from typing import Literal, Union +from typing import Literal import geopandas as gpd import pandas as pd @@ -49,7 +49,7 @@ def check_gdf(gdf: gpd.GeoDataFrame) -> None: ) -def polygonize(geom: BaseGeometry) -> Union[Polygon, MultiPolygon]: +def polygonize(geom: BaseGeometry) -> Polygon | MultiPolygon: """Convert geometry to (Multi)Polygon. Args: @@ -174,10 +174,10 @@ def self_union(gdf: gpd.GeoDataFrame, ratios: Iterable[str] = None) -> gpd.GeoDa def dissolve( gdf: gpd.GeoDataFrame, by: Iterable[str], - func: Union[Callable, str, list, dict], - how: Union[ - Literal["union", "first"], Callable[[gpd.GeoSeries], BaseGeometry] - ] = "union", + func: Callable | str | list | dict, + how: ( + Literal["union", "first"] | Callable[[gpd.GeoSeries], BaseGeometry] + ) = "union", ) -> gpd.GeoDataFrame: """Dissolve layer by aggregating features based on common attributes. 
diff --git a/src/pudl/analysis/state_demand.py b/src/pudl/analysis/state_demand.py index ff56e06874..6937e1ae7b 100644 --- a/src/pudl/analysis/state_demand.py +++ b/src/pudl/analysis/state_demand.py @@ -28,7 +28,7 @@ import pathlib import sys from collections.abc import Iterable -from typing import Any, Union +from typing import Any import matplotlib.pyplot as plt import numpy as np @@ -45,7 +45,7 @@ # --- Constants --- # -STATES: list[dict[str, Union[str, int]]] = [ +STATES: list[dict[str, str | int]] = [ {"name": "Alabama", "code": "AL", "fips": "01"}, {"name": "Alaska", "code": "AK", "fips": "02"}, {"name": "Arizona", "code": "AZ", "fips": "04"}, @@ -151,7 +151,7 @@ # --- Helpers --- # -def lookup_state(state: Union[str, int]) -> dict: +def lookup_state(state: str | int) -> dict: """Lookup US state by state identifier. Args: diff --git a/src/pudl/analysis/timeseries_cleaning.py b/src/pudl/analysis/timeseries_cleaning.py index 1fbc3dc2c0..0c145baa94 100644 --- a/src/pudl/analysis/timeseries_cleaning.py +++ b/src/pudl/analysis/timeseries_cleaning.py @@ -32,7 +32,7 @@ import functools import warnings from collections.abc import Iterable, Sequence -from typing import Any, Union +from typing import Any import matplotlib.pyplot as plt import numpy as np @@ -109,7 +109,7 @@ def array_diff( return dx -def encode_run_length(x: Union[Sequence, np.ndarray]) -> tuple[np.ndarray, np.ndarray]: +def encode_run_length(x: Sequence | np.ndarray) -> tuple[np.ndarray, np.ndarray]: """Encode vector with run-length encoding. Args: @@ -144,8 +144,8 @@ def encode_run_length(x: Union[Sequence, np.ndarray]) -> tuple[np.ndarray, np.nd def insert_run_length( # noqa: C901 - x: Union[Sequence, np.ndarray], - values: Union[Sequence, np.ndarray], + x: Sequence | np.ndarray, + values: Sequence | np.ndarray, lengths: Sequence[int], mask: Sequence[bool] = None, padding: int = 0, @@ -543,7 +543,7 @@ class Timeseries: columns: Column names. """ - def __init__(self, x: Union[np.ndarray, pd.DataFrame]) -> None: + def __init__(self, x: np.ndarray | pd.DataFrame) -> None: """Initialize a multivariate timeseries. Args: diff --git a/src/pudl/extract/excel.py b/src/pudl/extract/excel.py index 6b11fef249..0fbe42f5c7 100644 --- a/src/pudl/extract/excel.py +++ b/src/pudl/extract/excel.py @@ -11,7 +11,7 @@ logger = logging.getLogger(__name__) -class Metadata(object): +class Metadata: """Load Excel metadata from Python package data. Excel sheet files may contain many different tables. When we load those @@ -120,7 +120,7 @@ def _get_partition_key(partition): return list(partition.values())[0] -class GenericExtractor(object): +class GenericExtractor: """Contains logic for extracting panda.DataFrames from excel spreadsheets. This class implements the generic dataset agnostic logic to load data diff --git a/src/pudl/extract/ferc1.py b/src/pudl/extract/ferc1.py index 2e8c0fea56..dbc45bed62 100644 --- a/src/pudl/extract/ferc1.py +++ b/src/pudl/extract/ferc1.py @@ -480,7 +480,7 @@ def parseN(self, field, data): # noqa: N802 # Replace bare periods (which are non-numeric) with zero. 
if data == b".": data = b"0" - return super(FERC1FieldParser, self).parseN(field, data) + return super().parseN(field, data) def get_raw_df( diff --git a/src/pudl/helpers.py b/src/pudl/helpers.py index 505df10459..edd6780fdc 100644 --- a/src/pudl/helpers.py +++ b/src/pudl/helpers.py @@ -16,7 +16,7 @@ from functools import partial from importlib import resources from io import BytesIO -from typing import Any, Literal, Optional, Union +from typing import Any, Literal import addfips import numpy as np @@ -45,8 +45,8 @@ def label_map( df: pd.DataFrame, from_col: str = "code", to_col: str = "label", - null_value: Union[str, type(pd.NA)] = pd.NA, -) -> defaultdict[str, Union[str, type(pd.NA)]]: + null_value: str | type(pd.NA) = pd.NA, +) -> defaultdict[str, str | type(pd.NA)]: """Build a mapping dictionary from two columns of a labeling / coding dataframe. These dataframes document the meanings of the codes that show up in much of the @@ -1063,7 +1063,7 @@ def merge_dicts(list_of_dicts): def convert_cols_dtypes( - df: pd.DataFrame, data_source: Optional[str] = None, name: Optional[str] = None + df: pd.DataFrame, data_source: str | None = None, name: str | None = None ) -> pd.DataFrame: """Convert a PUDL dataframe's columns to the correct data type. diff --git a/src/pudl/metadata/classes.py b/src/pudl/metadata/classes.py index 1901689591..ec89d08e49 100644 --- a/src/pudl/metadata/classes.py +++ b/src/pudl/metadata/classes.py @@ -7,7 +7,7 @@ from collections.abc import Callable, Iterable from functools import lru_cache from pathlib import Path -from typing import Any, Literal, Optional, Union +from typing import Any, Literal import jinja2 import pandas as pd @@ -286,7 +286,7 @@ def StrictList(item_type: type = Any) -> pydantic.ConstrainedList: # noqa: N802 # ---- Class attribute validators ---- # -def _check_unique(value: list = None) -> Optional[list]: +def _check_unique(value: list = None) -> list | None: """Check that input list has unique values.""" if value: for i in range(len(value)): @@ -326,11 +326,11 @@ class FieldConstraints(Base): unique: Bool = False min_length: PositiveInt = None max_length: PositiveInt = None - minimum: Union[Int, Float, Date, Datetime] = None - maximum: Union[Int, Float, Date, Datetime] = None + minimum: Int | Float | Date | Datetime = None + maximum: Int | Float | Date | Datetime = None pattern: Pattern = None # TODO: Replace with String (min_length=1) once "" removed from enums - enum: StrictList(Union[pydantic.StrictStr, Int, Float, Bool, Date, Datetime]) = None + enum: StrictList(pydantic.StrictStr | Int | Float | Bool | Date | Datetime) = None _check_unique = _validator("enum", fn=_check_unique) @@ -414,14 +414,14 @@ class Encoder(Base): values. """ - ignored_codes: list[Union[Int, str]] = [] + ignored_codes: list[Int | str] = [] """A list of non-standard codes which appear in the data, and will be set to NA. These codes may be the result of data entry errors, and we are unable to map them to the appropriate canonical code. They are discarded from the raw input data. """ - code_fixes: dict[Union[Int, String], Union[Int, String]] = {} + code_fixes: dict[Int | String, Int | String] = {} """A dictionary mapping non-standard codes to canonical, standardized codes. 
The intended meanings of some non-standard codes are clear, and therefore they can @@ -502,7 +502,7 @@ def _check_fixed_codes_are_good_codes(cls, code_fixes, values): # noqa: N805 return code_fixes @property - def code_map(self) -> dict[str, Union[str, type(pd.NA)]]: + def code_map(self) -> dict[str, str | type(pd.NA)]: """A mapping of all known codes to their standardized values, or NA.""" code_map = {code: code for code in self.df["code"]} code_map.update(self.code_fixes) @@ -512,7 +512,7 @@ def code_map(self) -> dict[str, Union[str, type(pd.NA)]]: def encode( self, col: pd.Series, - dtype: Union[type, None] = None, + dtype: type | None = None, ) -> pd.Series: """Apply the stored code mapping to an input Series.""" # Every value in the Series should appear in the map. If that's not the @@ -634,7 +634,7 @@ def from_id(cls, x: str) -> "Field": """Construct from PUDL identifier (`Field.name`).""" return cls(**cls.dict_from_id(x)) - def to_pandas_dtype(self, compact: bool = False) -> Union[str, pd.CategoricalDtype]: + def to_pandas_dtype(self, compact: bool = False) -> str | pd.CategoricalDtype: """Return Pandas data type. Args: @@ -728,7 +728,7 @@ def to_sql( # noqa: C901 comment=self.description, ) - def encode(self, col: pd.Series, dtype: Union[type, None] = None) -> pd.Series: + def encode(self, col: pd.Series, dtype: type | None = None) -> pd.Series: """Recode the Field if it has an associated encoder.""" return self.encoder.encode(col, dtype=dtype) if self.encoder else col @@ -1305,9 +1305,7 @@ def to_pyarrow(self) -> pa.Schema: } return pa.schema(fields=fields, metadata=metadata) - def to_pandas_dtypes( - self, **kwargs: Any - ) -> dict[str, Union[str, pd.CategoricalDtype]]: + def to_pandas_dtypes(self, **kwargs: Any) -> dict[str, str | pd.CategoricalDtype]: """Return Pandas data type of each field by field name. Args: @@ -1315,7 +1313,7 @@ def to_pandas_dtypes( """ return {f.name: f.to_pandas_dtype(**kwargs) for f in self.schema.fields} - def match_primary_key(self, names: Iterable[str]) -> Optional[dict[str, str]]: + def match_primary_key(self, names: Iterable[str]) -> dict[str, str] | None: """Match primary key fields to input field names. An exact match is required unless :attr:`harvest` .`harvest=True`, diff --git a/src/pudl/metadata/fields.py b/src/pudl/metadata/fields.py index 782774cba3..55a0fa2698 100644 --- a/src/pudl/metadata/fields.py +++ b/src/pudl/metadata/fields.py @@ -1,6 +1,6 @@ """Field metadata.""" from copy import deepcopy -from typing import Any, Optional +from typing import Any import pandas as pd from pytz import all_timezones @@ -2103,10 +2103,10 @@ def get_pudl_dtypes( - group: Optional[str] = None, - field_meta: Optional[dict[str, Any]] = FIELD_METADATA, - field_meta_by_group: Optional[dict[str, Any]] = FIELD_METADATA_BY_GROUP, - dtype_map: Optional[dict[str, Any]] = FIELD_DTYPES_PANDAS, + group: str | None = None, + field_meta: dict[str, Any] | None = FIELD_METADATA, + field_meta_by_group: dict[str, Any] | None = FIELD_METADATA_BY_GROUP, + dtype_map: dict[str, Any] | None = FIELD_DTYPES_PANDAS, ) -> dict[str, Any]: """Compile a dictionary of field dtypes, applying group overrides. 
@@ -2136,9 +2136,9 @@ def get_pudl_dtypes( def apply_pudl_dtypes( df: pd.DataFrame, - group: Optional[str] = None, - field_meta: Optional[dict[str, Any]] = FIELD_METADATA, - field_meta_by_group: Optional[dict[str, Any]] = FIELD_METADATA_BY_GROUP, + group: str | None = None, + field_meta: dict[str, Any] | None = FIELD_METADATA, + field_meta_by_group: dict[str, Any] | None = FIELD_METADATA_BY_GROUP, ) -> pd.DataFrame: """Apply dtypes to those columns in a dataframe that have PUDL types defined. diff --git a/src/pudl/metadata/helpers.py b/src/pudl/metadata/helpers.py index fad60f4aab..518a555d4e 100644 --- a/src/pudl/metadata/helpers.py +++ b/src/pudl/metadata/helpers.py @@ -1,7 +1,7 @@ """Functions for manipulating metadata constants.""" from collections import defaultdict from collections.abc import Callable, Iterable -from typing import Any, Optional, Union +from typing import Any import numpy as np import pandas as pd @@ -45,7 +45,7 @@ def format_errors(*errors: str, title: str = None, pydantic: bool = False) -> st # --- Foreign keys --- # -def _parse_field_names(fields: list[Union[str, dict]]) -> list[str]: +def _parse_field_names(fields: list[str | dict]) -> list[str]: """Parse field names. Args: @@ -168,9 +168,9 @@ def build_foreign_keys( A resource's `foreign_key_rules` (if present) determines which other resources will be assigned a foreign key (`foreign_keys`) to the reference's primary key: - * `fields` (List[List[str]]): Sets of field names for which to create a foreign key. + * `fields` (list[list[str]]): Sets of field names for which to create a foreign key. These are assumed to match the order of the reference's primary key fields. - * `exclude` (Optional[List[str]]): Names of resources to exclude. + * `exclude` (Optional[list[str]]): Names of resources to exclude. Args: resources: Resource descriptors by name. @@ -179,9 +179,9 @@ def build_foreign_keys( Returns: Foreign keys for each resource (if any), by resource name. - * `fields` (List[str]): Field names. + * `fields` (list[str]): Field names. * `reference['resource']` (str): Reference resource name. - * `reference['fields']` (List[str]): Reference resource field names. + * `reference['fields']` (list[str]): Reference resource field names. Examples: >>> resources = { @@ -229,7 +229,7 @@ def build_foreign_keys( # --- Harvest --- # -def split_period(name: str) -> tuple[str, Optional[str]]: +def split_period(name: str) -> tuple[str, str | None]: """Split the time period from a column name. Args: @@ -343,7 +343,7 @@ def as_dict(x: pd.Series) -> dict[Any, list]: def try_aggfunc( # noqa: C901 func: Callable, raised: bool = True, - error: Union[str, Callable] = None, + error: str | Callable = None, ) -> Callable: """Wrap aggregate function in a try-except for error handling. 
diff --git a/src/pudl/output/eia923.py b/src/pudl/output/eia923.py index ea45184e4c..3b908abc80 100644 --- a/src/pudl/output/eia923.py +++ b/src/pudl/output/eia923.py @@ -2,7 +2,7 @@ import logging import os from datetime import date, datetime -from typing import Literal, Union +from typing import Literal import numpy as np import pandas as pd @@ -39,8 +39,8 @@ def generation_fuel_eia923( pudl_engine, freq: Literal["AS", "MS", None] = None, - start_date: Union[str, date, datetime, pd.Timestamp] = None, - end_date: Union[str, date, datetime, pd.Timestamp] = None, + start_date: str | date | datetime | pd.Timestamp = None, + end_date: str | date | datetime | pd.Timestamp = None, nuclear: bool = False, ): """Pull records from the generation_fuel_eia923 table in given date range. @@ -229,8 +229,8 @@ def generation_fuel_all_eia923(gf: pd.DataFrame, gfn: pd.DataFrame) -> pd.DataFr def fuel_receipts_costs_eia923( pudl_engine, freq: Literal["AS", "MS", None] = None, - start_date: Union[str, date, datetime, pd.Timestamp] = None, - end_date: Union[str, date, datetime, pd.Timestamp] = None, + start_date: str | date | datetime | pd.Timestamp = None, + end_date: str | date | datetime | pd.Timestamp = None, fill: bool = False, roll: bool = False, ) -> pd.DataFrame: diff --git a/src/pudl/output/epacems.py b/src/pudl/output/epacems.py index ab5c5e68e4..9d2b22d73e 100644 --- a/src/pudl/output/epacems.py +++ b/src/pudl/output/epacems.py @@ -2,7 +2,6 @@ from collections.abc import Iterable, Sequence from itertools import product from pathlib import Path -from typing import Optional, Union import dask.dataframe as dd import pandas as pd @@ -31,7 +30,7 @@ def epa_crosswalk() -> pd.DataFrame: def year_state_filter( years: Iterable[int] = None, states: Iterable[str] = None -) -> list[list[tuple[Union[str, int]]]]: +) -> list[list[tuple[str | int]]]: """Create filters to read given years and states from partitioned parquet dataset. A subset of an Apache Parquet dataset can be read in more efficiently if files which @@ -131,10 +130,10 @@ def get_plant_years(plant_ids, pudl_out): def epacems( - states: Optional[Sequence[str]] = None, - years: Optional[Sequence[int]] = None, - columns: Optional[Sequence[str]] = None, - epacems_path: Optional[Path] = None, + states: Sequence[str] | None = None, + years: Sequence[int] | None = None, + columns: Sequence[str] | None = None, + epacems_path: Path | None = None, ) -> dd.DataFrame: """Load EPA CEMS data from PUDL with optional subsetting. diff --git a/src/pudl/output/ferc714.py b/src/pudl/output/ferc714.py index 918f934e15..668183493c 100644 --- a/src/pudl/output/ferc714.py +++ b/src/pudl/output/ferc714.py @@ -189,7 +189,7 @@ def categorize_eia_code(eia_codes, ba_ids, util_ids, priority="balancing_authori return df -class Respondents(object): +class Respondents: """A class coordinating compilation of data related to FERC 714 Respondents. The FERC 714 Respondents themselves are not complex as they are reported, but diff --git a/src/pudl/output/pudltabl.py b/src/pudl/output/pudltabl.py index b4cb446b42..df641b334a 100644 --- a/src/pudl/output/pudltabl.py +++ b/src/pudl/output/pudltabl.py @@ -29,7 +29,7 @@ import logging from collections import defaultdict from datetime import date, datetime -from typing import Any, Literal, Union +from typing import Any, Literal # Useful high-level external modules. 
import pandas as pd @@ -52,16 +52,16 @@ ############################################################################### -class PudlTabl(object): +class PudlTabl: """A class for compiling common useful tabular outputs from the PUDL DB.""" def __init__( self, pudl_engine: sa.engine.Engine, - ds: Union[Datastore, None] = None, + ds: Datastore | None = None, freq: Literal["AS", "MS", None] = None, - start_date: Union[str, date, datetime, pd.Timestamp] = None, - end_date: Union[str, date, datetime, pd.Timestamp] = None, + start_date: str | date | datetime | pd.Timestamp = None, + end_date: str | date | datetime | pd.Timestamp = None, fill_fuel_cost: bool = False, roll_fuel_cost: bool = False, fill_net_gen: bool = False, @@ -109,7 +109,7 @@ def __init__( """ # Validating ds is deferred to the etl_eia861 & etl_ferc714 methods # because those are the only places a datastore is required. - self.ds: Union[Datastore, None] = ds + self.ds: Datastore | None = ds if not isinstance(pudl_engine, sa.engine.base.Engine): raise TypeError( "PudlTabl needs pudl_engine to be a SQLAlchemy Engine, but we " diff --git a/src/pudl/workspace/datastore.py b/src/pudl/workspace/datastore.py index 540b6831c4..9230445b5c 100644 --- a/src/pudl/workspace/datastore.py +++ b/src/pudl/workspace/datastore.py @@ -10,7 +10,7 @@ from collections import defaultdict from collections.abc import Iterator from pathlib import Path -from typing import Any, Optional +from typing import Any import coloredlogs import datapackage @@ -261,8 +261,8 @@ class Datastore: def __init__( self, - local_cache_path: Optional[Path] = None, - gcs_cache_path: Optional[str] = None, + local_cache_path: Path | None = None, + gcs_cache_path: str | None = None, sandbox: bool = False, timeout: float = 15, ): diff --git a/test/conftest.py b/test/conftest.py index 6050e6d98d..811a3172a6 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -68,7 +68,7 @@ def etl_parameters(request, test_dir): etl_settings_yml = Path( test_dir.parent / "src/pudl/package_data/settings/etl_fast.yml" ) - with open(etl_settings_yml, mode="r", encoding="utf8") as settings_file: + with open(etl_settings_yml, encoding="utf8") as settings_file: etl_settings_out = yaml.safe_load(settings_file) etl_settings = EtlSettings().parse_obj(etl_settings_out) return etl_settings diff --git a/test/unit/analysis/state_demand_test.py b/test/unit/analysis/state_demand_test.py index d238eb5547..9a4cd96107 100644 --- a/test/unit/analysis/state_demand_test.py +++ b/test/unit/analysis/state_demand_test.py @@ -1,7 +1,4 @@ """Tests for timeseries anomalies detection and imputation.""" - -from typing import Union - import numpy as np import pandas as pd import pytest @@ -33,8 +30,6 @@ pytest.param(None, {}, marks=pytest.mark.xfail), ], ) -def test_lookup_state( - state: Union[str, int], expected: dict[str, Union[str, int]] -) -> None: +def test_lookup_state(state: str | int, expected: dict[str, str | int]) -> None: """Check that various kinds of state lookups work.""" assert lookup_state(state) == expected
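
The six patches above repeatedly apply the same handful of Python 3.10 idioms: PEP 585 built-in generics (list[str], dict[str, int]) in place of typing.List and typing.Dict, PEP 604 unions (X | None) in place of Optional[X] and Union[X, Y], abstract container types imported from collections.abc rather than typing, and comprehensions in place of set([...]). The sketch below illustrates those conventions side by side. It is not part of the patch series: the names year_state_pairs and partition_values are hypothetical stand-ins that only loosely echo year_state_filter and DatapackageDescriptor.get_partitions from the code touched above.

# A minimal, hypothetical sketch of the Python 3.10 idioms adopted above.
# The names here (year_state_pairs, partition_values) are illustrative only
# and do not exist in PUDL; they loosely echo code touched by these patches.
from collections import defaultdict
from collections.abc import Iterable  # ABCs now come from collections.abc, not typing


def year_state_pairs(
    years: Iterable[int] | None = None,   # PEP 604: "X | None" replaces Optional[X]
    states: Iterable[str] | None = None,
) -> list[tuple[int, str]]:              # PEP 585: built-in generics replace typing.List/Tuple
    """Cross years with states, in the spirit of the partitioned-dataset filter builder."""
    years_list = list(years or [])
    states_list = list(states or [])
    return [(y, s) for y in years_list for s in states_list]


def partition_values(resources: list[dict[str, str]]) -> dict[str, set[str]]:
    """Collect the known values of each partition key across resource descriptors."""
    partitions: dict[str, set[str]] = defaultdict(set)
    for res in resources:
        for key, value in res.items():
            partitions[key].add(value)
    return dict(partitions)


if __name__ == "__main__":
    print(year_state_pairs(years=[2019, 2020], states=["ID", "CO"]))
    # Set comprehension rather than set([...]), as in the pyupgrade-style rewrites:
    states_seen = {s for _, s in year_state_pairs(years=[2019], states=["ID", "CO", "ID"])}
    print(states_seen)

Because setup.py now requires Python >= 3.10, all of these forms are valid at runtime as well as in annotations, which is what allows the pep585-upgrade hook added in patch 4/6 to swap the typing aliases automatically and the pyupgrade hook from patch 5/6, given the --py310-plus flag in patch 6/6, to rewrite the remaining constructs across the whole code base.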