
Fix some bugs: #134, #136, #162 (#181)
* fix: #162 enforce utf8 config encoding

* chore: resolve #134

* fix: #136 dict string handling

* chore: resolve and raise `FutureWarning`s
lkstrp authored Jul 23, 2024
1 parent 156b9b9 commit 8885715
Showing 8 changed files with 54 additions and 26 deletions.
2 changes: 1 addition & 1 deletion powerplantmatching/cleaning.py
@@ -405,7 +405,7 @@ def aggregate_units(
         for arg in used_deprecated_args:
             kwargs.pop(arg)
         msg = "The following arguments were deprecated and are being ignored: "
-        logger.warn(msg + f"{used_deprecated_args}")
+        logger.warning(msg + f"{used_deprecated_args}")

     df = get_obj_if_Acc(df)

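Most hunks in this commit make the same substitution: `Logger.warn` is a long-deprecated alias of `Logger.warning` in the standard library, and on current Python versions calling it emits a `DeprecationWarning` before forwarding to `warning`. A minimal sketch of the difference (the logger name is illustrative):

    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("example")  # illustrative name

    logger.warning("preferred spelling")   # the supported API
    logger.warn("deprecated spelling")     # alias; emits a DeprecationWarning
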
12 changes: 6 additions & 6 deletions powerplantmatching/collection.py
@@ -29,7 +29,7 @@
 from .matching import combine_multiple_datasets, reduce_matched_dataframe
 from .utils import (
     parmap,
-    projectID_to_dict,
+    parse_string_to_dict,
     set_column_name,
     to_dict_if_string,
 )
@@ -113,7 +113,7 @@ def df_by_name(name):
         df = pd.read_csv(
             outfn_matched, index_col=0, header=[0, 1], low_memory=False
         )
-        return df.pipe(projectID_to_dict)
+        return df.pipe(parse_string_to_dict, ["projectID", "EIC"])


def powerplants(
@@ -181,9 +181,9 @@ def powerplants(
     used_deprecated_args = deprecated_args.intersection(collection_kwargs.keys())
     if used_deprecated_args:
         msg = "The following arguments were deprecated and are being ignored: "
-        logger.warn(msg + f"{used_deprecated_args}")
+        logger.warning(msg + f"{used_deprecated_args}")
     if extendby_kwargs:
-        logger.warn(
+        logger.warning(
             DeprecationWarning,
             "`extendby_kwargs` is deprecated in the favor of extend_by_kwargs",
         )
@@ -204,7 +204,7 @@
         logger.info(f"Retrieving data from {url}")
         df = (
             pd.read_csv(url, index_col=0)
-            .pipe(projectID_to_dict)
+            .pipe(parse_string_to_dict, ["projectID", "EIC"])
             .pipe(set_column_name, "Matched Data")
         )
         logger.info(f"Store data at {fn}")
@@ -214,7 +214,7 @@
     if not update and os.path.exists(fn):
         df = (
             pd.read_csv(fn, index_col=0, header=header)
-            .pipe(projectID_to_dict)
+            .pipe(parse_string_to_dict, ["projectID", "EIC"])
             .pipe(set_column_name, "Matched Data")
         )
     if extend_by_vres:
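For orientation, the pattern these hunks change: the matched-data CSVs serialize the dict-valued `projectID` and `EIC` columns as strings, and the renamed helper parses them back per column after reading. A usage sketch mirroring the code above (the file name is hypothetical):

    import pandas as pd

    from powerplantmatching.utils import parse_string_to_dict, set_column_name

    df = (
        pd.read_csv("matched_data.csv", index_col=0)  # hypothetical local file
        .pipe(parse_string_to_dict, ["projectID", "EIC"])
        .pipe(set_column_name, "Matched Data")
    )
    df.projectID.iloc[0]  # now a dict, e.g. {'OPSD': [...]}, not its string repr
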
7 changes: 3 additions & 4 deletions powerplantmatching/core.py
@@ -55,8 +55,7 @@ def _data_out(fn, config):

 # Logging: General Settings
 logger = logging.getLogger(__name__)
-logging.basicConfig(level=20)
-logger.setLevel("INFO")
+logger.setLevel(logging.INFO)
 # Logging: File
 logFormatter = logging.Formatter(
     "%(asctime)s [%(threadName)-12.12s] " "[%(levelname)-5.5s] %(message)s"
@@ -100,10 +99,10 @@ def get_config(filename=None, **overrides):
     else:
         custom_config = package_config["custom_config"]

-    with open(base_config) as f:
+    with open(base_config, encoding="utf8") as f:
         config = yaml.load(f, Loader=yaml.FullLoader)
     if exists(custom_config):
-        with open(custom_config) as f:
+        with open(custom_config, encoding="utf8") as f:
             config.update(yaml.load(f, Loader=yaml.FullLoader))
     config.update(overrides)

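Two things happen in core.py. Dropping `logging.basicConfig(level=20)` means importing the package no longer configures the root logger (the recommended behaviour for libraries); only the package's own logger level is set. The `encoding="utf8"` arguments implement #162: a bare `open()` falls back to the platform's locale encoding (for example cp1252 on Windows), so a config file containing non-ASCII characters could raise a `UnicodeDecodeError`. A small sketch of the failure mode, with an illustrative file name and key:

    import yaml

    # Written as UTF-8, e.g. by a text editor or another tool:
    with open("custom.yaml", "w", encoding="utf8") as f:  # illustrative path
        f.write("display_name: Überlingen\n")  # hypothetical key, non-ASCII value

    # open("custom.yaml") with no encoding= may fail on non-UTF-8 locales;
    # pinning the encoding makes the read portable:
    with open("custom.yaml", encoding="utf8") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
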
4 changes: 2 additions & 2 deletions powerplantmatching/heuristics.py
@@ -241,7 +241,7 @@ def fill_missing_commissioning_years(df):
     df["DateIn"] = df.DateIn.fillna(df.groupby(["Country"]).DateIn.transform("mean"))
     if df.DateIn.isnull().any():
         count = len(df[df.DateIn.isnull()])
-        logger.warn(
+        logger.warning(
             f"""There are still *{count}* empty values for
             'DateIn' in the DataFrame. These should
             be either be filled manually or dropped.
@@ -621,6 +621,6 @@ def set_known_retire_years(df):
         if name_match_b.any():
             ppl_de_nuc.loc[name_match_b, "YearRetire"] = year
         else:
-            logger.warn(f"'{name}' was not found in given DataFrame.")
+            logger.warning(f"'{name}' was not found in given DataFrame.")
     df.loc[ppl_de_nuc.index, "YearRetire"] = ppl_de_nuc["YearRetire"]
     return df
6 changes: 3 additions & 3 deletions powerplantmatching/matching.py
@@ -82,7 +82,7 @@ def compare_two_datasets(dfs, labels, country_wise=True, config=None, **dukeargs
         for arg in used_deprecated_args:
             dukeargs.pop(arg)
         msg = "The following arguments were deprecated and are being ignored: "
-        logger.warn(msg + f"{used_deprecated_args}")
+        logger.warning(msg + f"{used_deprecated_args}")

     dfs = list(map(read_csv_if_string, dfs))
     if "singlematch" not in dukeargs:
@@ -150,12 +150,12 @@ def cross_matches(sets_of_pairs, labels=None):
         matches = pd.concat([matches, match_base], sort=True)

     if matches is None or matches.empty:
-        logger.warn("No matches found")
+        logger.warning("No matches found")
         return pd.DataFrame(columns=labels)

     if matches.isnull().all().any():
         cols = ", ".join(matches.columns[matches.isnull().all()])
-        logger.warn(f"No matches found for data source {cols}")
+        logger.warning(f"No matches found for data source {cols}")

     matches = matches.drop_duplicates().reset_index(drop=True)
     for label in labels:
4 changes: 2 additions & 2 deletions powerplantmatching/plot.py
@@ -44,7 +44,7 @@
     cartopy_present = False

 if not cartopy_present:
-    logger.warn("Cartopy not existent.")
+    logger.warning("Cartopy not existent.")


 def fueltype_stats(df):
@@ -524,7 +524,7 @@ def calc(n, m):
 #              .fillna(0.0))  # country (if all zero->drop!).
 #
 #     if (show_indicators or threshold >= 0.) and len(stats.columns) < 2:
-#         logger.warn('At least two objects for comparison needed when using '
+#         logger.warning('At least two objects for comparison needed when using '
 #                     '`show_indicators` or `threshold`. Arguments ignored.')
 #         show_indicators = False
 #         threshold = -1
38 changes: 30 additions & 8 deletions powerplantmatching/utils.py
@@ -19,6 +19,7 @@

 import multiprocessing
 import os
+import re
 from ast import literal_eval as liteval

 import country_converter as coco
@@ -270,18 +271,39 @@ def to_dict_if_string(s):
     return s


-def projectID_to_dict(df):
+def parse_string_to_dict(df, cols):
     """
-    Convenience function to convert string of dict to dict type
+    Convenience function to convert string of dict to dict type for specified columns.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame on which to apply the parsing
+    cols : str, list
+        Column(s) to be parsed to dict type
+
+    Returns
+    -------
+    pd.DataFrame
+        DataFrame with specified columns parsed to dict type
     """
-    if df.columns.nlevels > 1:
+    if isinstance(cols, str):
+        cols = [cols]
+
+    def _replace_and_evaluate(value):
+        # Needed to read in older files with {nan} as string
+        value = re.sub(r"\bnan\b(, )?|, \bnan\b", "", value)
+        return liteval(value)
+
+    if isinstance(df.columns, pd.MultiIndex):
         return df.assign(
-            projectID=(
-                df.projectID.stack().dropna().apply(lambda ds: liteval(ds)).unstack()
-            )
+            **{
+                col: df[col].stack().dropna().apply(_replace_and_evaluate).unstack()
+                for col in cols
+            }
         )
     else:
-        return df.assign(projectID=df.projectID.apply(lambda x: liteval(x)))
+        return df.assign(**{col: df[col].apply(_replace_and_evaluate) for col in cols})


def select_by_projectID(df, projectID, dataset_name=None):
@@ -563,7 +585,7 @@ def parse_Geoposition(
             exactly_one=True,
         )
     except geopy.exc.GeocoderQueryError as e:
-        logger.warn(e)
+        logger.warning(e)
         gdata = None

     if gdata is not None:
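The new `_replace_and_evaluate` helper exists because files written by older versions can contain a literal `nan` token inside the serialized dict strings (the `{nan}` case the code comment mentions), which `ast.literal_eval` refuses to parse. A round-trip illustration of the regex, using a made-up projectID value:

    import re
    from ast import literal_eval

    PATTERN = r"\bnan\b(, )?|, \bnan\b"

    for raw in ["{nan}", "{'OPSD': ['OEU0123'], nan}"]:  # 'OEU0123' is made up
        cleaned = re.sub(PATTERN, "", raw)
        print(cleaned, "->", literal_eval(cleaned))
    # prints:
    # {} -> {}
    # {'OPSD': ['OEU0123']} -> {'OPSD': ['OEU0123']}
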
7 changes: 7 additions & 0 deletions pyproject.toml
@@ -14,4 +14,11 @@ select = [
     'UP', # pyupgrade
     'TID', # flake8-tidy-imports
     'NPY', # numpy
 ]
+
+# Pytest settings
+
+[tool.pytest.ini_options]
+filterwarnings = [
+    "error::FutureWarning", # Raise all FutureWarnings as errors
+]
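This ties into the last bullet of the commit message: with `error::FutureWarning`, pytest promotes every `FutureWarning` raised during the test run to an exception, so deprecated usage fails the suite instead of scrolling past in the log. A sketch of the effect on a hypothetical test:

    import warnings

    def test_example():  # hypothetical test; fails under error::FutureWarning
        warnings.warn("this call will change in the next release", FutureWarning)
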
