compute feature importances for each community
mikivee committed Oct 23, 2024
1 parent fbe54f3 commit 10719c7
Showing 2 changed files with 114 additions and 56 deletions.
135 changes: 79 additions & 56 deletions notebooks/Feature Importance Analysis.py
@@ -3,23 +3,28 @@

# COMMAND ----------

# MAGIC %pip install catboost
# MAGIC %pip install catboost shap

# COMMAND ----------

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys

import catboost as cb
import pyspark.sql.functions as F
from sklearn.model_selection import train_test_split
import shap
from cloudpathlib import CloudPath

sys.path.append("../src")
from src.util import save_figure_to_gcfs

# COMMAND ----------

# # Paths
# FIGPATH = G.EXPORT_FPATH / "sumo_feature_importances" / "figures"
FIGPATH = CloudPath("gs://the-cube") / "export" / "sumo_feature_importances"

# ML crap
TEST_SIZE = 0.2
@@ -67,7 +72,7 @@ def get_catboost_shap_values(X_train, y_train, X_test, y_test, categorical_features):
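The body of get_catboost_shap_values is collapsed in this view. Judging from its call sites below — it takes the train/test splits plus the categorical feature names and returns a SHAP Explanation object along with a model score used as R^2 in the figure titles — a minimal sketch could look like the following; the hyperparameters shown are assumptions, not the committed values.

def get_catboost_shap_values(X_train, y_train, X_test, y_test, categorical_features):
    # fit a CatBoost regressor, declaring the categorical columns by name
    model = cb.CatBoostRegressor(random_state=RANDOM_STATE, verbose=0)
    model.fit(X_train, y_train, cat_features=categorical_features, eval_set=(X_test, y_test))
    # R^2 on the held-out test split
    model_score = model.score(X_test, y_test)
    # SHAP's tree explainer supports CatBoost models directly
    explainer = shap.Explainer(model)
    shap_values = explainer(X_test)
    return shap_values, model_score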

# COMMAND ----------

def plot_shap_values(shap_values, fig_title, n_feature_display=None, save_figure=False):
def plot_shap_values(shap_values, fig_title, gcspath, n_feature_display=None):
"""Created and saves a bar plot of SHAP values
Args:
@@ -83,32 +88,45 @@ def plot_shap_values(shap_values, fig_title, n_feature_display=None, save_figure=False):
shap.plots.bar(shap_values, max_display=n_feature_display, show=False)
plt.title(fig_title)
fig = plt.gcf()
fig.set_size_inches(9, 7)
fig.set_size_inches(9, 9)
fig.tight_layout()
# util.save_figure_to_gcfs(fig=fig, gcspath=gcspath, dpi=200, transparent=True)
save_figure_to_gcfs(fig=fig, gcspath=gcspath, dpi=200, transparent=True)

# COMMAND ----------

features = spark.table('ml.surrogate_model.building_features').where(F.col('upgrade_id')==0)
outputs = spark.table('ml.surrogate_model.building_simulation_outputs_annual')

# COMMAND ----------
communities = [
('Pittsburgh', ['42003', '42007']),
('Rhode Island', ['44001', '44003', '44005', '44007', '44009']),
('Atlanta', ['13121', '13089', '13067', '13135', '13063']),
('Denver', ['08001', '08031', '08059', '08005', '08013', '08014', '08035']),
]

target = 'site_energy__total'
# Transform the (city, county FIPS) pairs into a dict of NHGIS-style GISJOIN county codes ("G" + state FIPS + "0" + county FIPS + "0")
communities_dict = {
city: [f"G{geoid[:2]}0{geoid[2:]}0" for geoid in geoids]
for city, geoids in communities
}

from pyspark.sql.window import Window
gisjoin_to_community = {gisjoin: community for community, gisjoins in communities_dict.items() for gisjoin in gisjoins}
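As a concrete check of the conversion (the construction matches the NHGIS GISJOIN county format, which is presumably how the county column is coded):

# e.g. Pittsburgh's Allegheny County, PA: FIPS '42003' -> GISJOIN 'G4200030'
assert f"G{'42003'[:2]}0{'42003'[2:]}0" == "G4200030"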

w = Window.partitionBy('building_id').orderBy(F.col('upgrade_id').asc())
# COMMAND ----------

metadata = spark.table('ml.surrogate_model.building_metadata')
features = spark.table('ml.surrogate_model.building_features').where(F.col('upgrade_id')==0)
outputs_savings = (
outputs
.where(F.col('upgrade_id').isin([0, 11.05]))
.withColumn(target, F.first(target).over(w) - F.col(target))
.where(F.col('upgrade_id')==11.05)
spark.table('building_model.resstock_annual_savings_by_upgrade_enduse_fuel')
.where(F.col('end_use')=='total')
.where(F.col('upgrade_id')=='11.05')
.groupby('building_id')
.agg(F.sum('kwh_delta').alias('kwh_delta'),
F.sum('cost_delta').alias('cost_delta'))
)
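Here upgrade 11.05 appears to be the medium-efficiency heat pump upgrade (the figure filenames below use the tag medhp), so kwh_delta and cost_delta are per-building annual energy and cost savings for that upgrade.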

# COMMAND ----------

target = 'cost_delta'


pkey_features = [
'upgrade_id',
'building_id'
@@ -121,66 +139,71 @@ def plot_shap_values(shap_values, fig_title, n_feature_display=None, save_figure=False):
'weather_file_city_index'
]

upgrade_features = [
other_appliance_features = [
'heat_pump_sizing_methodology',
'has_heat_pump_dryer',
'has_induction_range',
'has_methane_gas_appliance',
'has_fuel_oil_appliance',
'has_propane_appliance'
'has_propane_appliance',
'cooking_range_fuel',
'clothes_dryer_fuel',
]

additional_metadata = [
'ashrae_iecc_climate_zone_2004',
'federal_poverty_level',
'geometry_building_type_acs',
'income',
'tenure',
'county'  # for selecting within communities
]

df_predict = (
features
.join(outputs_savings.select('building_id', target), on = 'building_id')
.drop(*pkey_features, *location_features, *upgrade_features)
.join(metadata.select('building_id', *additional_metadata), on = 'building_id')
.drop(*pkey_features, *location_features, *other_appliance_features)
.where(F.col('heating_appliance_type') != 'ASHP')
.where(~F.col('is_mobile_home'))
).toPandas()

categorical_features = [k for k,v in df_predict.dtypes.items() if v == 'object']
numeric_features = [k for k,v in df_predict.dtypes.items() if v != 'object' and k != target]

# COMMAND ----------

# -- Building Features and Responses -- #
# separate categorical and numerical vars and cast to appropriate types
X_categorical = df_predict[categorical_features].astype(str)
X_numerical = df_predict[numeric_features].astype(float)
X = pd.concat([X_categorical, X_numerical], axis=1)
y = df_predict[target].astype(int)
# split into test/train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# -- Train Model & Get Feature Importances -- #
shap_values, model_score = get_catboost_shap_values(
X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, categorical_features=categorical_features
)

# -- Output Plot of Feature Importances (Mean SHAP Values) -- #
plot_shap_values(
shap_values=shap_values,
fig_title=f"All Feature Importances {target}: R^2 = {model_score:.3f}",
n_feature_display=20
# gcspath=FIGPATH / f"mean_shap_values_{climate_zone}_{response}.png",
)
plt.show()

df_predict['community'] = df_predict['county'].map(gisjoin_to_community)

# COMMAND ----------

# -- Output Plot of Feature Importances (Mean SHAP Values) -- #
plot_shap_values(
shap_values=shap_values,
fig_title=f"All Feature Importances {target}: R^2 = {model_score:.3f}",
n_feature_display=30
# gcspath=FIGPATH / f"mean_shap_values_{climate_zone}_{response}.png",
)
plt.show()
def fit_and_plot_shap_values(df, categorical_features, numeric_features, target, title_detail):
# -- Building Features and Responses -- #
# separate categorical and numerical vars and cast to appropriate types
X_categorical = df[categorical_features].astype(str)
X_numerical = df[numeric_features].astype(float)
X = pd.concat([X_categorical, X_numerical], axis=1)
y = df[target].astype(int)
# split into test/train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# -- Train Model & Get Feature Importances -- #
shap_values, model_score = get_catboost_shap_values(
X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, categorical_features=categorical_features
)

# -- Output Plot of Feature Importances (Mean SHAP Values) -- #
plot_shap_values(
shap_values=shap_values,
fig_title=f"Feature Importances for HP Savings ({title_detail}): R^2 = {model_score:.3f}",
n_feature_display=12,
gcspath=FIGPATH / f"mean_shap_values_cost_savings_medhp_{'_'.join(title_detail.lower().split())}.png",
)
plt.show()

# COMMAND ----------

for community_name, df_community in df_predict.groupby('community'):
fit_and_plot_shap_values(df_community, categorical_features, numeric_features, target, community_name)

fit_and_plot_shap_values(df_predict.sample(frac=.3), categorical_features, numeric_features, target, "National")
35 changes: 35 additions & 0 deletions src/util.py
@@ -3,6 +3,8 @@
from cloudpathlib import CloudPath
import collections
from functools import reduce, partial
from google.cloud import storage
import io
import os
import re
from typing import List, Dict
@@ -297,3 +299,36 @@ def get_clean_rastock_df() -> DataFrame:
rastock_df = clean_bsb_output_cols(rastock_df)
rastock_df = convert_column_units(rastock_df)
return rastock_df


# already in dmutils
def save_figure_to_gcfs(fig, gcspath, figure_format="png", dpi=200, transparent=False):
"""
Write out a figure to google cloud storage
Args:
fig (matplotlib.figure.Figure): figure object to write out
gcspath (cloudpathlib.gs.gspath.GSPath): filepath to write to in GCFS
figure_format (str): file format in ['pdf', 'svg', 'png', 'jpg']. Defaults to 'png'.
dpi (int): resolution in dots per inch. Only relevant if format non-vector ('png', 'jpg'). Defaults to 200.
transparent (bool): whether to save the figure with a transparent background. Defaults to False.
Returns:
None
Modified from source:
https://stackoverflow.com/questions/54223769/writing-figure-to-google-cloud-storage-instead-of-local-drive
"""
supported_formats = ["pdf", "svg", "png", "jpg"]
if figure_format not in supported_formats:
raise ValueError(f"Please pass supported format in {supported_formats}")

# Save figure image to a bytes buffer
buf = io.BytesIO()
fig.savefig(buf, format=figure_format, dpi=dpi, transparent=transparent)

# init GCS client and upload buffer contents
client = storage.Client()
bucket = client.get_bucket(gcspath.bucket)
blob = bucket.blob(gcspath.blob)
# use a real MIME type for the blob rather than the bare format string
mime_types = {"pdf": "application/pdf", "svg": "image/svg+xml", "png": "image/png", "jpg": "image/jpeg"}
blob.upload_from_file(buf, content_type=mime_types[figure_format], rewind=True)
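A usage sketch matching the notebook above (the filename here is a placeholder, not from the commit):

from cloudpathlib import CloudPath
import matplotlib.pyplot as plt

fig = plt.gcf()
save_figure_to_gcfs(
    fig=fig,
    gcspath=CloudPath("gs://the-cube") / "export" / "sumo_feature_importances" / "example.png",
    dpi=200,
    transparent=True,
)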
