compute feature importances for each community
mikivee committed Oct 23, 2024
1 parent fbe54f3 commit 10719c7
Showing 2 changed files with 114 additions and 56 deletions.
135 changes: 79 additions & 56 deletions notebooks/Feature Importance Analysis.py
@@ -3,23 +3,28 @@

# COMMAND ----------

# MAGIC %pip install catboost
# MAGIC %pip install catboost shap

# COMMAND ----------

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys

import catboost as cb
import pyspark.sql.functions as F
from sklearn.model_selection import train_test_split
import shap
from cloudpathlib import CloudPath

sys.path.append("../src")
from src.util import save_figure_to_gcfs

# COMMAND ----------

# # Paths
# FIGPATH = G.EXPORT_FPATH / "sumo_feature_importances" / "figures"
FIGPATH = CloudPath("gs://the-cube") / "export" / "sumo_feature_importances"

# ML crap
TEST_SIZE = 0.2
@@ -67,7 +72,7 @@ def get_catboost_shap_values(X_train, y_train, X_test, y_test, categorical_features):
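The body of get_catboost_shap_values is collapsed in this view. Judging from its call sites below — it takes the train/test splits plus the categorical feature names and returns a SHAP Explanation object along with a model score used as R^2 in the figure titles — a minimal sketch could look like the following; the hyperparameters shown are assumptions, not the committed values.

def get_catboost_shap_values(X_train, y_train, X_test, y_test, categorical_features):
    # fit a CatBoost regressor, declaring the categorical columns by name
    model = cb.CatBoostRegressor(random_state=RANDOM_STATE, verbose=0)
    model.fit(X_train, y_train, cat_features=categorical_features, eval_set=(X_test, y_test))
    # R^2 on the held-out test split
    model_score = model.score(X_test, y_test)
    # SHAP's tree explainer supports CatBoost models directly
    explainer = shap.Explainer(model)
    shap_values = explainer(X_test)
    return shap_values, model_score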

# COMMAND ----------

def plot_shap_values(shap_values, fig_title, n_feature_display=None, save_figure=False):
def plot_shap_values(shap_values, fig_title, gcspath, n_feature_display=None):
"""Created and saves a bar plot of SHAP values
Args:
@@ -83,32 +88,45 @@ def plot_shap_values(shap_values, fig_title, n_feature_display=None, save_figure=False):
shap.plots.bar(shap_values, max_display=n_feature_display, show=False)
plt.title(fig_title)
fig = plt.gcf()
fig.set_size_inches(9, 7)
fig.set_size_inches(9, 9)
fig.tight_layout()
# util.save_figure_to_gcfs(fig=fig, gcspath=gcspath, dpi=200, transparent=True)
save_figure_to_gcfs(fig=fig, gcspath=gcspath, dpi=200, transparent=True)

# COMMAND ----------

features = spark.table('ml.surrogate_model.building_features').where(F.col('upgrade_id')==0)
outputs = spark.table('ml.surrogate_model.building_simulation_outputs_annual')

# COMMAND ----------
communities = [
('Pittsburgh', ['42003', '42007']),
('Rhode Island', ['44001', '44003', '44005', '44007', '44009']),
('Atlanta', ['13121', '13089', '13067', '13135', '13063']),
('Denver', ['08001', '08031', '08059', '08005', '08013', '08014', '08035']),
]

target = 'site_energy__total'
# Transform the (city, county FIPS) pairs into a dict of NHGIS-style GISJOIN county codes ("G" + state FIPS + "0" + county FIPS + "0")
communities_dict = {
city: [f"G{geoid[:2]}0{geoid[2:]}0" for geoid in geoids]
for city, geoids in communities
}

from pyspark.sql.window import Window
gisjoin_to_community = {gisjoin: community for community, gisjoins in communities_dict.items() for gisjoin in gisjoins}
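As a concrete check of the conversion (the construction matches the NHGIS GISJOIN county format, which is presumably how the county column is coded):

# e.g. Pittsburgh's Allegheny County, PA: FIPS '42003' -> GISJOIN 'G4200030'
assert f"G{'42003'[:2]}0{'42003'[2:]}0" == "G4200030"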

w = Window.partitionBy('building_id').orderBy(F.col('upgrade_id').asc())
# COMMAND ----------

metadata = spark.table('ml.surrogate_model.building_metadata')
features = spark.table('ml.surrogate_model.building_features').where(F.col('upgrade_id')==0)
outputs_savings = (
outputs
.where(F.col('upgrade_id').isin([0, 11.05]))
.withColumn(target, F.first(target).over(w) - F.col(target))
.where(F.col('upgrade_id')==11.05)
spark.table('building_model.resstock_annual_savings_by_upgrade_enduse_fuel')
.where(F.col('end_use')=='total')
.where(F.col('upgrade_id')=='11.05')
.groupby('building_id')
.agg(F.sum('kwh_delta').alias('kwh_delta'),
F.sum('cost_delta').alias('cost_delta'))
)
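Here upgrade 11.05 appears to be the medium-efficiency heat pump upgrade (the figure filenames below use the tag medhp), so kwh_delta and cost_delta are per-building annual energy and cost savings for that upgrade.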

# COMMAND ----------

target = 'cost_delta'


pkey_features = [
'upgrade_id',
'building_id'
@@ -121,66 +139,71 @@ def plot_shap_values(shap_values, fig_title, n_feature_display=None, save_figure=False):
'weather_file_city_index'
]

upgrade_features = [
other_appliance_features = [
'heat_pump_sizing_methodology',
'has_heat_pump_dryer',
'has_induction_range',
'has_methane_gas_appliance',
'has_fuel_oil_appliance',
'has_propane_appliance'
'has_propane_appliance',
'cooking_range_fuel',
'clothes_dryer_fuel',
]

additional_metadata = [
'ashrae_iecc_climate_zone_2004',
'federal_poverty_level',
'geometry_building_type_acs',
'income',
'tenure',
'county'  # for selecting within communities
]

df_predict = (
features
.join(outputs_savings.select('building_id', target), on = 'building_id')
.drop(*pkey_features, *location_features, *upgrade_features)
.join(metadata.select('building_id', *additional_metadata), on = 'building_id')
.drop(*pkey_features, *location_features, *other_appliance_features)
.where(F.col('heating_appliance_type') != 'ASHP')
.where(~F.col('is_mobile_home'))
).toPandas()

categorical_features = [k for k,v in df_predict.dtypes.items() if v == 'object']
numeric_features = [k for k,v in df_predict.dtypes.items() if v != 'object' and k != target]

# COMMAND ----------

# -- Building Features and Responses -- #
# separate categorical and numerical vars and cast to appropriate types
X_categorical = df_predict[categorical_features].astype(str)
X_numerical = df_predict[numeric_features].astype(float)
X = pd.concat([X_categorical, X_numerical], axis=1)
y = df_predict[target].astype(int)
# split into test/train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# -- Train Model & Get Feature Importances -- #
shap_values, model_score = get_catboost_shap_values(
X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, categorical_features=categorical_features
)

# -- Output Plot of Feature Importances (Mean SHAP Values) -- #
plot_shap_values(
shap_values=shap_values,
fig_title=f"All Feature Importances {target}: R^2 = {model_score:.3f}",
n_feature_display=20
# gcspath=FIGPATH / f"mean_shap_values_{climate_zone}_{response}.png",
)
plt.show()

df_predict['community'] = df_predict['county'].map(gisjoin_to_community)

# COMMAND ----------

# -- Output Plot of Feature Importances (Mean SHAP Values) -- #
plot_shap_values(
shap_values=shap_values,
fig_title=f"All Feature Importances {target}: R^2 = {model_score:.3f}",
n_feature_display=30
# gcspath=FIGPATH / f"mean_shap_values_{climate_zone}_{response}.png",
)
plt.show()
def fit_and_plot_shap_values(df, categorical_features, numeric_features, target, title_detail):
# -- Building Features and Responses -- #
# separate categorical and numerical vars and cast to appropriate types
X_categorical = df[categorical_features].astype(str)
X_numerical = df[numeric_features].astype(float)
X = pd.concat([X_categorical, X_numerical], axis=1)
y = df[target].astype(int)
# split into test/train sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# -- Train Model & Get Feature Importances -- #
shap_values, model_score = get_catboost_shap_values(
X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, categorical_features=categorical_features
)

# -- Output Plot of Feature Importances (Mean SHAP Values) -- #
plot_shap_values(
shap_values=shap_values,
fig_title=f"Feature Importances for HP Savings ({title_detail}): R^2 = {model_score:.3f}",
n_feature_display=12,
gcspath=FIGPATH / f"mean_shap_values_cost_savings_medhp_{'_'.join(title_detail.lower().split())}.png",
)
plt.show()

# COMMAND ----------

for community_name, df_community in df_predict.groupby('community'):
fit_and_plot_shap_values(df_community, categorical_features, numeric_features, target, community_name)

fit_and_plot_shap_values(df_predict.sample(frac=.3), categorical_features, numeric_features, target, "National")
35 changes: 35 additions & 0 deletions src/util.py
@@ -3,6 +3,8 @@
from cloudpathlib import CloudPath
import collections
from functools import reduce, partial
from google.cloud import storage
import io
import os
import re
from typing import List, Dict
@@ -297,3 +299,36 @@ def get_clean_rastock_df() -> DataFrame:
rastock_df = clean_bsb_output_cols(rastock_df)
rastock_df = convert_column_units(rastock_df)
return rastock_df


# already in dmutils
def save_figure_to_gcfs(fig, gcspath, figure_format="png", dpi=200, transparent=False):
"""
Write out a figure to google cloud storage
Args:
fig (matplotlib.figure.Figure): figure object to write out
gcspath (cloudpathlib.gs.gspath.GSPath): filepath to write to in GCFS
figure_format (str): file format in ['pdf', 'svg', 'png', 'jpg']. Defaults to 'png'.
dpi (int): resolution in dots per inch. Only relevant if format non-vector ('png', 'jpg'). Defaults to 200.
transparent (bool): whether to save the figure with a transparent background. Defaults to False.
Returns:
None
Modified from source:
https://stackoverflow.com/questions/54223769/writing-figure-to-google-cloud-storage-instead-of-local-drive
"""
supported_formats = ["pdf", "svg", "png", "jpg"]
if figure_format not in supported_formats:
raise ValueError(f"Please pass supported format in {supported_formats}")

# Save figure image to a bytes buffer
buf = io.BytesIO()
fig.savefig(buf, format=figure_format, dpi=dpi, transparent=transparent)

# init GCS client and upload buffer contents
client = storage.Client()
bucket = client.get_bucket(gcspath.bucket)
blob = bucket.blob(gcspath.blob)
# use a real MIME type for the blob rather than the bare format string
mime_types = {"pdf": "application/pdf", "svg": "image/svg+xml", "png": "image/png", "jpg": "image/jpeg"}
blob.upload_from_file(buf, content_type=mime_types[figure_format], rewind=True)
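A usage sketch matching the notebook above (the filename here is a placeholder, not from the commit):

from cloudpathlib import CloudPath
import matplotlib.pyplot as plt

fig = plt.gcf()
save_figure_to_gcfs(
    fig=fig,
    gcspath=CloudPath("gs://the-cube") / "export" / "sumo_feature_importances" / "example.png",
    dpi=200,
    transparent=True,
)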
