From 25e909320fd6054e9c341132c5ee61157ebd48c2 Mon Sep 17 00:00:00 2001 From: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com> Date: Thu, 31 Aug 2023 14:00:14 +0200 Subject: [PATCH] Introduce non-linear feature engineering in more than 1D (#696) --- python_scripts/linear_models_ex_03.py | 58 ++++++---- python_scripts/linear_models_sol_03.py | 102 ++++++++++++------ .../linear_regression_non_linear_link.py | 20 +++- 3 files changed, 127 insertions(+), 53 deletions(-) diff --git a/python_scripts/linear_models_ex_03.py b/python_scripts/linear_models_ex_03.py index 07ca53ac7..3ab6949a3 100644 --- a/python_scripts/linear_models_ex_03.py +++ b/python_scripts/linear_models_ex_03.py @@ -14,17 +14,24 @@ # %% [markdown] # # 📝 Exercise M4.03 # -# In all previous notebooks, we only used a single feature in `data`. But we -# have already shown that we could add new features to make the model more -# expressive by deriving new features, based on the original feature. +# In the previous notebook, we showed that we can add new features based on the +# original feature to make the model more expressive, for instance `x ** 2` or `x ** 3`. +# In that case we only used a single feature in `data`. # # The aim of this notebook is to train a linear regression algorithm on a -# dataset with more than a single feature. +# dataset with more than a single feature. In such a "multi-dimensional" feature +# space we can derive new features of the form `x1 * x2`, `x2 * x3`, +# etc. Products of features are usually called "non-linear or +# multiplicative interactions" between features. # -# We will load a dataset about house prices in California. The dataset consists -# of 8 features regarding the demography and geography of districts in -# California and the aim is to predict the median house price of each district. -# We will use all 8 features to predict the target, the median house price. 
+# Feature engineering can be an important step of a model pipeline as long as +# the new features are expected to be predictive. For instance, think of a +# classification model to decide if a patient has risk of developing a heart +# disease. This would depend on the patient's Body Mass Index which is defined +# as `weight / height ** 2`. +# +# We load the penguins dataset. We first use a set of 3 numerical +# features to predict the target, i.e. the body mass of the penguin. # %% [markdown] # ```{note} @@ -33,10 +40,18 @@ # ``` # %% -from sklearn.datasets import fetch_california_housing +import pandas as pd + +penguins = pd.read_csv("../datasets/penguins.csv") + +columns = ["Flipper Length (mm)", "Culmen Length (mm)", "Culmen Depth (mm)"] +target_name = "Body Mass (g)" -data, target = fetch_california_housing(as_frame=True, return_X_y=True) -target *= 100 # rescale the target in k$ +# Remove lines with missing values for the columns of interest +penguins_non_missing = penguins[columns + [target_name]].dropna() + +data = penguins_non_missing[columns] +target = penguins_non_missing[target_name] data.head() # %% [markdown] @@ -48,24 +63,31 @@ # %% [markdown] # Execute a cross-validation with 10 folds and use the mean absolute error (MAE) -# as metric. Be sure to *return* the fitted *estimators*. +# as metric. # %% # Write your code here. # %% [markdown] -# Compute the mean and std of the MAE in thousands of dollars (k$). +# Compute the mean and std of the MAE in grams (g). # %% # Write your code here. # %% [markdown] -# Inspect the fitted model using a box plot to show the distribution of values -# for the coefficients returned from the cross-validation. Hint: use the -# function -# [`df.plot.box()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.box.html) -# to create a box plot. +# Now create a pipeline using `make_pipeline` consisting of a +# `PolynomialFeatures` and a linear regression. 
Set `degree=2` and +# `interaction_only=True` to the feature engineering step. Remember not to +# include the bias to avoid redundancies with the linear regression's intercept. +# +# Use the same strategy as before to cross-validate such a pipeline. +# %% +# Write your code here. + +# %% [markdown] +# Compute the mean and std of the MAE in grams (g) and compare with the results +# without feature engineering. # %% # Write your code here. diff --git a/python_scripts/linear_models_sol_03.py b/python_scripts/linear_models_sol_03.py index 7fadc8468..0cacfcf0d 100644 --- a/python_scripts/linear_models_sol_03.py +++ b/python_scripts/linear_models_sol_03.py @@ -8,17 +8,24 @@ # %% [markdown] # # 📃 Solution for Exercise M4.03 # -# In all previous notebooks, we only used a single feature in `data`. But we -# have already shown that we could add new features to make the model more -# expressive by deriving new features, based on the original feature. +# In the previous notebook, we showed that we can add new features based on the +# original feature to make the model more expressive, for instance `x ** 2` or `x ** 3`. +# In that case we only used a single feature in `data`. # # The aim of this notebook is to train a linear regression algorithm on a -# dataset with more than a single feature. +# dataset with more than a single feature. In such a "multi-dimensional" feature +# space we can derive new features of the form `x1 * x2`, `x2 * x3`, +# etc. Products of features are usually called "non-linear or +# multiplicative interactions" between features. # -# We will load a dataset about house prices in California. The dataset consists -# of 8 features regarding the demography and geography of districts in -# California and the aim is to predict the median house price of each district. -# We will use all 8 features to predict the target, the median house price. 
+# Feature engineering can be an important step of a model pipeline as long as +# the new features are expected to be predictive. For instance, think of a +# classification model to decide if a patient has risk of developing a heart +# disease. This would depend on the patient's Body Mass Index which is defined +# as `weight / height ** 2`. +# +# We load the penguins dataset. We first use a set of 3 numerical +# features to predict the target, i.e. the body mass of the penguin. # %% [markdown] # ```{note} @@ -27,10 +34,18 @@ # ``` # %% -from sklearn.datasets import fetch_california_housing +import pandas as pd + +penguins = pd.read_csv("../datasets/penguins.csv") + +columns = ["Flipper Length (mm)", "Culmen Length (mm)", "Culmen Depth (mm)"] +target_name = "Body Mass (g)" -data, target = fetch_california_housing(as_frame=True, return_X_y=True) -target *= 100 # rescale the target in k$ +# Remove lines with missing values for the columns of interest +penguins_non_missing = penguins[columns + [target_name]].dropna() + +data = penguins_non_missing[columns] +target = penguins_non_missing[target_name] data.head() # %% [markdown] @@ -45,7 +60,7 @@ # %% [markdown] # Execute a cross-validation with 10 folds and use the mean absolute error (MAE) -# as metric. Be sure to *return* the fitted *estimators*. +# as metric. # %% # solution @@ -55,42 +70,65 @@ linear_regression, data, target, - scoring="neg_mean_absolute_error", - return_estimator=True, cv=10, + scoring="neg_mean_absolute_error", n_jobs=2, ) # %% [markdown] -# Compute the mean and std of the MAE in thousands of dollars (k$). +# Compute the mean and std of the MAE in grams (g). 
# %% # solution print( - "Mean absolute error on testing set: " - f"{-cv_results['test_score'].mean():.3f} k$ ± " - f"{cv_results['test_score'].std():.3f}" + "Mean absolute error on testing set with original features: " + f"{-cv_results['test_score'].mean():.3f} ± " + f"{cv_results['test_score'].std():.3f} g" ) # %% [markdown] -# Inspect the fitted model using a box plot to show the distribution of values -# for the coefficients returned from the cross-validation. Hint: use the -# function -# [`df.plot.box()`](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.box.html) -# to create a box plot. - +# Now create a pipeline using `make_pipeline` consisting of a +# `PolynomialFeatures` and a linear regression. Set `degree=2` and +# `interaction_only=True` to the feature engineering step. Remember not to +# include the bias to avoid redundancies with the linear regression's intercept. +# +# Use the same strategy as before to cross-validate such a pipeline. # %% # solution -import pandas as pd +from sklearn.preprocessing import PolynomialFeatures +from sklearn.pipeline import make_pipeline -weights = pd.DataFrame( - [est.coef_ for est in cv_results["estimator"]], columns=data.columns +poly_features = PolynomialFeatures( + degree=2, include_bias=False, interaction_only=True +) +linear_regression_interactions = make_pipeline( + poly_features, linear_regression ) -# %% tags=["solution"] -import matplotlib.pyplot as plt +cv_results = cross_validate( + linear_regression_interactions, + data, + target, + cv=10, + scoring="neg_mean_absolute_error", + n_jobs=2, +) + +# %% [markdown] +# Compute the mean and std of the MAE in grams (g) and compare with the results +# without feature engineering. 
+ +# %% +# solution +print( + "Mean absolute error on testing set with interactions: " + f"{-cv_results['test_score'].mean():.3f} ± " + f"{cv_results['test_score'].std():.3f} g" +) -color = {"whiskers": "black", "medians": "black", "caps": "black"} -weights.plot.box(color=color, vert=False) -_ = plt.title("Value of linear regression coefficients") +# %% [markdown] tags=["solution"] +# We observe that the mean absolute error is lower and less spread with the +# enriched features. In this case the "interactions" are indeed predictive. In +# the following notebook we will see what happens when the enriched features are +# non-predictive and how to deal with this case. diff --git a/python_scripts/linear_regression_non_linear_link.py b/python_scripts/linear_regression_non_linear_link.py index 2fc6699ac..9e72fb49e 100644 --- a/python_scripts/linear_regression_non_linear_link.py +++ b/python_scripts/linear_regression_non_linear_link.py @@ -247,9 +247,9 @@ # line. `SVR(kernel="linear")` is indeed yet another example of a linear model. # # The estimator can also be configured to use a non-linear kernel. Then, it can -# learn a prediction function that computes non-linear interaction between -# samples for which we want to make a prediction and selected samples from the -# training set. +# learn a prediction function that computes non-linear relations between samples +# for which we want to make a prediction and selected samples from the training +# set. # # The result is another kind of non-linear regression model with a similar # expressivity as our previous polynomial regression pipeline: @@ -315,3 +315,17 @@ ) ax.plot(data, target_predicted) _ = ax.set_title(f"Mean squared error = {mse:.2f}") + +# %% [markdown] +# ## Notebook Recap +# +# In this notebook we explored several ways to expand a single numerical feature +# into several non-linearly derived new features. 
This makes our machine +# learning pipeline more expressive and less likely to underfit, even if the +# last stage of the pipeline is a simple linear regression model. + +# For the sake of simplicity, we introduced those transformers on a toy +# regression problem with a single input feature. However, non-linear feature +# transformers such as Nystroem can further improve the expressiveness of +# machine learning pipelines to model non-linear interactions between features. +# We will explore this possibility in the next exercise.