From 3083498054f442336e9655a852fdf44879f0e453 Mon Sep 17 00:00:00 2001
From: ArturoAmorQ
Date: Wed, 21 Feb 2024 15:11:33 +0100
Subject: [PATCH] ENH Rework narrative of GBDT notebook

---
 python_scripts/ensemble_gradient_boosting.py | 209 +++++++++----------
 1 file changed, 103 insertions(+), 106 deletions(-)

diff --git a/python_scripts/ensemble_gradient_boosting.py b/python_scripts/ensemble_gradient_boosting.py
index 874c3ed20..206d19756 100644
--- a/python_scripts/ensemble_gradient_boosting.py
+++ b/python_scripts/ensemble_gradient_boosting.py
@@ -8,26 +8,28 @@
 # %% [markdown]
 # # Gradient-boosting decision tree (GBDT)
 #
-# In this notebook, we will present the gradient boosting decision tree
-# algorithm and contrast it with AdaBoost.
+# In this notebook, we present the gradient boosting decision tree algorithm.
 #
-# Gradient-boosting differs from AdaBoost due to the following reason: instead
-# of assigning weights to specific samples, GBDT will fit a decision tree on the
-# residuals error (hence the name "gradient") of the previous tree. Therefore,
-# each new tree in the ensemble predicts the error made by the previous learner
-# instead of predicting the target directly.
+# Even though AdaBoost and GBDT are both boosting algorithms, they differ in
+# nature: the former assigns weights to specific samples, whereas GBDT fits
+# successive decision trees on the residual errors (hence the name "gradient")
+# of their preceding tree. Therefore, each new tree in the ensemble tries to
+# refine its predictions by specifically addressing the errors made by the
+# previous learner, instead of predicting the target directly.
 #
-# In this section, we will provide some intuition about the way learners are
-# combined to give the final prediction. In this regard, let's go back to our
-# regression problem which is more intuitive for demonstrating the underlying
+# In this section, we provide some intuition on how learners are combined to
+# give the final prediction. For this purpose, we tackle a single-feature
+# regression problem, which is more intuitive for demonstrating the underlying
 # machinery.
+#
+# Later in this notebook we compare the performance of GBDT (boosting) with
+# that of a random forest (bagging) on a particular dataset.

 # %%
 import pandas as pd
 import numpy as np

-# Create a random number generator that will be used to set the randomness
-rng = np.random.RandomState(0)
+rng = np.random.RandomState(0)  # Create a random number generator


 def generate_data(n_samples=50):
@@ -60,9 +62,9 @@ def generate_data(n_samples=50):
 _ = plt.title("Synthetic regression dataset")

 # %% [markdown]
-# As we previously discussed, boosting will be based on assembling a sequence of
-# learners. We will start by creating a decision tree regressor. We will set the
-# depth of the tree so that the resulting learner will underfit the data.
+# As we previously discussed, boosting is based on assembling a sequence of
+# learners. We start by creating a decision tree regressor. We set the depth of
+# the tree to underfit the data on purpose.

 # %%
 from sklearn.tree import DecisionTreeRegressor
@@ -74,29 +76,45 @@ def generate_data(n_samples=50):
 target_test_predicted = tree.predict(data_test)

 # %% [markdown]
-# Using the term "test" here refers to data that was not used for training. It
-# should not be confused with data coming from a train-test split, as it was
-# generated in equally-spaced intervals for the visual evaluation of the
-# predictions.
+# The term "test" here refers to data not used for training. It should not be
+# confused with data coming from a train-test split, as it was generated in
+# equally-spaced intervals for the visual evaluation of the predictions.
+#
+# To avoid writing the same code in multiple places, we define a helper
+# function to plot the data samples as well as the decision tree predictions
+# and residuals.
+

 # %%
-# plot the data
-sns.scatterplot(
-    x=data_train["Feature"], y=target_train, color="black", alpha=0.5
-)
-# plot the predictions
-line_predictions = plt.plot(data_test["Feature"], target_test_predicted, "--")
+def plot_decision_tree_with_residuals(y_train, y_train_pred, y_test_pred):
+    # Create a plot and get the Axes object
+    fig, ax = plt.subplots()
+    # plot the data
+    sns.scatterplot(
+        x=data_train["Feature"], y=y_train, color="black", alpha=0.5, ax=ax
+    )
+    # plot the predictions
+    line_predictions = ax.plot(data_test["Feature"], y_test_pred, "--")
+
+    # plot the residuals
+    for value, true, predicted in zip(
+        data_train["Feature"], y_train, y_train_pred
+    ):
+        lines_residuals = ax.plot(
+            [value, value], [true, predicted], color="red"
+        )
+
+    handles = [line_predictions[0], lines_residuals[0]]
+
+    return handles, ax

-# plot the residuals
-for value, true, predicted in zip(
-    data_train["Feature"], target_train, target_train_predicted
-):
-    lines_residuals = plt.plot([value, value], [true, predicted], color="red")

-plt.legend(
-    [line_predictions[0], lines_residuals[0]], ["Fitted tree", "Residuals"]
+handles, ax = plot_decision_tree_with_residuals(
+    target_train, target_train_predicted, target_test_predicted
 )
-_ = plt.title("Prediction function together \nwith errors on the training set")
+legend_labels = ["Initial decision tree", "Initial residuals"]
+ax.legend(handles, legend_labels, bbox_to_anchor=(1.05, 0.8), loc="upper left")
+_ = ax.set_title("Decision tree together\nwith errors on the training set")

 # %% [markdown]
 # ```{tip}
@@ -104,15 +122,15 @@ def generate_data(n_samples=50):
 # for all the residual lines.
 # ```
 # Since the tree underfits the data, its accuracy is far from perfect on the
-# training data. We can observe this in the figure by looking at the difference
-# between the predictions and the ground-truth data. We represent these errors,
-# called "Residuals", by unbroken red lines.
+# training data. We can observe this in the figure above by looking at the
+# difference between the predictions and the ground-truth data. We represent
+# these errors, called "residuals", using solid red lines.
 #
-# Indeed, our initial tree was not expressive enough to handle the complexity of
+# Indeed, our initial tree is not expressive enough to handle the complexity of
 # the data, as shown by the residuals. In a gradient-boosting algorithm, the
-# idea is to create a second tree which, given the same data `data`, will try to
-# predict the residuals instead of the vector `target`. We would therefore have
-# a tree that is able to predict the errors made by the initial tree.
+# idea is to create a second tree which, given the same `data`, tries to
+# predict the residuals instead of the vector `target`. We then have a second
+# tree that is able to predict the errors made by the initial tree.
 #
 # Let's train such a tree.
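+#
+# ```{note}
+# Schematically, if we denote the prediction of this first tree by $f_1(x)$
+# and that of the second tree, fitted on the residuals $y - f_1(x)$, by
+# $f_2(x)$, then the combined model predicts $f_1(x) + f_2(x)$. The notation
+# $f_1$ and $f_2$ is only introduced here to summarize the idea; actual
+# gradient-boosting implementations additionally scale the contribution of
+# each new tree by a learning rate.
+# ```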
@@ -126,29 +144,22 @@ def generate_data(n_samples=50):
 target_test_predicted_residuals = tree_residuals.predict(data_test)

 # %%
-sns.scatterplot(x=data_train["Feature"], y=residuals, color="black", alpha=0.5)
-line_predictions = plt.plot(
-    data_test["Feature"], target_test_predicted_residuals, "--"
+handles, ax = plot_decision_tree_with_residuals(
+    residuals,
+    target_train_predicted_residuals,
+    target_test_predicted_residuals,
 )
-
-# plot the residuals of the predicted residuals
-for value, true, predicted in zip(
-    data_train["Feature"], residuals, target_train_predicted_residuals
-):
-    lines_residuals = plt.plot([value, value], [true, predicted], color="red")
-
-plt.legend(
-    [line_predictions[0], lines_residuals[0]],
-    ["Fitted tree", "Residuals"],
-    bbox_to_anchor=(1.05, 0.8),
-    loc="upper left",
-)
-_ = plt.title("Prediction of the previous residuals")
+legend_labels = [
+    "Predicted residuals",
+    "Residuals of the\npredicted residuals",
+]
+ax.legend(handles, legend_labels, bbox_to_anchor=(1.05, 0.8), loc="upper left")
+_ = ax.set_title("Prediction of the initial residuals")

 # %% [markdown]
-# We see that this new tree only manages to fit some of the residuals. We will
-# focus on a specific sample from the training set (i.e. we know that the sample
-# will be well predicted using two successive trees). We will use this sample to
+# We see that this new tree only manages to fit some of the residuals. We now
+# focus on a specific sample from the training set (as we know that the sample
+# can be well predicted using two successive trees). We will use this sample to
 # explain how the predictions of both trees are combined. Let's first select
 # this sample in `data_train`.

@@ -159,51 +170,30 @@ def generate_data(n_samples=50):
 target_true_residual = residuals.iloc[-2]

 # %% [markdown]
-# Let's plot the previous information and highlight our sample of interest.
-# Let's start by plotting the original data and the prediction of the first
-# decision tree.
+# Let's plot the original data, the predictions of the initial decision tree,
+# and highlight our sample of interest. This is essentially a zoom of the plot
+# displaying the initial shallow tree.

 # %%
-# Plot the previous information:
-# * the dataset
-# * the predictions
-# * the residuals
-
-sns.scatterplot(
-    x=data_train["Feature"], y=target_train, color="black", alpha=0.5
+handles, ax = plot_decision_tree_with_residuals(
+    target_train, target_train_predicted, target_test_predicted
 )
-plt.plot(data_test["Feature"], target_test_predicted, "--")
-for value, true, predicted in zip(
-    data_train["Feature"], target_train, target_train_predicted
-):
-    lines_residuals = plt.plot([value, value], [true, predicted], color="red")
-
-# Highlight the sample of interest
-plt.scatter(
+ax.scatter(
     sample, target_true, label="Sample of interest", color="tab:orange", s=200
 )
-plt.xlim([-1, 0])
-plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
-_ = plt.title("Tree predictions")
+ax.set_xlim([-1, 0])
+ax.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
+_ = ax.set_title("Zoom of sample of interest\nin the initial decision tree")

 # %% [markdown]
-# Now, let's plot the residuals information. We will plot the residuals computed
-# from the first decision tree and show the residual predictions.
+# Similarly, we zoom on the plot displaying the predictions of the initial
+# residuals.
+# Similarly we plot a zoom of the plot with the prediction of the initial residuals # %% -# Plot the previous information: -# * the residuals committed by the first tree -# * the residual predictions -# * the residuals of the residual predictions - -sns.scatterplot(x=data_train["Feature"], y=residuals, color="black", alpha=0.5) -plt.plot(data_test["Feature"], target_test_predicted_residuals, "--") -for value, true, predicted in zip( - data_train["Feature"], residuals, target_train_predicted_residuals -): - lines_residuals = plt.plot([value, value], [true, predicted], color="red") - -# Highlight the sample of interest +handles, ax = plot_decision_tree_with_residuals( + residuals, + target_train_predicted_residuals, + target_test_predicted_residuals, +) plt.scatter( sample, target_true_residual, @@ -211,14 +201,18 @@ def generate_data(n_samples=50): color="tab:orange", s=200, ) -plt.xlim([-1, 0]) -plt.legend() -_ = plt.title("Prediction of the residuals") +legend_labels = [ + "Predicted residuals", + "Residuals of the\npredicted residuals", +] +ax.set_xlim([-1, 0]) +ax.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left") +_ = ax.set_title("Zoom of sample of interest\nin the initial residuals") # %% [markdown] # For our sample of interest, our initial tree is making an error (small # residual). When fitting the second tree, the residual in this case is -# perfectly fitted and predicted. We will quantitatively check this prediction +# perfectly fitted and predicted. We can quantitatively check this prediction # using the fitted tree. First, let's check the prediction of the initial tree # and compare it with the true value. @@ -265,7 +259,9 @@ def generate_data(n_samples=50): # second tree corrects the first tree's error, while the third tree corrects the # second tree's error and so on). # -# We will compare the generalization performance of random-forest and gradient +# ## First comparison of GBDT vs random forests +# +# We now compare the generalization performance of random-forest and gradient # boosting on the California housing dataset. # %% @@ -322,11 +318,12 @@ def generate_data(n_samples=50): print(f"Average score time: {cv_results_rf['score_time'].mean():.3f} seconds") # %% [markdown] -# In term of computation performance, the forest can be parallelized and will +# In terms of computing performance, the forest can be parallelized and then # benefit from using multiple cores of the CPU. In terms of scoring performance, # both algorithms lead to very close results. # -# However, we see that the gradient boosting is a very fast algorithm to predict -# compared to random forest. This is due to the fact that gradient boosting uses -# shallow trees. We will go into details in the next notebook about the -# hyperparameters to consider when optimizing ensemble methods. +# However, we see that gradient boosting is overall faster than random forest. +# One of the reasons is that random forests typically rely on deep trees (that +# overfit individually) whereas boosting models build shallow trees (that +# underfit individually) which are faster to fit and predict. In the following +# exercise we will explore more in depth how these two models compare.