diff --git a/python_scripts/trees_classification.py b/python_scripts/trees_classification.py
index d83b5203e..3723109a5 100644
--- a/python_scripts/trees_classification.py
+++ b/python_scripts/trees_classification.py
@@ -8,8 +8,11 @@
 # %% [markdown]
 # # Build a classification decision tree
 #
-# We will illustrate how decision tree fit data with a simple classification
-# problem using the penguins dataset.
+# In this notebook we illustrate decision trees in a multiclass classification
+# problem by using the penguins dataset with 2 features and 3 classes.
+#
+# For the sake of simplicity, we focus the discussion on the hyperparameter
+# `max_depth`, which controls the maximal depth of the decision tree.
 
 # %% [markdown]
 # ```{note}
@@ -25,8 +28,8 @@
 target_column = "Species"
 
 # %% [markdown]
-# Besides, we split the data into two subsets to investigate how trees will
-# predict values based on an out-of-samples dataset.
+# First, we split the data into two subsets to investigate how trees predict
+# values based on unseen data.
 
 # %%
 from sklearn.model_selection import train_test_split
 
@@ -37,16 +40,13 @@
 )
 
 # %% [markdown]
-# In a previous notebook, we learnt that a linear classifier will define a
-# linear separation to split classes using a linear combination of the input
-# features. In our 2-dimensional space, it means that a linear classifier will
-# define some oblique lines that best separate our classes. We define a function
-# below that, given a set of data points and a classifier, will plot the
-# decision boundaries learnt by the classifier.
-#
-# Thus, for a linear classifier, we will obtain the following decision
-# boundaries. These boundaries lines indicate where the model changes its
-# prediction from one class to another.
+# In a previous notebook, we learnt that linear classifiers define a linear
+# separation to split classes using a linear combination of the input features.
+# In our 2-dimensional feature space, it means that a linear classifier finds
+# the oblique lines that best separate the classes. This is still true for
+# multiclass problems, except that more than one line is fitted. We can use
+# `DecisionBoundaryDisplay` to plot the decision boundaries learnt by the
+# classifier.
 
 # %%
 from sklearn.linear_model import LogisticRegression
 
@@ -56,15 +56,22 @@
 
 # %%
 import matplotlib.pyplot as plt
+import matplotlib as mpl
 import seaborn as sns
 
 from sklearn.inspection import DecisionBoundaryDisplay
 
+tab10_norm = mpl.colors.Normalize(vmin=-0.5, vmax=8.5)
 # create a palette to be used in the scatterplot
-palette = ["tab:red", "tab:blue", "black"]
-
-DecisionBoundaryDisplay.from_estimator(
-    linear_model, data_train, response_method="predict", cmap="RdBu", alpha=0.5
+palette = ["tab:blue", "tab:green", "tab:orange"]
+
+dbd = DecisionBoundaryDisplay.from_estimator(
+    linear_model,
+    data_train,
+    response_method="predict",
+    cmap="tab10",
+    norm=tab10_norm,
+    alpha=0.5,
 )
 sns.scatterplot(
     data=penguins,
@@ -80,7 +87,7 @@
 # %% [markdown]
 # We see that the lines are a combination of the input features since they are
 # not perpendicular a specific axis. Indeed, this is due to the model
-# parametrization that we saw in the previous notebook, controlled by the
+# parametrization that we saw in some previous notebooks, i.e. controlled by the
 # model's weights and intercept.
 #
 # Besides, it seems that the linear model would be a good candidate for such
@@ -92,13 +99,27 @@
 print(f"Accuracy of the LogisticRegression: {test_score:.2f}")
 
 # %% [markdown]
-# Unlike linear models, decision trees are non-parametric models: they are not
-# controlled by a mathematical decision function and do not have weights or
-# intercept to be optimized.
+# Unlike linear models, the decision rule for the decision tree is not
+# controlled by a simple linear combination of weights and feature values.
+#
+# Instead, the decision rules of trees can be defined in terms of
+# - the feature index used at each split node of the tree,
+# - the threshold value used at each split node,
+# - the value to predict at each leaf node.
 #
-# Indeed, decision trees will partition the space by considering a single
-# feature at a time. Let's illustrate this behaviour by having a decision tree
-# make a single split to partition the feature space.
+# Decision trees partition the feature space by considering a single feature at
+# a time. The number of splits depends on both the hyperparameters and the
+# number of data points in the training set: the more flexible the
+# hyperparameters and the larger the training set, the more splits can be
+# considered by the model.
+#
+# As the number of adjustable components taking part in the decision rule
+# changes with the training size, we say that decision trees are non-parametric
+# models.
+#
+# Let's now visualize the shape of the decision boundary of a decision tree when
+# we set the `max_depth` hyperparameter to only allow for a single split to
+# partition the feature space.
 
 # %%
 from sklearn.tree import DecisionTreeClassifier
 
@@ -108,7 +129,12 @@
 
 # %%
 DecisionBoundaryDisplay.from_estimator(
-    tree, data_train, response_method="predict", cmap="RdBu", alpha=0.5
+    tree,
+    data_train,
+    response_method="predict",
+    cmap="tab10",
+    norm=tab10_norm,
+    alpha=0.5,
 )
 sns.scatterplot(
     data=penguins,
@@ -123,8 +149,8 @@
 # %% [markdown]
 # The partitions found by the algorithm separates the data along the axis
 # "Culmen Depth", discarding the feature "Culmen Length". Thus, it highlights
-# that a decision tree does not use a combination of feature when making a
-# split. We can look more in depth at the tree structure.
+# that a decision tree does not use a combination of features when making a
+# single split. We can look more in depth at the tree structure.
 
 # %%
 from sklearn.tree import plot_tree
 
@@ -150,36 +176,40 @@
 # dataset was subdivided into 2 sets based on the culmen depth (inferior or
 # superior to 16.45 mm).
 #
-# This partition of the dataset minimizes the class diversities in each
+# This partition of the dataset minimizes the class diversity in each
 # sub-partitions. This measure is also known as a **criterion**, and is a
 # settable parameter.
 #
 # If we look more closely at the partition, we see that the sample superior to
-# 16.45 belongs mainly to the Adelie class. Looking at the values, we indeed
-# observe 103 Adelie individuals in this space. We also count 52 Chinstrap
-# samples and 6 Gentoo samples. We can make similar interpretation for the
+# 16.45 belongs mainly to the "Adelie" class. Looking at the values, we indeed
+# observe 103 "Adelie" individuals in this space. We also count 52 "Chinstrap"
+# samples and 6 "Gentoo" samples. We can make a similar interpretation for the
 # partition defined by a threshold inferior to 16.45mm. In this case, the most
-# represented class is the Gentoo species.
+# represented class is the "Gentoo" species.
 #
 # Let's see how our tree would work as a predictor. Let's start with a case
 # where the culmen depth is inferior to the threshold.
 
 # %%
-sample_1 = pd.DataFrame({"Culmen Length (mm)": [0], "Culmen Depth (mm)": [15]})
-tree.predict(sample_1)
+test_penguin_1 = pd.DataFrame(
+    {"Culmen Length (mm)": [0], "Culmen Depth (mm)": [15]}
+)
+tree.predict(test_penguin_1)
 
 # %% [markdown]
-# The class predicted is the Gentoo. We can now check what happens if we pass a
+# The class predicted is the "Gentoo". We can now check what happens if we pass a
 # culmen depth superior to the threshold.
 
 # %%
-sample_2 = pd.DataFrame({"Culmen Length (mm)": [0], "Culmen Depth (mm)": [17]})
-tree.predict(sample_2)
+test_penguin_2 = pd.DataFrame(
+    {"Culmen Length (mm)": [0], "Culmen Depth (mm)": [17]}
+)
+tree.predict(test_penguin_2)
 
 # %% [markdown]
-# In this case, the tree predicts the Adelie specie.
+# In this case, the tree predicts the "Adelie" species.
 #
-# Thus, we can conclude that a decision tree classifier will predict the most
+# Thus, we can conclude that a decision tree classifier predicts the most
 # represented class within a partition.
 #
 # During the training, we have a count of samples in each partition, we can also
@@ -187,7 +217,7 @@
 # partition.
 
 # %%
-y_pred_proba = tree.predict_proba(sample_2)
+y_pred_proba = tree.predict_proba(test_penguin_2)
 y_proba_class_0 = pd.Series(y_pred_proba[0], index=tree.classes_)
 
 # %%
@@ -212,14 +242,14 @@
 
 # %% [markdown]
 # It is also important to note that the culmen length has been disregarded for
-# the moment. It means that whatever the value given, it will not be used during
-# the prediction.
+# the moment. It means that regardless of its value, it is not used during the
+# prediction.
 
 # %%
-sample_3 = pd.DataFrame(
+test_penguin_3 = pd.DataFrame(
     {"Culmen Length (mm)": [10_000], "Culmen Depth (mm)": [17]}
 )
-tree.predict_proba(sample_3)
+tree.predict_proba(test_penguin_3)
 
 # %% [markdown]
 # Going back to our classification problem, the split found with a maximum depth
@@ -232,9 +262,10 @@
 print(f"Accuracy of the DecisionTreeClassifier: {test_score:.2f}")
 
 # %% [markdown]
-# Indeed, it is not a surprise. We saw earlier that a single feature will not be
-# able to separate all three species. However, from the previous analysis we saw
-# that by using both features we should be able to get fairly good results.
+# Indeed, it is not a surprise. We saw earlier that a single feature is not able
+# to separate all three species: it underfits. However, from the previous
+# analysis we saw that by using both features we should be able to get fairly
+# good results.
 #
-# In the next exercise, you will increase the size of the tree depth. You will
-# get intuitions on how the space partitioning is repeated over time.
+# In the next exercise, you will increase the tree depth to get an intuition on
+# how such a parameter affects the space partitioning.
diff --git a/python_scripts/trees_sol_01.py b/python_scripts/trees_sol_01.py
index 34dcbf81c..e97b7e8b2 100644
--- a/python_scripts/trees_sol_01.py
+++ b/python_scripts/trees_sol_01.py
@@ -8,16 +8,13 @@
 # %% [markdown]
 # # 📃 Solution for Exercise M5.01
 #
-# In the previous notebook, we showed how a tree with a depth of 1 level was
-# working. The aim of this exercise is to repeat part of the previous experiment
-# for a depth with 2 levels to show how the process of partitioning is repeated
-# over time.
+# In the previous notebook, we showed how a tree with a depth of 1 level works.
+# The aim of this exercise is to repeat part of the previous experiment for a
+# tree with a depth of 2 levels to show how such a parameter affects the
+# feature space partitioning.
 #
-# Before to start, we will:
-#
-# * load the dataset;
-# * split the dataset into training and testing dataset;
-# * define the function to show the classification decision function.
+# We first load the penguins dataset and split it into training and testing
+# sets:
 
 # %%
 import pandas as pd
 
@@ -42,10 +39,7 @@
 
 # %% [markdown]
 # Create a decision tree classifier with a maximum depth of 2 levels and fit the
-# training data. Once this classifier trained, plot the data and the decision
-# boundary to see the benefit of increasing the depth. To plot the decision
-# boundary, you should import the class `DecisionBoundaryDisplay` from the
-# module `sklearn.inspection` as shown in the previous course notebook.
+# training data.
 
 # %%
 # solution
@@ -54,15 +48,39 @@
 tree = DecisionTreeClassifier(max_depth=2)
 tree.fit(data_train, target_train)
 
-# %% tags=["solution"]
+# %% [markdown]
+# Now plot the data and the decision boundary of the trained classifier to see
+# the effect of increasing the depth of the tree.
+#
+# Hint: Use the class `DecisionBoundaryDisplay` from the module
+# `sklearn.inspection` as shown in previous course notebooks.
+#
+# ```{warning}
+# At this time, it is not possible to use `response_method="predict_proba"` for
+# multiclass problems. This is a planned feature for a future version of
+# scikit-learn. In the meantime, you can use `response_method="predict"`
+# instead.
+# ```
+
+# %%
+# solution
 import matplotlib.pyplot as plt
+import matplotlib as mpl
 import seaborn as sns
 
 from sklearn.inspection import DecisionBoundaryDisplay
 
-palette = ["tab:red", "tab:blue", "black"]
+
+tab10_norm = mpl.colors.Normalize(vmin=-0.5, vmax=8.5)
+
+palette = ["tab:blue", "tab:green", "tab:orange"]
 DecisionBoundaryDisplay.from_estimator(
-    tree, data_train, response_method="predict", cmap="RdBu", alpha=0.5
+    tree,
+    data_train,
+    response_method="predict",
+    cmap="tab10",
+    norm=tab10_norm,
+    alpha=0.5,
 )
 ax = sns.scatterplot(
     data=penguins,
@@ -114,3 +132,79 @@
 # which is not surprising since this partition was almost pure. If the feature
 # value is above the threshold, we predict the Gentoo penguin, the class that is
 # most probable.
+#
+# ## (Estimated) predicted probabilities in multi-class problems
+#
+# For those interested, one can further try to visualize the output of
+# `predict_proba` for a multiclass problem using `DecisionBoundaryDisplay`,
+# except that for a K-class problem you have K probability outputs for each
+# data point. Visualizing all these on a single plot can quickly become tricky
+# to interpret. It is then common to instead produce K separate plots, one for
+# each class, in a one-vs-rest (or one-vs-all) fashion.
+#
+# For example, in the plot below, the first plot on the left shows in yellow the
+# certainty of classifying a data point as belonging to the "Adelie" class. In
+# the same plot, the spectrum from green to purple represents the certainty of
+# **not** belonging to the "Adelie" class. The same logic applies to the other
+# plots in the figure.
+
+# %% tags=["solution"]
+import numpy as np
+
+xx = np.linspace(30, 60, 100)
+yy = np.linspace(10, 23, 100)
+xx, yy = np.meshgrid(xx, yy)
+Xfull = pd.DataFrame(
+    {"Culmen Length (mm)": xx.ravel(), "Culmen Depth (mm)": yy.ravel()}
+)
+
+probas = tree.predict_proba(Xfull)
+n_classes = len(np.unique(tree.classes_))
+
+_, axs = plt.subplots(ncols=3, nrows=1, sharey=True, figsize=(12, 5))
+plt.suptitle("Predicted probabilities for decision tree model", y=0.8)
+
+for class_of_interest in range(n_classes):
+    axs[class_of_interest].set_title(
+        f"Class {tree.classes_[class_of_interest]}"
+    )
+    imshow_handle = axs[class_of_interest].imshow(
+        probas[:, class_of_interest].reshape((100, 100)),
+        extent=(30, 60, 10, 23),
+        vmin=0.0,
+        vmax=1.0,
+        origin="lower",
+        cmap="viridis",
+    )
+    axs[class_of_interest].set_xlabel("Culmen Length (mm)")
+    if class_of_interest == 0:
+        axs[class_of_interest].set_ylabel("Culmen Depth (mm)")
+    idx = target_test == tree.classes_[class_of_interest]
+    axs[class_of_interest].scatter(
+        data_test["Culmen Length (mm)"].loc[idx],
+        data_test["Culmen Depth (mm)"].loc[idx],
+        marker="o",
+        c="w",
+        edgecolor="k",
+    )
+
+ax = plt.axes([0.15, 0.04, 0.7, 0.05])
+plt.colorbar(imshow_handle, cax=ax, orientation="horizontal")
+_ = plt.title("Probability")
+
+# %% [markdown] tags=["solution"]
+# ```{note}
+# You may have noticed that we are no longer using a diverging colormap. Indeed,
+# the chance level for a one-vs-rest binarization of the multi-class
+# classification problem is almost never at predicted probability of 0.5. So
+# using a colormap with a neutral white at 0.5 might give a false impression of
+# the certainty.
+# ```
+#
+# In future versions of scikit-learn `DecisionBoundaryDisplay` will support a
+# `class_of_interest` parameter that will allow in particular for a
+# visualization of `predict_proba` in multi-class settings.
+#
+# We also plan to make it possible to visualize the `predict_proba` values for
+# the class with the maximum predicted probability (without having to pass a
+# fixed `class_of_interest` value).
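As a complement to the new markdown above (the paragraph in `trees_classification.py` listing the components of a tree's decision rule, and the claim that a multiclass linear classifier fits more than one separating line), here is a minimal sketch that prints those quantities directly. It is not part of the diff or of the MOOC notebooks, and the iris dataset restricted to 2 features is an assumption used as a stand-in for the penguins data so that the snippet runs on its own.

```python
# Minimal sketch (assumption: iris restricted to 2 features stands in for the
# penguins dataset used by the notebooks).
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
X = X[:, 2:]  # keep 2 features and 3 classes, mirroring the penguins setting

# A multiclass linear classifier stores one weight vector per class:
# three "oblique lines" in the 2D feature space.
linear_model = LogisticRegression(max_iter=1000).fit(X, y)
print("coef_ shape:", linear_model.coef_.shape)  # expected: (3, 2)

# A single-split tree is fully described by one split feature index, one
# threshold, and the class distribution stored at each node.
tree = DecisionTreeClassifier(max_depth=1).fit(X, y)
print("feature per node:  ", tree.tree_.feature)    # -2 marks a leaf node
print("threshold per node:", tree.tree_.threshold)
# Per-node class distribution (counts or fractions, depending on the
# scikit-learn version).
print(tree.tree_.value)
```

With `max_depth=1` the tree has three nodes (a root and two leaves), so the arrays above have length 3; relaxing `max_depth` or adding training data lets the tree grow more such entries, which is the sense in which the diff describes the model as non-parametric.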