diff --git a/python_scripts/trees_classification.py b/python_scripts/trees_classification.py
index d83b5203e..3723109a5 100644
--- a/python_scripts/trees_classification.py
+++ b/python_scripts/trees_classification.py
@@ -8,8 +8,11 @@
 # %% [markdown]
 # # Build a classification decision tree
 #
-# We will illustrate how decision tree fit data with a simple classification
-# problem using the penguins dataset.
+# In this notebook we illustrate decision trees in a multiclass classification
+# problem by using the penguins dataset with 2 features and 3 classes.
+#
+# For the sake of simplicity, we focus the discussion on the hyperparameter
+# `max_depth`, which controls the maximal depth of the decision tree.
 
 # %% [markdown]
 # ```{note}
@@ -25,8 +28,8 @@
 target_column = "Species"
 
 # %% [markdown]
-# Besides, we split the data into two subsets to investigate how trees will
-# predict values based on an out-of-samples dataset.
+# First, we split the data into two subsets to investigate how trees predict
+# values based on unseen data.
 
 # %%
 from sklearn.model_selection import train_test_split
 
@@ -37,16 +40,13 @@
 )
 
 # %% [markdown]
-# In a previous notebook, we learnt that a linear classifier will define a
-# linear separation to split classes using a linear combination of the input
-# features. In our 2-dimensional space, it means that a linear classifier will
-# define some oblique lines that best separate our classes. We define a function
-# below that, given a set of data points and a classifier, will plot the
-# decision boundaries learnt by the classifier.
-#
-# Thus, for a linear classifier, we will obtain the following decision
-# boundaries. These boundaries lines indicate where the model changes its
-# prediction from one class to another.
+# In a previous notebook, we learnt that linear classifiers define a linear
+# separation to split classes using a linear combination of the input features.
+# In our 2-dimensional feature space, it means that a linear classifier finds
+# the oblique lines that best separate the classes. This is still true for
+# multiclass problems, except that more than one line is fitted. We can use
+# `DecisionBoundaryDisplay` to plot the decision boundaries learnt by the
+# classifier.
 
 # %%
 from sklearn.linear_model import LogisticRegression
 
@@ -56,15 +56,22 @@
 
 # %%
 import matplotlib.pyplot as plt
+import matplotlib as mpl
 import seaborn as sns
 
 from sklearn.inspection import DecisionBoundaryDisplay
 
+tab10_norm = mpl.colors.Normalize(vmin=-0.5, vmax=8.5)
 # create a palette to be used in the scatterplot
-palette = ["tab:red", "tab:blue", "black"]
-
-DecisionBoundaryDisplay.from_estimator(
-    linear_model, data_train, response_method="predict", cmap="RdBu", alpha=0.5
+palette = ["tab:blue", "tab:green", "tab:orange"]
+
+dbd = DecisionBoundaryDisplay.from_estimator(
+    linear_model,
+    data_train,
+    response_method="predict",
+    cmap="tab10",
+    norm=tab10_norm,
+    alpha=0.5,
 )
 sns.scatterplot(
     data=penguins,
@@ -80,7 +87,7 @@
 # %% [markdown]
 # We see that the lines are a combination of the input features since they are
 # not perpendicular a specific axis. Indeed, this is due to the model
-# parametrization that we saw in the previous notebook, controlled by the
+# parametrization that we saw in some previous notebooks, i.e. controlled by the
 # model's weights and intercept.
 #
 # Besides, it seems that the linear model would be a good candidate for such
@@ -92,13 +99,27 @@
 print(f"Accuracy of the LogisticRegression: {test_score:.2f}")
 
 # %% [markdown]
-# Unlike linear models, decision trees are non-parametric models: they are not
-# controlled by a mathematical decision function and do not have weights or
-# intercept to be optimized.
+# Unlike linear models, the decision rule for the decision tree is not
+# controlled by a simple linear combination of weights and feature values.
+#
+# Instead, the decision rules of trees can be defined in terms of
+# - the feature index used at each split node of the tree,
+# - the threshold value used at each split node,
+# - the value to predict at each leaf node.
 #
-# Indeed, decision trees will partition the space by considering a single
-# feature at a time. Let's illustrate this behaviour by having a decision tree
-# make a single split to partition the feature space.
+# Decision trees partition the feature space by considering a single feature at
+# a time. The number of splits depends on both the hyperparameters and the
+# number of data points in the training set: the more flexible the
+# hyperparameters and the larger the training set, the more splits can be
+# considered by the model.
+#
+# As the number of adjustable components taking part in the decision rule
+# changes with the training size, we say that decision trees are non-parametric
+# models.
+#
+# Let's now visualize the shape of the decision boundary of a decision tree when
+# we set the `max_depth` hyperparameter to only allow for a single split to
+# partition the feature space.
 
 # %%
 from sklearn.tree import DecisionTreeClassifier
 
@@ -108,7 +129,12 @@
 
 # %%
 DecisionBoundaryDisplay.from_estimator(
-    tree, data_train, response_method="predict", cmap="RdBu", alpha=0.5
+    tree,
+    data_train,
+    response_method="predict",
+    cmap="tab10",
+    norm=tab10_norm,
+    alpha=0.5,
 )
 sns.scatterplot(
     data=penguins,
@@ -123,8 +149,8 @@
 # %% [markdown]
 # The partitions found by the algorithm separates the data along the axis
 # "Culmen Depth", discarding the feature "Culmen Length". Thus, it highlights
-# that a decision tree does not use a combination of feature when making a
-# split. We can look more in depth at the tree structure.
+# that a decision tree does not use a combination of features when making a
+# single split. We can look more in depth at the tree structure.
 
 # %%
 from sklearn.tree import plot_tree
 
@@ -150,36 +176,40 @@
 # dataset was subdivided into 2 sets based on the culmen depth (inferior or
 # superior to 16.45 mm).
 #
-# This partition of the dataset minimizes the class diversities in each
+# This partition of the dataset minimizes the class diversity in each
 # sub-partitions. This measure is also known as a **criterion**, and is a
 # settable parameter.
 #
 # If we look more closely at the partition, we see that the sample superior to
-# 16.45 belongs mainly to the Adelie class. Looking at the values, we indeed
-# observe 103 Adelie individuals in this space. We also count 52 Chinstrap
-# samples and 6 Gentoo samples. We can make similar interpretation for the
+# 16.45 belongs mainly to the "Adelie" class. Looking at the values, we indeed
+# observe 103 "Adelie" individuals in this space. We also count 52 "Chinstrap"
+# samples and 6 "Gentoo" samples. We can make a similar interpretation for the
 # partition defined by a threshold inferior to 16.45mm. In this case, the most
-# represented class is the Gentoo species.
+# represented class is the "Gentoo" species.
 #
 # Let's see how our tree would work as a predictor. Let's start with a case
 # where the culmen depth is inferior to the threshold.
 
 # %%
-sample_1 = pd.DataFrame({"Culmen Length (mm)": [0], "Culmen Depth (mm)": [15]})
-tree.predict(sample_1)
+test_penguin_1 = pd.DataFrame(
+    {"Culmen Length (mm)": [0], "Culmen Depth (mm)": [15]}
+)
+tree.predict(test_penguin_1)
 
 # %% [markdown]
-# The class predicted is the Gentoo. We can now check what happens if we pass a
+# The class predicted is the "Gentoo". We can now check what happens if we pass a
 # culmen depth superior to the threshold.
 
 # %%
-sample_2 = pd.DataFrame({"Culmen Length (mm)": [0], "Culmen Depth (mm)": [17]})
-tree.predict(sample_2)
+test_penguin_2 = pd.DataFrame(
+    {"Culmen Length (mm)": [0], "Culmen Depth (mm)": [17]}
+)
+tree.predict(test_penguin_2)
 
 # %% [markdown]
-# In this case, the tree predicts the Adelie specie.
+# In this case, the tree predicts the "Adelie" species.
 #
-# Thus, we can conclude that a decision tree classifier will predict the most
+# Thus, we can conclude that a decision tree classifier predicts the most
 # represented class within a partition.
 #
 # During the training, we have a count of samples in each partition, we can also
@@ -187,7 +217,7 @@
 # partition.
 
 # %%
-y_pred_proba = tree.predict_proba(sample_2)
+y_pred_proba = tree.predict_proba(test_penguin_2)
 y_proba_class_0 = pd.Series(y_pred_proba[0], index=tree.classes_)
 
 # %%
@@ -212,14 +242,14 @@
 
 # %% [markdown]
 # It is also important to note that the culmen length has been disregarded for
-# the moment. It means that whatever the value given, it will not be used during
-# the prediction.
+# the moment. It means that regardless of its value, it is not used during the
+# prediction.
 
 # %%
-sample_3 = pd.DataFrame(
+test_penguin_3 = pd.DataFrame(
     {"Culmen Length (mm)": [10_000], "Culmen Depth (mm)": [17]}
 )
-tree.predict_proba(sample_3)
+tree.predict_proba(test_penguin_3)
 
 # %% [markdown]
 # Going back to our classification problem, the split found with a maximum depth
@@ -232,9 +262,10 @@
 print(f"Accuracy of the DecisionTreeClassifier: {test_score:.2f}")
 
 # %% [markdown]
-# Indeed, it is not a surprise. We saw earlier that a single feature will not be
-# able to separate all three species. However, from the previous analysis we saw
-# that by using both features we should be able to get fairly good results.
+# Indeed, it is not a surprise. We saw earlier that a single feature is not able
+# to separate all three species: it underfits. However, from the previous
+# analysis we saw that by using both features we should be able to get fairly
+# good results.
 #
-# In the next exercise, you will increase the size of the tree depth. You will
-# get intuitions on how the space partitioning is repeated over time.
+# In the next exercise, you will increase the tree depth to get an intuition on
+# how such a parameter affects the space partitioning.
diff --git a/python_scripts/trees_sol_01.py b/python_scripts/trees_sol_01.py
index 34dcbf81c..e97b7e8b2 100644
--- a/python_scripts/trees_sol_01.py
+++ b/python_scripts/trees_sol_01.py
@@ -8,16 +8,13 @@
 # %% [markdown]
 # # 📃 Solution for Exercise M5.01
 #
-# In the previous notebook, we showed how a tree with a depth of 1 level was
-# working. The aim of this exercise is to repeat part of the previous experiment
-# for a depth with 2 levels to show how the process of partitioning is repeated
-# over time.
+# In the previous notebook, we showed how a tree with a depth of 1 level works.
+# The aim of this exercise is to repeat part of the previous experiment for a
+# tree with a depth of 2 levels to show how such a parameter affects the
+# feature space partitioning.
 #
-# Before to start, we will:
-#
-# * load the dataset;
-# * split the dataset into training and testing dataset;
-# * define the function to show the classification decision function.
+# We first load the penguins dataset and split it into training and testing
+# sets:
 
 # %%
 import pandas as pd
 
@@ -42,10 +39,7 @@
 
 # %% [markdown]
 # Create a decision tree classifier with a maximum depth of 2 levels and fit the
-# training data. Once this classifier trained, plot the data and the decision
-# boundary to see the benefit of increasing the depth. To plot the decision
-# boundary, you should import the class `DecisionBoundaryDisplay` from the
-# module `sklearn.inspection` as shown in the previous course notebook.
+# training data.
 
 # %%
 # solution
@@ -54,15 +48,39 @@
 tree = DecisionTreeClassifier(max_depth=2)
 tree.fit(data_train, target_train)
 
-# %% tags=["solution"]
+# %% [markdown]
+# Now plot the data and the decision boundary of the trained classifier to see
+# the effect of increasing the depth of the tree.
+#
+# Hint: Use the class `DecisionBoundaryDisplay` from the module
+# `sklearn.inspection` as shown in previous course notebooks.
+#
+# ```{warning}
+# At this time, it is not possible to use `response_method="predict_proba"` for
+# multiclass problems. This is a planned feature for a future version of
+# scikit-learn. In the meantime, you can use `response_method="predict"`
+# instead.
+# ```
+
+# %%
+# solution
 import matplotlib.pyplot as plt
+import matplotlib as mpl
 import seaborn as sns
 
 from sklearn.inspection import DecisionBoundaryDisplay
 
-palette = ["tab:red", "tab:blue", "black"]
+
+tab10_norm = mpl.colors.Normalize(vmin=-0.5, vmax=8.5)
+
+palette = ["tab:blue", "tab:green", "tab:orange"]
 DecisionBoundaryDisplay.from_estimator(
-    tree, data_train, response_method="predict", cmap="RdBu", alpha=0.5
+    tree,
+    data_train,
+    response_method="predict",
+    cmap="tab10",
+    norm=tab10_norm,
+    alpha=0.5,
 )
 ax = sns.scatterplot(
     data=penguins,
@@ -114,3 +132,79 @@
 # which is not surprising since this partition was almost pure. If the feature
 # value is above the threshold, we predict the Gentoo penguin, the class that is
 # most probable.
+#
+# ## (Estimated) predicted probabilities in multi-class problems
+#
+# For those interested, one can further try to visualize the output of
+# `predict_proba` for a multiclass problem using `DecisionBoundaryDisplay`,
+# except that for a K-class problem you have K probability outputs for each
+# data point. Visualizing all these on a single plot can quickly become tricky
+# to interpret. It is then common to instead produce K separate plots, one for
+# each class, in a one-vs-rest (or one-vs-all) fashion.
+#
+# For example, in the plot below, the first plot on the left shows in yellow the
+# certainty of classifying a data point as belonging to the "Adelie" class. In
+# the same plot, the spectrum from green to purple represents the certainty of
+# **not** belonging to the "Adelie" class. The same logic applies to the other
+# plots in the figure.
+
+# %% tags=["solution"]
+import numpy as np
+
+xx = np.linspace(30, 60, 100)
+yy = np.linspace(10, 23, 100)
+xx, yy = np.meshgrid(xx, yy)
+Xfull = pd.DataFrame(
+    {"Culmen Length (mm)": xx.ravel(), "Culmen Depth (mm)": yy.ravel()}
+)
+
+probas = tree.predict_proba(Xfull)
+n_classes = len(np.unique(tree.classes_))
+
+_, axs = plt.subplots(ncols=3, nrows=1, sharey=True, figsize=(12, 5))
+plt.suptitle("Predicted probabilities for decision tree model", y=0.8)
+
+for class_of_interest in range(n_classes):
+    axs[class_of_interest].set_title(
+        f"Class {tree.classes_[class_of_interest]}"
+    )
+    imshow_handle = axs[class_of_interest].imshow(
+        probas[:, class_of_interest].reshape((100, 100)),
+        extent=(30, 60, 10, 23),
+        vmin=0.0,
+        vmax=1.0,
+        origin="lower",
+        cmap="viridis",
+    )
+    axs[class_of_interest].set_xlabel("Culmen Length (mm)")
+    if class_of_interest == 0:
+        axs[class_of_interest].set_ylabel("Culmen Depth (mm)")
+    idx = target_test == tree.classes_[class_of_interest]
+    axs[class_of_interest].scatter(
+        data_test["Culmen Length (mm)"].loc[idx],
+        data_test["Culmen Depth (mm)"].loc[idx],
+        marker="o",
+        c="w",
+        edgecolor="k",
+    )
+
+ax = plt.axes([0.15, 0.04, 0.7, 0.05])
+plt.colorbar(imshow_handle, cax=ax, orientation="horizontal")
+_ = plt.title("Probability")
+
+# %% [markdown] tags=["solution"]
+# ```{note}
+# You may have noticed that we are no longer using a diverging colormap. Indeed,
+# the chance level for a one-vs-rest binarization of the multi-class
+# classification problem is almost never at predicted probability of 0.5. So
+# using a colormap with a neutral white at 0.5 might give a false impression of
+# the certainty.
+# ```
+#
+# In future versions of scikit-learn `DecisionBoundaryDisplay` will support a
+# `class_of_interest` parameter that will allow in particular for a
+# visualization of `predict_proba` in multi-class settings.
+#
+# We also plan to make it possible to visualize the `predict_proba` values for
+# the class with the maximum predicted probability (without having to pass a
+# fixed `class_of_interest` value).
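As a complement to the new markdown above (the paragraph in `trees_classification.py` listing the components of a tree's decision rule, and the claim that a multiclass linear classifier fits more than one separating line), here is a minimal sketch that prints those quantities directly. It is not part of the diff or of the MOOC notebooks, and the iris dataset restricted to 2 features is an assumption used as a stand-in for the penguins data so that the snippet runs on its own.

```python
# Minimal sketch (assumption: iris restricted to 2 features stands in for the
# penguins dataset used by the notebooks).
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
X = X[:, 2:]  # keep 2 features and 3 classes, mirroring the penguins setting

# A multiclass linear classifier stores one weight vector per class:
# three "oblique lines" in the 2D feature space.
linear_model = LogisticRegression(max_iter=1000).fit(X, y)
print("coef_ shape:", linear_model.coef_.shape)  # expected: (3, 2)

# A single-split tree is fully described by one split feature index, one
# threshold, and the class distribution stored at each node.
tree = DecisionTreeClassifier(max_depth=1).fit(X, y)
print("feature per node:  ", tree.tree_.feature)    # -2 marks a leaf node
print("threshold per node:", tree.tree_.threshold)
# Per-node class distribution (counts or fractions, depending on the
# scikit-learn version).
print(tree.tree_.value)
```

With `max_depth=1` the tree has three nodes (a root and two leaves), so the arrays above have length 3; relaxing `max_depth` or adding training data lets the tree grow more such entries, which is the sense in which the diff describes the model as non-parametric.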