diff --git a/notebooks/pipelines_sklearn/pipelines_sklearn.ipynb b/notebooks/pipelines_sklearn/pipelines_sklearn.ipynb index 94b2b09..9ff4c1b 100644 --- a/notebooks/pipelines_sklearn/pipelines_sklearn.ipynb +++ b/notebooks/pipelines_sklearn/pipelines_sklearn.ipynb @@ -21,7 +21,7 @@ }, "outputs": [], "source": [ - "%pip install --upgrade pip pandas scikit-learn scipy" + "%pip install --upgrade pip pandas scikit-learn scipy pyarrow" ] }, { @@ -49,14 +49,15 @@ "import pandas as pd\n", "import numpy as np\n", "\n", - "from scipy.stats import randint, uniform\n", + "from scipy.stats import randint\n", "from sklearn.base import BaseEstimator, TransformerMixin\n", "from sklearn.compose import ColumnTransformer, make_column_selector\n", - "from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier\n", + "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.model_selection import RandomizedSearchCV, train_test_split\n", "from sklearn.pipeline import Pipeline\n", - "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n" + "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", + "from typing import Self\n" ] }, { @@ -296,14 +297,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Custom Preprocessing" + "# Basic Pipeline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "While sci-kit learn has a lot of preprocessing tools, some of the preprocessing steps are too specific to the dataset to be included in the library. For example, the `Cabin` column contains information about the deck, room number, and side of the ship. We can extract this information and create new features." + "First, we will create a pipeline for the numerical data. We will use the `SimpleImputer` to impute the missing values with the median. Then we will use the `StandardScaler` to scale the data." ] }, { @@ -311,28 +312,26 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-01-15T13:47:07.673725Z", - "iopub.status.busy": "2024-01-15T13:47:07.673480Z", - "iopub.status.idle": "2024-01-15T13:47:07.679498Z", - "shell.execute_reply": "2024-01-15T13:47:07.678668Z", - "shell.execute_reply.started": "2024-01-15T13:47:07.673700Z" + "iopub.execute_input": "2024-01-15T13:47:07.694513Z", + "iopub.status.busy": "2024-01-15T13:47:07.694275Z", + "iopub.status.idle": "2024-01-15T13:47:07.698948Z", + "shell.execute_reply": "2024-01-15T13:47:07.698162Z", + "shell.execute_reply.started": "2024-01-15T13:47:07.694487Z" } }, "outputs": [], "source": [ - "class PassengerIdSplitter(BaseEstimator, TransformerMixin):\n", - " \"\"\"Split the PassengerId into Group and Number\"\"\"\n", - " \n", - " def fit(self, X: pd.DataFrame, y=None):\n", - " return self\n", - "\n", - " def transform(self, X: pd.DataFrame):\n", - " # Split the PassengerId into Group and Number\n", - " X['Group'] = X['PassengerId'].str.split('_').str[0]\n", - " X['Number'] = X['PassengerId'].str.split('_').str[1]\n", - " # Drop the original column\n", - " return X.drop(['PassengerId'], axis=1)\n", - " " + "numerical_preprocessor = Pipeline([\n", + " ('imputer', SimpleImputer(strategy='median')),\n", + " ('scaler', StandardScaler())\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we will create a pipeline for the categorical data. We will use the `SimpleImputer` to impute the missing values with the most frequent value. 
Then we will use the `OneHotEncoder` to encode the categorical data. We will use the `handle_unknown='ignore'` parameter to ignore unknown categories in the test set and the `sparse_output=False` parameter to return a full array instead of a sparse matrix." ] }, { @@ -340,28 +339,27 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-01-15T13:47:07.680790Z", - "iopub.status.busy": "2024-01-15T13:47:07.680557Z", - "iopub.status.idle": "2024-01-15T13:47:07.686895Z", - "shell.execute_reply": "2024-01-15T13:47:07.685898Z", - "shell.execute_reply.started": "2024-01-15T13:47:07.680766Z" + "iopub.execute_input": "2024-01-15T13:47:07.700306Z", + "iopub.status.busy": "2024-01-15T13:47:07.700062Z", + "iopub.status.idle": "2024-01-15T13:47:07.704041Z", + "shell.execute_reply": "2024-01-15T13:47:07.703243Z", + "shell.execute_reply.started": "2024-01-15T13:47:07.700282Z" } }, "outputs": [], "source": [ - "class CabinSplitter(BaseEstimator, TransformerMixin):\n", - " \"\"\"Split the Cabin into Deck and Room\"\"\"\n", - " \n", - " def fit(self, X: pd.DataFrame, y=None):\n", - " return self\n", - " \n", - " def transform(self, X: pd.DataFrame):\n", - " # Split the Cabin into Deck, Room and Side (port or starboard)\n", - " X['Deck'] = X['Cabin'].str.split('/').str[0]\n", - " X['Room'] = X['Cabin'].str.split('/').str[1].astype(int) # treat as numerical to avoid high cardinality\n", - " X['Side'] = X['Cabin'].str.split('/').str[2]\n", - " # Drop the original column\n", - " return X.drop(['Cabin'], axis=1)" + "categorical_preprocessor = Pipeline([\n", + " ('imputer', SimpleImputer(strategy='most_frequent')),\n", + " ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Finally, we will combine the two pipelines using the `ColumnTransformer`. Here we can use the `make_column_selector` to select the columns we want to apply the specific pipeline to. This works because we kept all numerical columns which represent categorical data as strings." 
] }, { @@ -369,70 +367,71 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-01-15T13:47:07.687956Z", - "iopub.status.busy": "2024-01-15T13:47:07.687707Z", - "iopub.status.idle": "2024-01-15T13:47:07.693355Z", - "shell.execute_reply": "2024-01-15T13:47:07.692467Z", - "shell.execute_reply.started": "2024-01-15T13:47:07.687931Z" + "iopub.execute_input": "2024-01-15T13:47:07.705075Z", + "iopub.status.busy": "2024-01-15T13:47:07.704858Z", + "iopub.status.idle": "2024-01-15T13:47:07.709962Z", + "shell.execute_reply": "2024-01-15T13:47:07.708955Z", + "shell.execute_reply.started": "2024-01-15T13:47:07.705052Z" } }, "outputs": [], "source": [ - "class ColumnDropper(BaseEstimator, TransformerMixin):\n", - " \"\"\"Drop the specified columns\"\"\"\n", - "\n", - " def __init__(self, columns):\n", - " self.columns = columns\n", - "\n", - " def fit(self, X: pd.DataFrame, y=None):\n", - " return self\n", - "\n", - " def transform(self, X: pd.DataFrame):\n", - " # Drop the specified columns\n", - " return X.drop(self.columns, axis=1)" + "column_transformer = ColumnTransformer([\n", + " ('numerical_preprocessing', numerical_preprocessor, make_column_selector(dtype_include=np.number)),\n", + " ('categorical_preprocessing', categorical_preprocessor, make_column_selector(dtype_include=object))\n", + "])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Column Transformer" + "Let's add a `RandomForestClassifier` to the pipeline and see how it performs." ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "Now that we have our custom preprocessing steps, we can create a column transformer. This will allow us to apply different preprocessing steps to different columns based on their data type.\n", - "\n", - "First, we will create a pipeline for the numerical data. We will use the `SimpleImputer` to impute the missing values with the median. Then we will use the `StandardScaler` to scale the data." + "basic_pipeline = Pipeline([\n", + " ('column_transformer', column_transformer),\n", + " ('classifier', RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1))\n", + " ])" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-01-15T13:47:07.694513Z", - "iopub.status.busy": "2024-01-15T13:47:07.694275Z", - "iopub.status.idle": "2024-01-15T13:47:07.698948Z", - "shell.execute_reply": "2024-01-15T13:47:07.698162Z", - "shell.execute_reply.started": "2024-01-15T13:47:07.694487Z" - } - }, + "metadata": {}, "outputs": [], "source": [ - "numerical_preprocessor = Pipeline([\n", - " ('imputer', SimpleImputer(strategy='median')),\n", - " ('scaler', StandardScaler())\n", - "])" + "basic_pipeline.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "baseline_accuracy = basic_pipeline.score(X_test, y_test)\n", + "print(f'Accuracy score: {baseline_accuracy:.3}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Custom Preprocessing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Next, we will create a pipeline for the categorical data. We will use the `SimpleImputer` to impute the missing values with the most frequent value. Then we will use the `OneHotEncoder` to encode the categorical data. 
We will use the `handle_unknown='ignore'` parameter to ignore unknown categories in the test set and the `sparse=False` parameter to return a full array instead of a sparse matrix." + "While scikit-learn has a lot of preprocessing tools, some of the preprocessing steps are too specific to the dataset to be included in the library. For example, the `Cabin` column contains information about the deck, room number, and side of the ship. We can extract this information and create new features." ] }, { @@ -440,26 +439,68 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-01-15T13:47:07.700306Z", - "iopub.status.busy": "2024-01-15T13:47:07.700062Z", - "iopub.status.idle": "2024-01-15T13:47:07.704041Z", - "shell.execute_reply": "2024-01-15T13:47:07.703243Z", - "shell.execute_reply.started": "2024-01-15T13:47:07.700282Z" + "iopub.execute_input": "2024-01-15T13:47:07.673725Z", + "iopub.status.busy": "2024-01-15T13:47:07.673480Z", + "iopub.status.idle": "2024-01-15T13:47:07.679498Z", + "shell.execute_reply": "2024-01-15T13:47:07.678668Z", + "shell.execute_reply.started": "2024-01-15T13:47:07.673700Z" } }, "outputs": [], "source": [ - "categorical_preprocessor = Pipeline([\n", - "    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))\n", - "])" + "class PassengerIdSplitter(BaseEstimator, TransformerMixin):\n", + "    \"\"\"Split the PassengerId into Group and Number\"\"\"\n", + "    \n", + "    def fit(self, X: pd.DataFrame, y=None) -> Self:\n", + "        return self\n", + "\n", + "    def transform(self, X: pd.DataFrame) -> pd.DataFrame:\n", + "        # Split the PassengerId into Group and Number\n", + "        identifier_split = X['PassengerId'].str.split('_', expand=True)\n", + "        identifier_split.columns = ['Group', 'Number']\n", + "        \n", + "        # Concatenate the new columns\n", + "        X = pd.concat([X, identifier_split], axis=1)\n", + "\n", + "        # Drop the original column\n", + "        return X.drop(['PassengerId'], axis=1)\n", + "    " ] }, { - "cell_type": "markdown", - "metadata": {}, + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-15T13:47:07.680790Z", + "iopub.status.busy": "2024-01-15T13:47:07.680557Z", + "iopub.status.idle": "2024-01-15T13:47:07.686895Z", + "shell.execute_reply": "2024-01-15T13:47:07.685898Z", + "shell.execute_reply.started": "2024-01-15T13:47:07.680766Z" + } + }, + "outputs": [], "source": [ + "class CabinSplitter(BaseEstimator, TransformerMixin):\n", + "    \"\"\"Split the Cabin into Deck, Room and Side\"\"\"\n", + "    \n", + "    def fit(self, X: pd.DataFrame, y=None) -> Self:\n", + "        return self\n", + "    \n", + "    def transform(self, X: pd.DataFrame) -> pd.DataFrame:\n", + "        # Split the Cabin into Deck, Room and Side (port or starboard)\n", + "        cabin_df = X['Cabin'].str.split('/', expand=True)\n", + "        cabin_df.columns = ['Deck', 'Room', 'Side']\n", "\n", - "Finally, we will combine the two pipelines using the `ColumnTransformer`. Here we can use the `make_column_selector` to select the columns we want to apply the specific pipeline to. This works because we kept all numerical columns which represent categorical data as strings."
+ " # Treat room as numerical to avoid high cardinality\n", + " # Note: We are using numpys Int32 type to allow for missing values\n", + " cabin_df['Room'] = cabin_df['Room'].astype(\"Int32\")\n", + " \n", + " # Merge the new columns\n", + " X = pd.concat([X, cabin_df], axis=1)\n", + "\n", + " # Drop the original column\n", + " return X.drop(['Cabin'], axis=1)" ] }, { @@ -467,26 +508,34 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-01-15T13:47:07.705075Z", - "iopub.status.busy": "2024-01-15T13:47:07.704858Z", - "iopub.status.idle": "2024-01-15T13:47:07.709962Z", - "shell.execute_reply": "2024-01-15T13:47:07.708955Z", - "shell.execute_reply.started": "2024-01-15T13:47:07.705052Z" + "iopub.execute_input": "2024-01-15T13:47:07.687956Z", + "iopub.status.busy": "2024-01-15T13:47:07.687707Z", + "iopub.status.idle": "2024-01-15T13:47:07.693355Z", + "shell.execute_reply": "2024-01-15T13:47:07.692467Z", + "shell.execute_reply.started": "2024-01-15T13:47:07.687931Z" } }, "outputs": [], "source": [ - "column_transformer = ColumnTransformer([\n", - " ('numerical_preprocessing', numerical_preprocessor, make_column_selector(dtype_include=np.number)),\n", - " ('categorical_preprocessing', categorical_preprocessor, make_column_selector(dtype_include=object))\n", - "])" + "class ColumnDropper(BaseEstimator, TransformerMixin):\n", + " \"\"\"Drop the specified columns\"\"\"\n", + "\n", + " def __init__(self, columns):\n", + " self.columns = columns\n", + "\n", + " def fit(self, X: pd.DataFrame, y=None) -> Self:\n", + " return self\n", + "\n", + " def transform(self, X: pd.DataFrame) -> pd.DataFrame:\n", + " # Drop the specified columns\n", + " return X.drop(self.columns, axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Creating a Baseline Model" + "# Extending the Pipeline" ] }, { @@ -510,7 +559,9 @@ }, "outputs": [], "source": [ - "pipeline = Pipeline([\n", + "extended_pipeline = Pipeline([\n", + " ('passenger_id_splitter', PassengerIdSplitter()),\n", + " ('cabin_splitter', CabinSplitter()),\n", " ('column_dropper', ColumnDropper(columns=['Name'])),\n", " ('column_transformer', column_transformer),\n", " ('classifier', RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1))\n", @@ -531,7 +582,7 @@ }, "outputs": [], "source": [ - "pipeline.fit(X_train, y_train)" + "extended_pipeline.fit(X_train, y_train)" ] }, { @@ -548,22 +599,22 @@ }, "outputs": [], "source": [ - "baseline_accuracy = pipeline.score(X_test, y_test)\n", - "print(f'Accuracy score: {baseline_accuracy:.3}')" + "extended_pipeline_accuracy = extended_pipeline.score(X_test, y_test)\n", + "print(f'Accuracy score: {extended_pipeline_accuracy:.3}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Model Selection and Hyperparameter Tuning" + "# Hyperparameter Tuning" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we have a baseline model, we can try different models and tune the hyperparameters to improve the model performance. We will use the Random Forest Classifier and GradientBoosting with the `RandomizedSearchCV` to tune the hyperparameters for each model." + "Now that we have our preprocessing pipeline, we can try to improve our model by tuning the hyperparameters. We will use the `RandomizedSearchCV` to search for the best hyperparameters." 
] }, { @@ -582,20 +633,11 @@ "source": [ "search_space = [\n", " {\n", - " 'classifier': [RandomForestClassifier(random_state=RANDOM_STATE)],\n", - " 'classifier__n_estimators': randint(50, 1000),\n", - " 'classifier__max_depth': randint(3,50),\n", - " 'classifier__min_samples_split': randint(2, 100),\n", - " 'classifier__min_samples_leaf': randint(1, 50),\n", - " 'classifier__max_features': ['sqrt', 'log2'],\n", - " },\n", - " {\n", - " 'classifier': [GradientBoostingClassifier(random_state=RANDOM_STATE)],\n", - " 'classifier__n_estimators': randint(50, 1000),\n", - " 'classifier__learning_rate': uniform(0.01, 0.3),\n", - " 'classifier__max_depth': randint(3,50),\n", - " 'classifier__min_samples_split': randint(2, 100), \n", - " 'classifier__min_samples_leaf': randint(1, 50),\n", + " 'column_transformer__numerical_preprocessing__imputer__strategy': ['mean', 'median', 'most_frequent'],\n", + " 'classifier__n_estimators': randint(50, 500),\n", + " 'classifier__max_depth': randint(3, 20),\n", + " 'classifier__min_samples_split': randint(2, 50), \n", + " 'classifier__min_samples_leaf': randint(1, 25),\n", " 'classifier__max_features': ['sqrt', 'log2'],\n", " }\n", "]" @@ -616,11 +658,11 @@ "outputs": [], "source": [ "random_search = RandomizedSearchCV(\n", - " pipeline, \n", + " extended_pipeline, \n", " search_space,\n", " scoring='accuracy',\n", " refit=True,\n", - " n_iter=5000,\n", + " n_iter=1000,\n", " cv=10, \n", " verbose=1, \n", " n_jobs=-1,\n", @@ -724,7 +766,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.6" + "version": "3.11.7" } }, "nbformat": 4,
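The updated `search_space` relies on scikit-learn's nested parameter naming: each `__` in `column_transformer__numerical_preprocessing__imputer__strategy` descends one level, from the pipeline step `column_transformer` into the `numerical_preprocessing` sub-pipeline and finally to the `strategy` parameter of its `imputer` step. The sketch below is a minimal, standalone illustration of that convention; it uses toy data and illustrative names (`X_toy`, `y_toy`, `search`) rather than the notebook's own variables, so treat it as an example of the naming scheme, not as notebook code.

import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Toy stand-ins for the notebook's X_train / y_train (illustrative data only).
X_toy = pd.DataFrame({'Age': [34.0, np.nan, 19.0, 45.0],
                      'Deck': ['B', 'F', np.nan, 'B']})
y_toy = pd.Series([0, 1, 1, 0])

# Same step names as the notebook's pipeline, rebuilt here so the sketch runs on its own.
pipeline = Pipeline([
    ('column_transformer', ColumnTransformer([
        ('numerical_preprocessing',
         Pipeline([('imputer', SimpleImputer(strategy='median')),
                   ('scaler', StandardScaler())]),
         make_column_selector(dtype_include=np.number)),
        ('categorical_preprocessing',
         Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),
                   ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))]),
         make_column_selector(dtype_include=object)),
    ])),
    ('classifier', RandomForestClassifier(random_state=0)),
])

# Each '__' descends one level: pipeline step -> ColumnTransformer entry -> sub-pipeline step -> parameter.
search = RandomizedSearchCV(
    pipeline,
    {
        'column_transformer__numerical_preprocessing__imputer__strategy': ['mean', 'median'],
        'classifier__n_estimators': randint(10, 50),
    },
    n_iter=4, cv=2, random_state=0)
search.fit(X_toy, y_toy)
print(search.best_params_)

Because the addressing reaches any depth, parameters of the `scaler` or `encoder` steps could be tuned with the same pattern.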