diff --git a/notebooks/pipelines_sklearn/pipelines_sklearn.ipynb b/notebooks/pipelines_sklearn/pipelines_sklearn.ipynb index 94b2b09..9ff4c1b 100644 --- a/notebooks/pipelines_sklearn/pipelines_sklearn.ipynb +++ b/notebooks/pipelines_sklearn/pipelines_sklearn.ipynb @@ -21,7 +21,7 @@ }, "outputs": [], "source": [ - "%pip install --upgrade pip pandas scikit-learn scipy" + "%pip install --upgrade pip pandas scikit-learn scipy pyarrow" ] }, { @@ -49,14 +49,15 @@ "import pandas as pd\n", "import numpy as np\n", "\n", - "from scipy.stats import randint, uniform\n", + "from scipy.stats import randint\n", "from sklearn.base import BaseEstimator, TransformerMixin\n", "from sklearn.compose import ColumnTransformer, make_column_selector\n", - "from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier\n", + "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.model_selection import RandomizedSearchCV, train_test_split\n", "from sklearn.pipeline import Pipeline\n", - "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n" + "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", + "from typing import Self\n" ] }, { @@ -296,14 +297,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Custom Preprocessing" + "# Basic Pipeline" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "While sci-kit learn has a lot of preprocessing tools, some of the preprocessing steps are too specific to the dataset to be included in the library. For example, the `Cabin` column contains information about the deck, room number, and side of the ship. We can extract this information and create new features." + "First, we will create a pipeline for the numerical data. We will use the `SimpleImputer` to impute the missing values with the median. Then we will use the `StandardScaler` to scale the data." ] }, { @@ -311,28 +312,26 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-01-15T13:47:07.673725Z", - "iopub.status.busy": "2024-01-15T13:47:07.673480Z", - "iopub.status.idle": "2024-01-15T13:47:07.679498Z", - "shell.execute_reply": "2024-01-15T13:47:07.678668Z", - "shell.execute_reply.started": "2024-01-15T13:47:07.673700Z" + "iopub.execute_input": "2024-01-15T13:47:07.694513Z", + "iopub.status.busy": "2024-01-15T13:47:07.694275Z", + "iopub.status.idle": "2024-01-15T13:47:07.698948Z", + "shell.execute_reply": "2024-01-15T13:47:07.698162Z", + "shell.execute_reply.started": "2024-01-15T13:47:07.694487Z" } }, "outputs": [], "source": [ - "class PassengerIdSplitter(BaseEstimator, TransformerMixin):\n", - " \"\"\"Split the PassengerId into Group and Number\"\"\"\n", - " \n", - " def fit(self, X: pd.DataFrame, y=None):\n", - " return self\n", - "\n", - " def transform(self, X: pd.DataFrame):\n", - " # Split the PassengerId into Group and Number\n", - " X['Group'] = X['PassengerId'].str.split('_').str[0]\n", - " X['Number'] = X['PassengerId'].str.split('_').str[1]\n", - " # Drop the original column\n", - " return X.drop(['PassengerId'], axis=1)\n", - " " + "numerical_preprocessor = Pipeline([\n", + " ('imputer', SimpleImputer(strategy='median')),\n", + " ('scaler', StandardScaler())\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we will create a pipeline for the categorical data. We will use the `SimpleImputer` to impute the missing values with the most frequent value. 
Then we will use the `OneHotEncoder` to encode the categorical data. We will use the `handle_unknown='ignore'` parameter to ignore unknown categories in the test set and the `sparse_output=False` parameter to return a full array instead of a sparse matrix." ] }, { @@ -340,28 +339,27 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-01-15T13:47:07.680790Z", - "iopub.status.busy": "2024-01-15T13:47:07.680557Z", - "iopub.status.idle": "2024-01-15T13:47:07.686895Z", - "shell.execute_reply": "2024-01-15T13:47:07.685898Z", - "shell.execute_reply.started": "2024-01-15T13:47:07.680766Z" + "iopub.execute_input": "2024-01-15T13:47:07.700306Z", + "iopub.status.busy": "2024-01-15T13:47:07.700062Z", + "iopub.status.idle": "2024-01-15T13:47:07.704041Z", + "shell.execute_reply": "2024-01-15T13:47:07.703243Z", + "shell.execute_reply.started": "2024-01-15T13:47:07.700282Z" } }, "outputs": [], "source": [ - "class CabinSplitter(BaseEstimator, TransformerMixin):\n", - " \"\"\"Split the Cabin into Deck and Room\"\"\"\n", - " \n", - " def fit(self, X: pd.DataFrame, y=None):\n", - " return self\n", - " \n", - " def transform(self, X: pd.DataFrame):\n", - " # Split the Cabin into Deck, Room and Side (port or starboard)\n", - " X['Deck'] = X['Cabin'].str.split('/').str[0]\n", - " X['Room'] = X['Cabin'].str.split('/').str[1].astype(int) # treat as numerical to avoid high cardinality\n", - " X['Side'] = X['Cabin'].str.split('/').str[2]\n", - " # Drop the original column\n", - " return X.drop(['Cabin'], axis=1)" + "categorical_preprocessor = Pipeline([\n", + " ('imputer', SimpleImputer(strategy='most_frequent')),\n", + " ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Finally, we will combine the two pipelines using the `ColumnTransformer`. Here we can use the `make_column_selector` to select the columns we want to apply the specific pipeline to. This works because we kept all numerical columns which represent categorical data as strings." 
] }, { @@ -369,70 +367,71 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-01-15T13:47:07.687956Z", - "iopub.status.busy": "2024-01-15T13:47:07.687707Z", - "iopub.status.idle": "2024-01-15T13:47:07.693355Z", - "shell.execute_reply": "2024-01-15T13:47:07.692467Z", - "shell.execute_reply.started": "2024-01-15T13:47:07.687931Z" + "iopub.execute_input": "2024-01-15T13:47:07.705075Z", + "iopub.status.busy": "2024-01-15T13:47:07.704858Z", + "iopub.status.idle": "2024-01-15T13:47:07.709962Z", + "shell.execute_reply": "2024-01-15T13:47:07.708955Z", + "shell.execute_reply.started": "2024-01-15T13:47:07.705052Z" } }, "outputs": [], "source": [ - "class ColumnDropper(BaseEstimator, TransformerMixin):\n", - " \"\"\"Drop the specified columns\"\"\"\n", - "\n", - " def __init__(self, columns):\n", - " self.columns = columns\n", - "\n", - " def fit(self, X: pd.DataFrame, y=None):\n", - " return self\n", - "\n", - " def transform(self, X: pd.DataFrame):\n", - " # Drop the specified columns\n", - " return X.drop(self.columns, axis=1)" + "column_transformer = ColumnTransformer([\n", + " ('numerical_preprocessing', numerical_preprocessor, make_column_selector(dtype_include=np.number)),\n", + " ('categorical_preprocessing', categorical_preprocessor, make_column_selector(dtype_include=object))\n", + "])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Column Transformer" + "Let's add a `RandomForestClassifier` to the pipeline and see how it performs." ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "Now that we have our custom preprocessing steps, we can create a column transformer. This will allow us to apply different preprocessing steps to different columns based on their data type.\n", - "\n", - "First, we will create a pipeline for the numerical data. We will use the `SimpleImputer` to impute the missing values with the median. Then we will use the `StandardScaler` to scale the data." + "basic_pipeline = Pipeline([\n", + " ('column_transformer', column_transformer),\n", + " ('classifier', RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1))\n", + " ])" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "execution": { - "iopub.execute_input": "2024-01-15T13:47:07.694513Z", - "iopub.status.busy": "2024-01-15T13:47:07.694275Z", - "iopub.status.idle": "2024-01-15T13:47:07.698948Z", - "shell.execute_reply": "2024-01-15T13:47:07.698162Z", - "shell.execute_reply.started": "2024-01-15T13:47:07.694487Z" - } - }, + "metadata": {}, "outputs": [], "source": [ - "numerical_preprocessor = Pipeline([\n", - " ('imputer', SimpleImputer(strategy='median')),\n", - " ('scaler', StandardScaler())\n", - "])" + "basic_pipeline.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "baseline_accuracy = basic_pipeline.score(X_test, y_test)\n", + "print(f'Accuracy score: {baseline_accuracy:.3}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Custom Preprocessing" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Next, we will create a pipeline for the categorical data. We will use the `SimpleImputer` to impute the missing values with the most frequent value. Then we will use the `OneHotEncoder` to encode the categorical data. 
We will use the `handle_unknown='ignore'` parameter to ignore unknown categories in the test set and the `sparse=False` parameter to return a full array instead of a sparse matrix." + "While scikit-learn has a lot of preprocessing tools, some of the preprocessing steps are too specific to the dataset to be included in the library. For example, the `Cabin` column contains information about the deck, room number, and side of the ship. We can extract this information and create new features." ] }, { @@ -440,26 +439,68 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-01-15T13:47:07.700306Z", - "iopub.status.busy": "2024-01-15T13:47:07.700062Z", - "iopub.status.idle": "2024-01-15T13:47:07.704041Z", - "shell.execute_reply": "2024-01-15T13:47:07.703243Z", - "shell.execute_reply.started": "2024-01-15T13:47:07.700282Z" + "iopub.execute_input": "2024-01-15T13:47:07.673725Z", + "iopub.status.busy": "2024-01-15T13:47:07.673480Z", + "iopub.status.idle": "2024-01-15T13:47:07.679498Z", + "shell.execute_reply": "2024-01-15T13:47:07.678668Z", + "shell.execute_reply.started": "2024-01-15T13:47:07.673700Z" } }, "outputs": [], "source": [ - "categorical_preprocessor = Pipeline([\n", - "    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))\n", - "])" + "class PassengerIdSplitter(BaseEstimator, TransformerMixin):\n", + "    \"\"\"Split the PassengerId into Group and Number\"\"\"\n", + "    \n", + "    def fit(self, X: pd.DataFrame, y=None) -> Self:\n", + "        return self\n", + "\n", + "    def transform(self, X: pd.DataFrame) -> pd.DataFrame:\n", + "        # Split the PassengerId into Group and Number\n", + "        identifier_split = X['PassengerId'].str.split('_', expand=True)\n", + "        identifier_split.columns = ['Group', 'Number']\n", + "        \n", + "        # Concatenate the new columns\n", + "        X = pd.concat([X, identifier_split], axis=1)\n", + "\n", + "        # Drop the original column\n", + "        return X.drop(['PassengerId'], axis=1)\n", + "    " ] }, { - "cell_type": "markdown", - "metadata": {}, + "cell_type": "code", + "execution_count": null, + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-15T13:47:07.680790Z", + "iopub.status.busy": "2024-01-15T13:47:07.680557Z", + "iopub.status.idle": "2024-01-15T13:47:07.686895Z", + "shell.execute_reply": "2024-01-15T13:47:07.685898Z", + "shell.execute_reply.started": "2024-01-15T13:47:07.680766Z" + } + }, + "outputs": [], "source": [ + "class CabinSplitter(BaseEstimator, TransformerMixin):\n", + "    \"\"\"Split the Cabin into Deck, Room and Side\"\"\"\n", + "    \n", + "    def fit(self, X: pd.DataFrame, y=None) -> Self:\n", + "        return self\n", + "    \n", + "    def transform(self, X: pd.DataFrame) -> pd.DataFrame:\n", + "        # Split the Cabin into Deck, Room and Side (port or starboard)\n", + "        cabin_df = X['Cabin'].str.split('/', expand=True)\n", + "        cabin_df.columns = ['Deck', 'Room', 'Side']\n", "\n", - "Finally, we will combine the two pipelines using the `ColumnTransformer`. Here we can use the `make_column_selector` to select the columns we want to apply the specific pipeline to. This works because we kept all numerical columns which represent categorical data as strings."
+ " # Treat room as numerical to avoid high cardinality\n", + " # Note: We are using numpys Int32 type to allow for missing values\n", + " cabin_df['Room'] = cabin_df['Room'].astype(\"Int32\")\n", + " \n", + " # Merge the new columns\n", + " X = pd.concat([X, cabin_df], axis=1)\n", + "\n", + " # Drop the original column\n", + " return X.drop(['Cabin'], axis=1)" ] }, { @@ -467,26 +508,34 @@ "execution_count": null, "metadata": { "execution": { - "iopub.execute_input": "2024-01-15T13:47:07.705075Z", - "iopub.status.busy": "2024-01-15T13:47:07.704858Z", - "iopub.status.idle": "2024-01-15T13:47:07.709962Z", - "shell.execute_reply": "2024-01-15T13:47:07.708955Z", - "shell.execute_reply.started": "2024-01-15T13:47:07.705052Z" + "iopub.execute_input": "2024-01-15T13:47:07.687956Z", + "iopub.status.busy": "2024-01-15T13:47:07.687707Z", + "iopub.status.idle": "2024-01-15T13:47:07.693355Z", + "shell.execute_reply": "2024-01-15T13:47:07.692467Z", + "shell.execute_reply.started": "2024-01-15T13:47:07.687931Z" } }, "outputs": [], "source": [ - "column_transformer = ColumnTransformer([\n", - " ('numerical_preprocessing', numerical_preprocessor, make_column_selector(dtype_include=np.number)),\n", - " ('categorical_preprocessing', categorical_preprocessor, make_column_selector(dtype_include=object))\n", - "])" + "class ColumnDropper(BaseEstimator, TransformerMixin):\n", + " \"\"\"Drop the specified columns\"\"\"\n", + "\n", + " def __init__(self, columns):\n", + " self.columns = columns\n", + "\n", + " def fit(self, X: pd.DataFrame, y=None) -> Self:\n", + " return self\n", + "\n", + " def transform(self, X: pd.DataFrame) -> pd.DataFrame:\n", + " # Drop the specified columns\n", + " return X.drop(self.columns, axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Creating a Baseline Model" + "# Extending the Pipeline" ] }, { @@ -510,7 +559,9 @@ }, "outputs": [], "source": [ - "pipeline = Pipeline([\n", + "extended_pipeline = Pipeline([\n", + " ('passenger_id_splitter', PassengerIdSplitter()),\n", + " ('cabin_splitter', CabinSplitter()),\n", " ('column_dropper', ColumnDropper(columns=['Name'])),\n", " ('column_transformer', column_transformer),\n", " ('classifier', RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1))\n", @@ -531,7 +582,7 @@ }, "outputs": [], "source": [ - "pipeline.fit(X_train, y_train)" + "extended_pipeline.fit(X_train, y_train)" ] }, { @@ -548,22 +599,22 @@ }, "outputs": [], "source": [ - "baseline_accuracy = pipeline.score(X_test, y_test)\n", - "print(f'Accuracy score: {baseline_accuracy:.3}')" + "extended_pipeline_accuracy = extended_pipeline.score(X_test, y_test)\n", + "print(f'Accuracy score: {extended_pipeline_accuracy:.3}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Model Selection and Hyperparameter Tuning" + "# Hyperparameter Tuning" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we have a baseline model, we can try different models and tune the hyperparameters to improve the model performance. We will use the Random Forest Classifier and GradientBoosting with the `RandomizedSearchCV` to tune the hyperparameters for each model." + "Now that we have our preprocessing pipeline, we can try to improve our model by tuning the hyperparameters. We will use the `RandomizedSearchCV` to search for the best hyperparameters." 
] }, { @@ -582,20 +633,11 @@ "source": [ "search_space = [\n", " {\n", - " 'classifier': [RandomForestClassifier(random_state=RANDOM_STATE)],\n", - " 'classifier__n_estimators': randint(50, 1000),\n", - " 'classifier__max_depth': randint(3,50),\n", - " 'classifier__min_samples_split': randint(2, 100),\n", - " 'classifier__min_samples_leaf': randint(1, 50),\n", - " 'classifier__max_features': ['sqrt', 'log2'],\n", - " },\n", - " {\n", - " 'classifier': [GradientBoostingClassifier(random_state=RANDOM_STATE)],\n", - " 'classifier__n_estimators': randint(50, 1000),\n", - " 'classifier__learning_rate': uniform(0.01, 0.3),\n", - " 'classifier__max_depth': randint(3,50),\n", - " 'classifier__min_samples_split': randint(2, 100), \n", - " 'classifier__min_samples_leaf': randint(1, 50),\n", + " 'column_transformer__numerical_preprocessing__imputer__strategy': ['mean', 'median', 'most_frequent'],\n", + " 'classifier__n_estimators': randint(50, 500),\n", + " 'classifier__max_depth': randint(3, 20),\n", + " 'classifier__min_samples_split': randint(2, 50), \n", + " 'classifier__min_samples_leaf': randint(1, 25),\n", " 'classifier__max_features': ['sqrt', 'log2'],\n", " }\n", "]" @@ -616,11 +658,11 @@ "outputs": [], "source": [ "random_search = RandomizedSearchCV(\n", - " pipeline, \n", + " extended_pipeline, \n", " search_space,\n", " scoring='accuracy',\n", " refit=True,\n", - " n_iter=5000,\n", + " n_iter=1000,\n", " cv=10, \n", " verbose=1, \n", " n_jobs=-1,\n", @@ -724,7 +766,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.6" + "version": "3.11.7" } }, "nbformat": 4,
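The updated `search_space` relies on scikit-learn's nested parameter naming: each `__` in `column_transformer__numerical_preprocessing__imputer__strategy` descends one level, from the pipeline step `column_transformer` into the `numerical_preprocessing` sub-pipeline and finally to the `strategy` parameter of its `imputer` step. The sketch below is a minimal, standalone illustration of that convention; it uses toy data and illustrative names (`X_toy`, `y_toy`, `search`) rather than the notebook's own variables, so treat it as an example of the naming scheme, not as notebook code.

import numpy as np
import pandas as pd
from scipy.stats import randint
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Toy stand-ins for the notebook's X_train / y_train (illustrative data only).
X_toy = pd.DataFrame({'Age': [34.0, np.nan, 19.0, 45.0],
                      'Deck': ['B', 'F', np.nan, 'B']})
y_toy = pd.Series([0, 1, 1, 0])

# Same step names as the notebook's pipeline, rebuilt here so the sketch runs on its own.
pipeline = Pipeline([
    ('column_transformer', ColumnTransformer([
        ('numerical_preprocessing',
         Pipeline([('imputer', SimpleImputer(strategy='median')),
                   ('scaler', StandardScaler())]),
         make_column_selector(dtype_include=np.number)),
        ('categorical_preprocessing',
         Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),
                   ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))]),
         make_column_selector(dtype_include=object)),
    ])),
    ('classifier', RandomForestClassifier(random_state=0)),
])

# Each '__' descends one level: pipeline step -> ColumnTransformer entry -> sub-pipeline step -> parameter.
search = RandomizedSearchCV(
    pipeline,
    {
        'column_transformer__numerical_preprocessing__imputer__strategy': ['mean', 'median'],
        'classifier__n_estimators': randint(10, 50),
    },
    n_iter=4, cv=2, random_state=0)
search.fit(X_toy, y_toy)
print(search.best_params_)

Because the addressing reaches any depth, parameters of the `scaler` or `encoder` steps could be tuned with the same pattern.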