From be733c6161993deb6e284475224d6363c99a8962 Mon Sep 17 00:00:00 2001
From: Lasse <lasseh0310@gmail.com>
Date: Thu, 27 Apr 2023 13:18:39 +0100
Subject: [PATCH 01/10] feat: add sklearn transformer

---
 pyproject.toml                             |   3 +-
 src/textdescriptives/sklearn_featurizer.py | 110 +++++++++++++++++++++
 2 files changed, 112 insertions(+), 1 deletion(-)
 create mode 100644 src/textdescriptives/sklearn_featurizer.py

diff --git a/pyproject.toml b/pyproject.toml
index 2979a123..101cfb7a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -71,7 +71,8 @@ tutorials = [
     "jupyter",
     "seaborn",
     "matplotlib",
-    "datasets>=2.8.0,<2.11.0",
+    "datasets>=2.8.0,
+    "scikit-learn">=1.1.0,
 ]
 
 [project.readme]
diff --git a/src/textdescriptives/sklearn_featurizer.py b/src/textdescriptives/sklearn_featurizer.py
new file mode 100644
index 00000000..aea5f841
--- /dev/null
+++ b/src/textdescriptives/sklearn_featurizer.py
@@ -0,0 +1,110 @@
+from typing import Iterable, List, Optional
+
+import numpy as np
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+from wasabi import msg
+
+from textdescriptives import extract_metrics
+
+
+def get_feature_names_from_metrics_and_model(
+    lang: Optional[str],
+    metrics: Optional[Iterable[str]],
+    spacy_model: Optional[str],
+    spacy_model_size: str,
+) -> List[str]:
+    """Get the names of the extracted features from the specified metrics
+    and model. Does this by extracting the metrics from an empty dummy text."""
+    df = extract_metrics(
+        text="",
+        lang=lang,
+        metrics=metrics,
+        spacy_model=spacy_model,
+        spacy_model_size=spacy_model_size,
+    )
+    return list(df.drop("text", axis=1).columns)
+
+
+class TextDescriptivesFeaturizer(TransformerMixin, BaseEstimator):
+    """Wrapper for extracting text metrics using textdescriptives and
+    using it in a sklearn pipeline."""
+
+    def __init__(
+        self,
+        lang: Optional[str] = None,
+        metrics: Optional[Iterable[str]] = None,
+        spacy_model: Optional[str] = None,
+        spacy_model_size: str = "lg",
+    ):
+        """Initialise the transformer with arguments to
+        textdescriptives.extract_metrics.
+
+        Args:
+            text (Union[str, List[str]]): A text or a list of texts.
+            lang (str, optional): Language of the text. If lang is set and no spacy
+                model is provided, will automatically download and use a spacy
+                model for the language. Defaults to None.
+            metrics (List[str]): Which metrics to extract.
+                One or more of ["descriptive_stats", "readability",
+                "dependency_distance", "pos_proportions", "coherence", "quality",
+                "information_theory"]. If None, will extract all metrics from
+                textdescriptives. Defaults to None.
+            spacy_model (str, optional): The spacy model to use. If not set,
+                will download one based on lang. Defaults to None.
+            spacy_model_size (str, optional): Size of the spacy model to download.
+        """
+        self.lang = lang
+        if isinstance(metrics, str):
+            metrics = [metrics]
+        self.metrics = metrics
+        self.spacy_model = spacy_model
+        self.spacy_model_size = spacy_model_size
+
+        if spacy_model is None and lang is None:
+            raise ValueError("Either a spacy model or a language must be provided.")
+        if spacy_model is not None and lang is not None:
+            msg.info(
+                "Both a spacy model and a language were provided. "
+                + "Will use the spacy model and ignore language.",
+            )
+        self.feature_names = get_feature_names_from_metrics_and_model(
+            lang=self.lang,
+            metrics=self.metrics,
+            spacy_model=self.spacy_model,
+            spacy_model_size=self.spacy_model_size,
+        )
+
+    def fit(self, X, y=None):
+        """Fit the transformer to the data. This is not needed for this
+        transformer, but is required for sklearn compatibility."""
+        return self
+
+    def transform(self, X) -> pd.DataFrame:
+        """Transform the data using textdescriptives.
+
+        Args:
+            X: Iterable of strings.
+
+        Returns:
+            pd.DataFrame of shape (n_samples, n_features).
+        """
+        metrics = extract_metrics(
+            X,
+            lang=self.lang,
+            metrics=self.metrics,
+            spacy_model=self.spacy_model,
+            spacy_model_size=self.spacy_model_size,
+        )
+        return metrics.drop("text", axis=1)
+
+    def get_feature_names(self) -> List[str]:
+        """Get the names of the extracted features."""
+        return self.feature_names
+
+    def get_feature_names_out(self, input_features=None) -> List[str]:
+        """Get the names of the extracted features. input_features is only
+        present for API compatibility with sklearn."""
+        return self.feature_names
+
+

From 42200725c2c89c738a07fffdc1b8f7b6d111fc6c Mon Sep 17 00:00:00 2001
From: Lasse <lasseh0310@gmail.com>
Date: Thu, 27 Apr 2023 13:18:59 +0100
Subject: [PATCH 02/10] docs: add sklearn tutorial

---
 docs/tutorial.rst                        |   1 +
 docs/tutorials/sklearn_integration.ipynb | 416 +++++++++++++++++++++++
 2 files changed, 417 insertions(+)
 create mode 100644 docs/tutorials/sklearn_integration.ipynb

diff --git a/docs/tutorial.rst b/docs/tutorial.rst
index 333c209e..499c4e20 100644
--- a/docs/tutorial.rst
+++ b/docs/tutorial.rst
@@ -11,4 +11,5 @@ locally.
 
    tutorials/introductory_tutorial.ipynb
    tutorials/filter_corpus_using_quality.ipynb
+   tutorials/sklearn_integration.ipynb
 
diff --git a/docs/tutorials/sklearn_integration.ipynb b/docs/tutorials/sklearn_integration.ipynb
new file mode 100644
index 00000000..3c89f265
--- /dev/null
+++ b/docs/tutorials/sklearn_integration.ipynb
@@ -0,0 +1,416 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Scikit-learn Integration\n",
+    "\n",
+    "<a target=\"_blank\" href=\"https://colab.research.google.com/github/HLasse/TextDescriptives/blob/main/docs/tutorials/sklearn_integration.ipynb\">\n",
+    "  <img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>\n",
+    "</a>\n",
+    "\n",
+    "In the [introductory tutorial](https://hlasse.github.io/TextDescriptives/tutorials/introductory_tutorial.html) you learned how to use the components and extractors of TextDescriptives and saw how to use them for exploratory data analysis. \n",
+    "\n",
+    "In this tutorial we will walk through how to use TextDescriptives in a sklearn pipeline for e.g. text classification."
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setup\n",
+    "\n",
+    "We'll use the same dataset as in the introductory tutorial, i.e. the [SMS Spam Collection Data Set](https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection).\n",
+    "The dataset contains 5572 SMS messages categorized as ham or spam. \n",
+    "\n",
+    "Let's load the dataset and the required packages."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "try:\n",
+    "    import textdescriptives\n",
+    "except:\n",
+    "    !pip install \"textdescriptives[tutorials]\""
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To use the functionality of TextDescriptives in an sklearn pipeline, you simply need to instantiate `TextDescriptivesFeaturizer` with the same arguments as you would provide to `extract_metrics` and wrap it in a sklearn `Pipeline`. \n",
+    "\n",
+    "Let's try training a classifier on the SMS data using the `descriptive_stats` feature set as an example. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>label</th>\n",
+       "      <th>message</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>ham</td>\n",
+       "      <td>Go until jurong point, crazy.. Available only ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>ham</td>\n",
+       "      <td>Ok lar... Joking wif u oni...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>spam</td>\n",
+       "      <td>Free entry in 2 a wkly comp to win FA Cup fina...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>ham</td>\n",
+       "      <td>U dun say so early hor... U c already then say...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>ham</td>\n",
+       "      <td>Nah I don't think he goes to usf, he lives aro...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  label                                            message\n",
+       "0   ham  Go until jurong point, crazy.. Available only ...\n",
+       "1   ham                      Ok lar... Joking wif u oni...\n",
+       "2  spam  Free entry in 2 a wkly comp to win FA Cup fina...\n",
+       "3   ham  U dun say so early hor... U c already then say...\n",
+       "4   ham  Nah I don't think he goes to usf, he lives aro..."
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from textdescriptives.utils import load_sms_data\n",
+    "df = load_sms_data()\n",
+    "df.head()"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Alright, the \"message\" column contains the text we want to extract metrics from, and the \"label\" column contains the label. Now, let's instantiate the featurizer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/au554730/Desktop/Projects/TextDescriptives/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    }
+   ],
+   "source": [
+    "from textdescriptives.sklearn_featurizer import TextDescriptivesFeaturizer\n",
+    "\n",
+    "# instantiate the featurizer with the same options as you would pass\n",
+    "# to textdescriptives.extract_metrics\n",
+    "descriptive_stats_extractor = TextDescriptivesFeaturizer(\n",
+    "    lang=\"en\", metrics=[\"descriptive_stats\"]\n",
+    "    )"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Time to make the pipeline. Make sure to wrap the featurizer in a ColumnTransformer, as it's necessary to make sure the featurizer only operates on the \"message\" column, which is the column containing the text in this example.\n",
+    "\n",
+    "As there can be missing values after extraction, we use a SimpleImputer to impute the missing values with the median.\n",
+    "\n",
+    "In the end, we use a RandomForestClassifier as the classifier, divide the data into a training and a test split and train and evaluate the model. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Test accuracy: 0.9461883408071748\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.pipeline import Pipeline\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.compose import ColumnTransformer\n",
+    "from sklearn.model_selection import train_test_split \n",
+    "from sklearn.impute import SimpleImputer\n",
+    "from sklearn import set_config\n",
+    "\n",
+    "# This tells sklearn to use pandas dataframes as output which means\n",
+    "# it's easier to access the feature names\n",
+    "set_config(transform_output=\"pandas\")\n",
+    "\n",
+    "pipe = Pipeline(\n",
+    "    [\n",
+    "        (\n",
+    "            \"featurizer\",\n",
+    "            ColumnTransformer(\n",
+    "                [(\"text_processing\", descriptive_stats_extractor, \"message\")]\n",
+    "            ,\n",
+    "            # removes the `text_processing__` prefix from feature names\n",
+    "            verbose_feature_names_out=False, \n",
+    "            ),\n",
+    "        ),\n",
+    "        (\"imputer\", SimpleImputer(strategy=\"median\")),\n",
+    "        (\"classifier\", RandomForestClassifier()),\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "#  split the data into train and test\n",
+    "X_train, X_test, y_train, y_test = train_test_split(\n",
+    "    df.drop(\"label\", axis=1),\n",
+    "    df[\"label\"],\n",
+    "    test_size=0.2,\n",
+    "    random_state=42,\n",
+    ")\n",
+    "\n",
+    "# fit the pipeline and evaluate\n",
+    "pipe.fit(X_train, y_train)\n",
+    "print(\"Test accuracy:\", pipe.score(X_test, y_test))"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Nice! TextDescriptivesFeaturizer implements the `get_feature_names_out` method, which means the feature names are passed on through the pipeline, allowing us to get informative names for e.g. feature importance."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Feature importances:\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Feature</th>\n",
+       "      <th>Importance</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>n_characters</td>\n",
+       "      <td>0.156498</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>token_length_mean</td>\n",
+       "      <td>0.149543</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>token_length_std</td>\n",
+       "      <td>0.120570</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10</th>\n",
+       "      <td>n_unique_tokens</td>\n",
+       "      <td>0.118347</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>n_tokens</td>\n",
+       "      <td>0.102766</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>11</th>\n",
+       "      <td>proportion_unique_tokens</td>\n",
+       "      <td>0.055264</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>sentence_length_std</td>\n",
+       "      <td>0.054236</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>syllables_per_token_std</td>\n",
+       "      <td>0.053973</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>syllables_per_token_mean</td>\n",
+       "      <td>0.050366</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>sentence_length_mean</td>\n",
+       "      <td>0.038362</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>sentence_length_median</td>\n",
+       "      <td>0.037550</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>13</th>\n",
+       "      <td>n_sentences</td>\n",
+       "      <td>0.031442</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>token_length_median</td>\n",
+       "      <td>0.029359</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>syllables_per_token_median</td>\n",
+       "      <td>0.001723</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                       Feature  Importance\n",
+       "12                n_characters    0.156498\n",
+       "0            token_length_mean    0.149543\n",
+       "2             token_length_std    0.120570\n",
+       "10             n_unique_tokens    0.118347\n",
+       "9                     n_tokens    0.102766\n",
+       "11    proportion_unique_tokens    0.055264\n",
+       "5          sentence_length_std    0.054236\n",
+       "8      syllables_per_token_std    0.053973\n",
+       "6     syllables_per_token_mean    0.050366\n",
+       "3         sentence_length_mean    0.038362\n",
+       "4       sentence_length_median    0.037550\n",
+       "13                 n_sentences    0.031442\n",
+       "1          token_length_median    0.029359\n",
+       "7   syllables_per_token_median    0.001723"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "# extract feature importances\n",
+    "feature_importance_mapping = list(\n",
+    "    zip(\n",
+    "        pipe[\"classifier\"].feature_names_in_,\n",
+    "        pipe.named_steps[\"classifier\"].feature_importances_,\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "print(\"Feature importances:\")\n",
+    "# sort by importance\n",
+    "df_importances = pd.DataFrame(\n",
+    "    feature_importance_mapping, columns=[\"Feature\", \"Importance\"]\n",
+    ").sort_values(by=\"Importance\", ascending=False)\n",
+    "df_importances"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.10"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From 375d3e03a8792b54f5dfd6080e27ca34925628c3 Mon Sep 17 00:00:00 2001
From: Lasse <lasseh0310@gmail.com>
Date: Thu, 27 Apr 2023 13:19:11 +0100
Subject: [PATCH 03/10] docs: minor docstring changes

---
 src/textdescriptives/extractors.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/textdescriptives/extractors.py b/src/textdescriptives/extractors.py
index 6b08d6b3..348a0ecc 100644
--- a/src/textdescriptives/extractors.py
+++ b/src/textdescriptives/extractors.py
@@ -119,8 +119,6 @@ def extract_metrics(
 
     Args:
         text (Union[str, List[str]]): A text or a list of texts.
-        spacy_model (str, optional): The spacy model to use. If not set,
-            will download one based on lang. Defaults to None.
         lang (str, optional): Language of the text. If lang is set and no spacy
             model is provided, will automatically download and use a spacy
             model for the language. Defaults to None.
@@ -129,6 +127,8 @@ def extract_metrics(
             "dependency_distance", "pos_proportions", "coherence", "quality",
             "information_theory"]. If None, will extract all metrics from
             textdescriptives. Defaults to None.
+        spacy_model (str, optional): The spacy model to use. If not set,
+            will download one based on lang. Defaults to None.
         spacy_model_size (str, optional): Size of the spacy model to download.
 
     Returns:

From 44aa0f000f3a9825cd808e3ce306c594a4404689 Mon Sep 17 00:00:00 2001
From: Lasse <lasseh0310@gmail.com>
Date: Thu, 27 Apr 2023 13:20:13 +0100
Subject: [PATCH 04/10] chore: pre-commit

---
 pyproject.toml                             | 4 ++--
 src/textdescriptives/sklearn_featurizer.py | 3 ---
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 101cfb7a..88814539 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -71,8 +71,8 @@ tutorials = [
     "jupyter",
     "seaborn",
     "matplotlib",
-    "datasets>=2.8.0,
-    "scikit-learn">=1.1.0,
+    "datasets>=2.8.0",
+    "scikit-learn>=1.1.0",
 ]
 
 [project.readme]
diff --git a/src/textdescriptives/sklearn_featurizer.py b/src/textdescriptives/sklearn_featurizer.py
index aea5f841..16d2233f 100644
--- a/src/textdescriptives/sklearn_featurizer.py
+++ b/src/textdescriptives/sklearn_featurizer.py
@@ -1,6 +1,5 @@
 from typing import Iterable, List, Optional
 
-import numpy as np
 import pandas as pd
 from sklearn.base import BaseEstimator, TransformerMixin
 from wasabi import msg
@@ -106,5 +105,3 @@ def get_feature_names_out(self, input_features=None) -> List[str]:
         """Get the names of the extracted features. input_features is only
         present for API compatibility with sklearn."""
         return self.feature_names
-
-

From a0ad6798c9a90e28fbb8661c8586cd51fbd7162e Mon Sep 17 00:00:00 2001
From: Lasse <lasseh0310@gmail.com>
Date: Thu, 27 Apr 2023 13:52:36 +0100
Subject: [PATCH 05/10] build: py3.7 compatibility

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 88814539..2ed1832e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -72,7 +72,7 @@ tutorials = [
     "seaborn",
     "matplotlib",
     "datasets>=2.8.0",
-    "scikit-learn>=1.1.0",
+    "scikit-learn>=1.0.1",
 ]
 
 [project.readme]

From 33194c937803acdd2abb4198b97ead3981958fd1 Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen <kennethcenevoldsen@gmail.com>
Date: Thu, 27 Apr 2023 11:10:59 -0700
Subject: [PATCH 06/10] docs: add sklearn to requirements in docs

---
 docs/tutorials/sklearn_integration.ipynb | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/tutorials/sklearn_integration.ipynb b/docs/tutorials/sklearn_integration.ipynb
index 3c89f265..321cc754 100644
--- a/docs/tutorials/sklearn_integration.ipynb
+++ b/docs/tutorials/sklearn_integration.ipynb
@@ -37,6 +37,7 @@
    "source": [
     "try:\n",
     "    import textdescriptives\n",
+    "    import sklearn\n",
     "except:\n",
     "    !pip install \"textdescriptives[tutorials]\""
    ]

From 12bb8c4202a444740b31c6627b05dc641cd853dc Mon Sep 17 00:00:00 2001
From: Lasse <lasseh0310@gmail.com>
Date: Thu, 27 Apr 2023 20:58:38 +0100
Subject: [PATCH 07/10] refactor: move sklearn integration to separate module

---
 docs/tutorials/sklearn_integration.ipynb      | 24 ++++++++++++-------
 src/textdescriptives/integrations/__init__.py |  0
 .../{ => integrations}/sklearn_featurizer.py  | 12 +++++++++-
 3 files changed, 27 insertions(+), 9 deletions(-)
 create mode 100644 src/textdescriptives/integrations/__init__.py
 rename src/textdescriptives/{ => integrations}/sklearn_featurizer.py (90%)

diff --git a/docs/tutorials/sklearn_integration.ipynb b/docs/tutorials/sklearn_integration.ipynb
index 3c89f265..afe26f83 100644
--- a/docs/tutorials/sklearn_integration.ipynb
+++ b/docs/tutorials/sklearn_integration.ipynb
@@ -145,16 +145,24 @@
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/au554730/Desktop/Projects/TextDescriptives/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     "ename": "ImportError",
+     "evalue": "Failed to import sklearn. If you want to use the sklearn integration, please install it with `pip install scikit-learn` or install textdescriptives with the [sklearn] extra: pip install textdescriptives[sklearn].",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+      "File \u001b[0;32m~/Desktop/Projects/TextDescriptives/src/textdescriptives/integrations/sklearn_featurizer.py:6\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m----> 6\u001b[0m     \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbase\u001b[39;00m \u001b[39mimport\u001b[39;00m BaseEstimator, TransformerMixin\n\u001b[1;32m      7\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mImportError\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'sklearn'",
+      "\nThe above exception was the direct cause of the following exception:\n",
+      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mtextdescriptives\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mintegrations\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39msklearn_featurizer\u001b[39;00m \u001b[39mimport\u001b[39;00m TextDescriptivesFeaturizer\n\u001b[1;32m      3\u001b[0m \u001b[39m# instantiate the featurizer with the same options as you would pass\u001b[39;00m\n\u001b[1;32m      4\u001b[0m \u001b[39m# to textdescriptives.extract_metrics\u001b[39;00m\n\u001b[1;32m      5\u001b[0m descriptive_stats_extractor \u001b[39m=\u001b[39m TextDescriptivesFeaturizer(\n\u001b[1;32m      6\u001b[0m     lang\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39men\u001b[39m\u001b[39m\"\u001b[39m, metrics\u001b[39m=\u001b[39m[\u001b[39m\"\u001b[39m\u001b[39mdescriptive_stats\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m      7\u001b[0m     )\n",
+      "File \u001b[0;32m~/Desktop/Projects/TextDescriptives/src/textdescriptives/integrations/sklearn_featurizer.py:8\u001b[0m\n\u001b[1;32m      6\u001b[0m     \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbase\u001b[39;00m \u001b[39mimport\u001b[39;00m BaseEstimator, TransformerMixin\n\u001b[1;32m      7\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mImportError\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m----> 8\u001b[0m     \u001b[39mraise\u001b[39;00m \u001b[39mImportError\u001b[39;00m(\n\u001b[1;32m      9\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39mFailed to import sklearn. If you want to use the sklearn integration, \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m     10\u001b[0m         \u001b[39m+\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mplease install it with `pip install scikit-learn` or install \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m     11\u001b[0m         \u001b[39m+\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mtextdescriptives with the [sklearn] extra: \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m     12\u001b[0m         \u001b[39m+\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mpip install textdescriptives[sklearn].\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m     13\u001b[0m     ) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[1;32m     14\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mwasabi\u001b[39;00m \u001b[39mimport\u001b[39;00m msg\n\u001b[1;32m     16\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mtextdescriptives\u001b[39;00m \u001b[39mimport\u001b[39;00m extract_metrics\n",
+      "\u001b[0;31mImportError\u001b[0m: Failed to import sklearn. If you want to use the sklearn integration, please install it with `pip install scikit-learn` or install textdescriptives with the [sklearn] extra: pip install textdescriptives[sklearn]."
      ]
     }
    ],
    "source": [
-    "from textdescriptives.sklearn_featurizer import TextDescriptivesFeaturizer\n",
+    "from textdescriptives.integrations.sklearn_featurizer import TextDescriptivesFeaturizer\n",
     "\n",
     "# instantiate the featurizer with the same options as you would pass\n",
     "# to textdescriptives.extract_metrics\n",
@@ -177,7 +185,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
@@ -239,7 +247,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {},
    "outputs": [
     {
diff --git a/src/textdescriptives/integrations/__init__.py b/src/textdescriptives/integrations/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/textdescriptives/sklearn_featurizer.py b/src/textdescriptives/integrations/sklearn_featurizer.py
similarity index 90%
rename from src/textdescriptives/sklearn_featurizer.py
rename to src/textdescriptives/integrations/sklearn_featurizer.py
index 16d2233f..bf2fcd42 100644
--- a/src/textdescriptives/sklearn_featurizer.py
+++ b/src/textdescriptives/integrations/sklearn_featurizer.py
@@ -1,7 +1,16 @@
 from typing import Iterable, List, Optional
 
 import pandas as pd
-from sklearn.base import BaseEstimator, TransformerMixin
+
+try:
+    from sklearn.base import BaseEstimator, TransformerMixin
+except ImportError as e:
+    raise ImportError(
+        "Failed to import sklearn. If you want to use the sklearn integration, "
+        + "please install it with `pip install scikit-learn` or install "
+        + "textdescriptives with the [sklearn] extra: "
+        + "pip install textdescriptives[sklearn]."
+    ) from e
 from wasabi import msg
 
 from textdescriptives import extract_metrics
@@ -105,3 +114,4 @@ def get_feature_names_out(self, input_features=None) -> List[str]:
         """Get the names of the extracted features. input_features is only
         present for API compatibility with sklearn."""
         return self.feature_names
+        return self.feature_names

From 86ff127c9e6f91f32a36f715d7a27d9ceda361c0 Mon Sep 17 00:00:00 2001
From: Lasse <lasseh0310@gmail.com>
Date: Thu, 27 Apr 2023 20:58:58 +0100
Subject: [PATCH 08/10] build: add sklearn as possible extra dependency

---
 pyproject.toml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 2ed1832e..929527e4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -74,6 +74,9 @@ tutorials = [
     "datasets>=2.8.0",
     "scikit-learn>=1.0.1",
 ]
+sklearn = [
+    "scikit-learn>=1.0.1",
+]
 
 [project.readme]
 file = "README.md"

From cad0b342bde99820a90f7332ba0b785ba75261de Mon Sep 17 00:00:00 2001
From: Lasse <lasseh0310@gmail.com>
Date: Thu, 27 Apr 2023 21:01:11 +0100
Subject: [PATCH 09/10] chore: pre-commit

---
 src/textdescriptives/integrations/sklearn_featurizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/textdescriptives/integrations/sklearn_featurizer.py b/src/textdescriptives/integrations/sklearn_featurizer.py
index bf2fcd42..7c1f5fd0 100644
--- a/src/textdescriptives/integrations/sklearn_featurizer.py
+++ b/src/textdescriptives/integrations/sklearn_featurizer.py
@@ -9,7 +9,7 @@
         "Failed to import sklearn. If you want to use the sklearn integration, "
         + "please install it with `pip install scikit-learn` or install "
         + "textdescriptives with the [sklearn] extra: "
-        + "pip install textdescriptives[sklearn]."
+        + "pip install textdescriptives[sklearn].",
     ) from e
 from wasabi import msg
 

From 5e68fc2428d6be4e8f5fe2e6886b77ed306337f0 Mon Sep 17 00:00:00 2001
From: Lasse <lasseh0310@gmail.com>
Date: Fri, 28 Apr 2023 09:11:27 +0100
Subject: [PATCH 10/10] build: remove python3.7 support

---
 .github/workflows/tests.yml | 2 +-
 pyproject.toml              | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 7e31f0d9..fb375f0a 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -11,7 +11,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
-        python-version: ["3.9", "3.8", "3.7"]
+        python-version: ["3.9", "3.8"]
 
     # This allows a subsequently queued workflow run to interrupt previous runs
     concurrency:
diff --git a/pyproject.toml b/pyproject.toml
index 929527e4..8659e327 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,7 +15,6 @@ classifiers = [
     "Operating System :: MacOS :: MacOS X",
     "Operating System :: Microsoft :: Windows",
     "Programming Language :: Python :: 3",
-    "Programming Language :: Python :: 3.7",
     "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
 ]
@@ -40,7 +39,7 @@ dependencies = [
     "ftfy>=6.0.3,<6.1.0",
 ]
 
-requires-python = ">=3.7"
+requires-python = ">=3.8"
 
 [project.urls]
 homepage = "https://hlasse.github.io/TextDescriptives/"
@@ -72,10 +71,10 @@ tutorials = [
     "seaborn",
     "matplotlib",
     "datasets>=2.8.0",
-    "scikit-learn>=1.0.1",
+    "scikit-learn>=1.1.1",
 ]
 sklearn = [
-    "scikit-learn>=1.0.1",
+    "scikit-learn>=1.1.1",
 ]
 
 [project.readme]