From be733c6161993deb6e284475224d6363c99a8962 Mon Sep 17 00:00:00 2001 From: Lasse Date: Thu, 27 Apr 2023 13:18:39 +0100 Subject: [PATCH 01/10] feat: add sklearn transformer --- pyproject.toml | 3 +- src/textdescriptives/sklearn_featurizer.py | 110 +++++++++++++++++++++ 2 files changed, 112 insertions(+), 1 deletion(-) create mode 100644 src/textdescriptives/sklearn_featurizer.py diff --git a/pyproject.toml b/pyproject.toml index 2979a123..101cfb7a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,8 @@ tutorials = [ "jupyter", "seaborn", "matplotlib", - "datasets>=2.8.0,<2.11.0", + "datasets>=2.8.0, + "scikit-learn">=1.1.0, ] [project.readme] diff --git a/src/textdescriptives/sklearn_featurizer.py b/src/textdescriptives/sklearn_featurizer.py new file mode 100644 index 00000000..aea5f841 --- /dev/null +++ b/src/textdescriptives/sklearn_featurizer.py @@ -0,0 +1,110 @@ +from typing import Iterable, List, Optional + +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin +from wasabi import msg + +from textdescriptives import extract_metrics + + +def get_feature_names_from_metrics_and_model( + lang: Optional[str], + metrics: Optional[Iterable[str]], + spacy_model: Optional[str], + spacy_model_size: str, +) -> List[str]: + """Get the names of the extracted features from the specified metrics + and model. 
Does this by extracting the metrics from an empty dummy text.""" + df = extract_metrics( + text="", + lang=lang, + metrics=metrics, + spacy_model=spacy_model, + spacy_model_size=spacy_model_size, + ) + return list(df.drop("text", axis=1).columns) + + +class TextDescriptivesFeaturizer(TransformerMixin, BaseEstimator): + """Wrapper for extracting text metrics using textdescriptives and + using it in a sklearn pipeline.""" + + def __init__( + self, + lang: Optional[str] = None, + metrics: Optional[Iterable[str]] = None, + spacy_model: Optional[str] = None, + spacy_model_size: str = "lg", + ): + """Initialise the transformer with arguments to + textdescriptives.extract_metrics. + + Args: + text (Union[str, List[str]]): A text or a list of texts. + lang (str, optional): Language of the text. If lang is set and no spacy + model is provided, will automatically download and use a spacy + model for the language. Defaults to None. + metrics (List[str]): Which metrics to extract. + One or more of ["descriptive_stats", "readability", + "dependency_distance", "pos_proportions", "coherence", "quality", + "information_theory"]. If None, will extract all metrics from + textdescriptives. Defaults to None. + spacy_model (str, optional): The spacy model to use. If not set, + will download one based on lang. Defaults to None. + spacy_model_size (str, optional): Size of the spacy model to download. + """ + self.lang = lang + if isinstance(metrics, str): + metrics = [metrics] + self.metrics = metrics + self.spacy_model = spacy_model + self.spacy_model_size = spacy_model_size + + if spacy_model is None and lang is None: + raise ValueError("Either a spacy model or a language must be provided.") + if spacy_model is not None and lang is not None: + msg.info( + "Both a spacy model and a language were provided. 
" + + "Will use the spacy model and ignore language.", + ) + self.feature_names = get_feature_names_from_metrics_and_model( + lang=self.lang, + metrics=self.metrics, + spacy_model=self.spacy_model, + spacy_model_size=self.spacy_model_size, + ) + + def fit(self, X, y=None): + """Fit the transformer to the data. This is not needed for this + transformer, but is required for sklearn compatibility.""" + return self + + def transform(self, X) -> pd.DataFrame: + """Transform the data using textdescriptives. + + Args: + X: Iterable of strings. + + Returns: + Numpy array of shape (n_samples, n_features). + """ + metrics = extract_metrics( + X, + lang=self.lang, + metrics=self.metrics, + spacy_model=self.spacy_model, + spacy_model_size=self.spacy_model_size, + ) + return metrics.drop("text", axis=1) + + def get_feature_names(self) -> List[str]: + """Get the names of the extracted features.""" + return self.feature_names + + def get_feature_names_out(self, input_features=None) -> List[str]: + """Get the names of the extracted features. input_features is only + present for API compatibility with sklearn.""" + return self.feature_names + + From 42200725c2c89c738a07fffdc1b8f7b6d111fc6c Mon Sep 17 00:00:00 2001 From: Lasse Date: Thu, 27 Apr 2023 13:18:59 +0100 Subject: [PATCH 02/10] docs: add sklearn tutorial --- docs/tutorial.rst | 1 + docs/tutorials/sklearn_integration.ipynb | 416 +++++++++++++++++++++++ 2 files changed, 417 insertions(+) create mode 100644 docs/tutorials/sklearn_integration.ipynb diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 333c209e..499c4e20 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -11,4 +11,5 @@ locally. 
tutorials/introductory_tutorial.ipynb tutorials/filter_corpus_using_quality.ipynb + tutorials/sklearn_integration.ipynb diff --git a/docs/tutorials/sklearn_integration.ipynb b/docs/tutorials/sklearn_integration.ipynb new file mode 100644 index 00000000..3c89f265 --- /dev/null +++ b/docs/tutorials/sklearn_integration.ipynb @@ -0,0 +1,416 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Scikit-learn Integration\n", + "\n", + "\n", + " \"Open\n", + "\n", + "\n", + "In the [introductory tutorial](https://hlasse.github.io/TextDescriptives/tutorials/introductory_tutorial.html) you learned how to use the components and extractors of TextDescriptives and saw how to use them for exploratory data analysis. \n", + "\n", + "In this tutorial we will walk through how to use TextDescriptives in a sklearn pipeline for e.g. text classification." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "We'll use the same dataset as in the introductory tutorial, i.e. the [SMS Spam Collection Data Set](https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection).\n", + "The dataset contains 5572 SMS messages categorized as ham or spam. \n", + "\n", + "Let's load the dataset and the required packages." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " import textdescriptives\n", + "except:\n", + " !pip install \"textdescriptives[tutorials]\"" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use the functionality of TextDescriptives in an sklearn pipeline, you simply need to instantiate `TextDescriptivesFeaturizer` with the same arguments as you would provide to `extract_metrics` and wrap it in a sklearn `Pipeline`. \n", + "\n", + "Let's try training a classifier on the SMS data using the `descriptive_stats` feature set as an example. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
labelmessage
0hamGo until jurong point, crazy.. Available only ...
1hamOk lar... Joking wif u oni...
2spamFree entry in 2 a wkly comp to win FA Cup fina...
3hamU dun say so early hor... U c already then say...
4hamNah I don't think he goes to usf, he lives aro...
\n", + "
" + ], + "text/plain": [ + " label message\n", + "0 ham Go until jurong point, crazy.. Available only ...\n", + "1 ham Ok lar... Joking wif u oni...\n", + "2 spam Free entry in 2 a wkly comp to win FA Cup fina...\n", + "3 ham U dun say so early hor... U c already then say...\n", + "4 ham Nah I don't think he goes to usf, he lives aro..." + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from textdescriptives.utils import load_sms_data\n", + "df = load_sms_data()\n", + "df.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alright, the \"message\" column contains the text we want to extract metrics from, and the \"label\" column contains the label. Now, let's instantiate the featurizer." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/au554730/Desktop/Projects/TextDescriptives/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from textdescriptives.sklearn_featurizer import TextDescriptivesFeaturizer\n", + "\n", + "# instantiate the featurizer with the same options as you would pass\n", + "# to textdescriptives.extract_metrics\n", + "descriptive_stats_extractor = TextDescriptivesFeaturizer(\n", + " lang=\"en\", metrics=[\"descriptive_stats\"]\n", + " )" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Time to make the pipeline. 
Make sure to wrap the featurizer in a ColumnTransformer, as it's necessary to make sure the featurizer only operates on the \"message\" column, which is the column containing the text in this example.\n", + "\n", + "As there can be missing values after extraction, we use a SimpleImputer to impute the missing values with the median.\n", + "\n", + "In the end, we use a RandomForestClassifier as the classifier, divide the data into a training and a test split and train and evaluate the model. " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Test accuracy: 0.9461883408071748\n" + ] + } + ], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.model_selection import train_test_split \n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn import set_config\n", + "\n", + "# This tells sklearn to use pandas dataframes as output which means\n", + "# it's easier to access the feature names\n", + "set_config(transform_output=\"pandas\")\n", + "\n", + "pipe = Pipeline(\n", + " [\n", + " (\n", + " \"featurizer\",\n", + " ColumnTransformer(\n", + " [(\"text_processing\", descriptive_stats_extractor, \"message\")]\n", + " ,\n", + " # removes the `text_processing__` prefix from feature names\n", + " verbose_feature_names_out=False, \n", + " ),\n", + " ),\n", + " (\"imputer\", SimpleImputer(strategy=\"median\")),\n", + " (\"classifier\", RandomForestClassifier()),\n", + " ]\n", + ")\n", + "\n", + "# split the data into train and test\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " df.drop(\"label\", axis=1),\n", + " df[\"label\"],\n", + " test_size=0.2,\n", + " random_state=42,\n", + ")\n", + "\n", + "# fit the pipeline and evaluate\n", + "pipe.fit(X_train, y_train)\n", + "print(\"Test accuracy:\", 
pipe.score(X_test, y_test))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Nice! TextDescriptivesFeaturizer implements the `get_feature_names_out` method, which means the feature names are passed on in the pipeline, allowing us to get informative names for e.g. feature importance." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Feature importances:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
FeatureImportance
12n_characters0.156498
0token_length_mean0.149543
2token_length_std0.120570
10n_unique_tokens0.118347
9n_tokens0.102766
11proportion_unique_tokens0.055264
5sentence_length_std0.054236
8syllables_per_token_std0.053973
6syllables_per_token_mean0.050366
3sentence_length_mean0.038362
4sentence_length_median0.037550
13n_sentences0.031442
1token_length_median0.029359
7syllables_per_token_median0.001723
\n", + "
" + ], + "text/plain": [ + " Feature Importance\n", + "12 n_characters 0.156498\n", + "0 token_length_mean 0.149543\n", + "2 token_length_std 0.120570\n", + "10 n_unique_tokens 0.118347\n", + "9 n_tokens 0.102766\n", + "11 proportion_unique_tokens 0.055264\n", + "5 sentence_length_std 0.054236\n", + "8 syllables_per_token_std 0.053973\n", + "6 syllables_per_token_mean 0.050366\n", + "3 sentence_length_mean 0.038362\n", + "4 sentence_length_median 0.037550\n", + "13 n_sentences 0.031442\n", + "1 token_length_median 0.029359\n", + "7 syllables_per_token_median 0.001723" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "# extract feature importances\n", + "feature_importance_mapping = list(\n", + " zip(\n", + " pipe[\"classifier\"].feature_names_in_,\n", + " pipe.named_steps[\"classifier\"].feature_importances_,\n", + " )\n", + ")\n", + "\n", + "print(\"Feature importances:\")\n", + "# sort by importance\n", + "df_importances = pd.DataFrame(\n", + " feature_importance_mapping, columns=[\"Feature\", \"Importance\"]\n", + ").sort_values(by=\"Importance\", ascending=False)\n", + "df_importances" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 375d3e03a8792b54f5dfd6080e27ca34925628c3 Mon Sep 17 00:00:00 2001 From: Lasse Date: Thu, 27 Apr 2023 13:19:11 +0100 Subject: [PATCH 03/10] docs: minor docstring changes --- src/textdescriptives/extractors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/textdescriptives/extractors.py b/src/textdescriptives/extractors.py 
index 6b08d6b3..348a0ecc 100644 --- a/src/textdescriptives/extractors.py +++ b/src/textdescriptives/extractors.py @@ -119,8 +119,6 @@ def extract_metrics( Args: text (Union[str, List[str]]): A text or a list of texts. - spacy_model (str, optional): The spacy model to use. If not set, - will download one based on lang. Defaults to None. lang (str, optional): Language of the text. If lang is set and no spacy model is provided, will automatically download and use a spacy model for the language. Defaults to None. @@ -129,6 +127,8 @@ def extract_metrics( "dependency_distance", "pos_proportions", "coherence", "quality", "information_theory"]. If None, will extract all metrics from textdescriptives. Defaults to None. + spacy_model (str, optional): The spacy model to use. If not set, + will download one based on lang. Defaults to None. spacy_model_size (str, optional): Size of the spacy model to download. Returns: From 44aa0f000f3a9825cd808e3ce306c594a4404689 Mon Sep 17 00:00:00 2001 From: Lasse Date: Thu, 27 Apr 2023 13:20:13 +0100 Subject: [PATCH 04/10] chore: pre-commit --- pyproject.toml | 4 ++-- src/textdescriptives/sklearn_featurizer.py | 3 --- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 101cfb7a..88814539 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,8 +71,8 @@ tutorials = [ "jupyter", "seaborn", "matplotlib", - "datasets>=2.8.0, - "scikit-learn">=1.1.0, + "datasets>=2.8.0", + "scikit-learn>=1.1.0", ] [project.readme] diff --git a/src/textdescriptives/sklearn_featurizer.py b/src/textdescriptives/sklearn_featurizer.py index aea5f841..16d2233f 100644 --- a/src/textdescriptives/sklearn_featurizer.py +++ b/src/textdescriptives/sklearn_featurizer.py @@ -1,6 +1,5 @@ from typing import Iterable, List, Optional -import numpy as np import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin from wasabi import msg @@ -106,5 +105,3 @@ def get_feature_names_out(self, input_features=None) -> 
List[str]: """Get the names of the extracted features. input_features is only present for API compatibility with sklearn.""" return self.feature_names - - From a0ad6798c9a90e28fbb8661c8586cd51fbd7162e Mon Sep 17 00:00:00 2001 From: Lasse Date: Thu, 27 Apr 2023 13:52:36 +0100 Subject: [PATCH 05/10] build: py3.7 compatibility --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 88814539..2ed1832e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,7 +72,7 @@ tutorials = [ "seaborn", "matplotlib", "datasets>=2.8.0", - "scikit-learn>=1.1.0", + "scikit-learn>=1.0.1", ] [project.readme] From 33194c937803acdd2abb4198b97ead3981958fd1 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Thu, 27 Apr 2023 11:10:59 -0700 Subject: [PATCH 06/10] docs: add sklearn to requirements in docs --- docs/tutorials/sklearn_integration.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/tutorials/sklearn_integration.ipynb b/docs/tutorials/sklearn_integration.ipynb index 3c89f265..321cc754 100644 --- a/docs/tutorials/sklearn_integration.ipynb +++ b/docs/tutorials/sklearn_integration.ipynb @@ -37,6 +37,7 @@ "source": [ "try:\n", " import textdescriptives\n", + " import sklearn\n", "except:\n", " !pip install \"textdescriptives[tutorials]\"" ] From 12bb8c4202a444740b31c6627b05dc641cd853dc Mon Sep 17 00:00:00 2001 From: Lasse Date: Thu, 27 Apr 2023 20:58:38 +0100 Subject: [PATCH 07/10] refactor: move sklearn integration to separate module --- docs/tutorials/sklearn_integration.ipynb | 24 ++++++++++++------- src/textdescriptives/integrations/__init__.py | 0 .../{ => integrations}/sklearn_featurizer.py | 12 +++++++++- 3 files changed, 27 insertions(+), 9 deletions(-) create mode 100644 src/textdescriptives/integrations/__init__.py rename src/textdescriptives/{ => integrations}/sklearn_featurizer.py (90%) diff --git a/docs/tutorials/sklearn_integration.ipynb b/docs/tutorials/sklearn_integration.ipynb index 
3c89f265..afe26f83 100644 --- a/docs/tutorials/sklearn_integration.ipynb +++ b/docs/tutorials/sklearn_integration.ipynb @@ -145,16 +145,24 @@ "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/au554730/Desktop/Projects/TextDescriptives/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" + "ename": "ImportError", + "evalue": "Failed to import sklearn. If you want to use the sklearn integration, please install it with `pip install scikit-learn` or install textdescriptives with the [sklearn] extra: pip install textdescriptives[sklearn].", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m~/Desktop/Projects/TextDescriptives/src/textdescriptives/integrations/sklearn_featurizer.py:6\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m----> 6\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbase\u001b[39;00m \u001b[39mimport\u001b[39;00m BaseEstimator, TransformerMixin\n\u001b[1;32m 7\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mImportError\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'sklearn'", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[39mfrom\u001b[39;00m 
\u001b[39mtextdescriptives\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mintegrations\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39msklearn_featurizer\u001b[39;00m \u001b[39mimport\u001b[39;00m TextDescriptivesFeaturizer\n\u001b[1;32m 3\u001b[0m \u001b[39m# instantiate the featurizer with the same options as you would pass\u001b[39;00m\n\u001b[1;32m 4\u001b[0m \u001b[39m# to textdescriptives.extract_metrics\u001b[39;00m\n\u001b[1;32m 5\u001b[0m descriptive_stats_extractor \u001b[39m=\u001b[39m TextDescriptivesFeaturizer(\n\u001b[1;32m 6\u001b[0m lang\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39men\u001b[39m\u001b[39m\"\u001b[39m, metrics\u001b[39m=\u001b[39m[\u001b[39m\"\u001b[39m\u001b[39mdescriptive_stats\u001b[39m\u001b[39m\"\u001b[39m]\n\u001b[1;32m 7\u001b[0m )\n", + "File \u001b[0;32m~/Desktop/Projects/TextDescriptives/src/textdescriptives/integrations/sklearn_featurizer.py:8\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39msklearn\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbase\u001b[39;00m \u001b[39mimport\u001b[39;00m BaseEstimator, TransformerMixin\n\u001b[1;32m 7\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mImportError\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m----> 8\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mImportError\u001b[39;00m(\n\u001b[1;32m 9\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mFailed to import sklearn. 
If you want to use the sklearn integration, \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 10\u001b[0m \u001b[39m+\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mplease install it with `pip install scikit-learn` or install \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 11\u001b[0m \u001b[39m+\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mtextdescriptives with the [sklearn] extra: \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 12\u001b[0m \u001b[39m+\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mpip install textdescriptives[sklearn].\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 13\u001b[0m ) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[1;32m 14\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mwasabi\u001b[39;00m \u001b[39mimport\u001b[39;00m msg\n\u001b[1;32m 16\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mtextdescriptives\u001b[39;00m \u001b[39mimport\u001b[39;00m extract_metrics\n", + "\u001b[0;31mImportError\u001b[0m: Failed to import sklearn. If you want to use the sklearn integration, please install it with `pip install scikit-learn` or install textdescriptives with the [sklearn] extra: pip install textdescriptives[sklearn]." 
] } ], "source": [ - "from textdescriptives.sklearn_featurizer import TextDescriptivesFeaturizer\n", + "from textdescriptives.integrations.sklearn_featurizer import TextDescriptivesFeaturizer\n", "\n", "# instantiate the featurizer with the same options as you would pass\n", "# to textdescriptives.extract_metrics\n", @@ -177,7 +185,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -239,7 +247,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [ { diff --git a/src/textdescriptives/integrations/__init__.py b/src/textdescriptives/integrations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/textdescriptives/sklearn_featurizer.py b/src/textdescriptives/integrations/sklearn_featurizer.py similarity index 90% rename from src/textdescriptives/sklearn_featurizer.py rename to src/textdescriptives/integrations/sklearn_featurizer.py index 16d2233f..bf2fcd42 100644 --- a/src/textdescriptives/sklearn_featurizer.py +++ b/src/textdescriptives/integrations/sklearn_featurizer.py @@ -1,7 +1,16 @@ from typing import Iterable, List, Optional import pandas as pd -from sklearn.base import BaseEstimator, TransformerMixin + +try: + from sklearn.base import BaseEstimator, TransformerMixin +except ImportError as e: + raise ImportError( + "Failed to import sklearn. If you want to use the sklearn integration, " + + "please install it with `pip install scikit-learn` or install " + + "textdescriptives with the [sklearn] extra: " + + "pip install textdescriptives[sklearn]." + ) from e from wasabi import msg from textdescriptives import extract_metrics @@ -105,3 +114,4 @@ def get_feature_names_out(self, input_features=None) -> List[str]: """Get the names of the extracted features. 
input_features is only present for API compatibility with sklearn.""" return self.feature_names + return self.feature_names From 86ff127c9e6f91f32a36f715d7a27d9ceda361c0 Mon Sep 17 00:00:00 2001 From: Lasse Date: Thu, 27 Apr 2023 20:58:58 +0100 Subject: [PATCH 08/10] build: as sklearn as possible extra dependency --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 2ed1832e..929527e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,6 +74,9 @@ tutorials = [ "datasets>=2.8.0", "scikit-learn>=1.0.1", ] +sklearn = [ + "scikit-learn>=1.0.1", +] [project.readme] file = "README.md" From cad0b342bde99820a90f7332ba0b785ba75261de Mon Sep 17 00:00:00 2001 From: Lasse Date: Thu, 27 Apr 2023 21:01:11 +0100 Subject: [PATCH 09/10] chore: pre-commit --- src/textdescriptives/integrations/sklearn_featurizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/textdescriptives/integrations/sklearn_featurizer.py b/src/textdescriptives/integrations/sklearn_featurizer.py index bf2fcd42..7c1f5fd0 100644 --- a/src/textdescriptives/integrations/sklearn_featurizer.py +++ b/src/textdescriptives/integrations/sklearn_featurizer.py @@ -9,7 +9,7 @@ "Failed to import sklearn. If you want to use the sklearn integration, " + "please install it with `pip install scikit-learn` or install " + "textdescriptives with the [sklearn] extra: " - + "pip install textdescriptives[sklearn]." 
+ + "pip install textdescriptives[sklearn].", ) from e from wasabi import msg From 5e68fc2428d6be4e8f5fe2e6886b77ed306337f0 Mon Sep 17 00:00:00 2001 From: Lasse Date: Fri, 28 Apr 2023 09:11:27 +0100 Subject: [PATCH 10/10] build: remove python3.7 support --- .github/workflows/tests.yml | 2 +- pyproject.toml | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 7e31f0d9..fb375f0a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -11,7 +11,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] - python-version: ["3.9", "3.8", "3.7"] + python-version: ["3.9", "3.8"] # This allows a subsequently queued workflow run to interrupt previous runs concurrency: diff --git a/pyproject.toml b/pyproject.toml index 929527e4..8659e327 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,7 +15,6 @@ classifiers = [ "Operating System :: MacOS :: MacOS X", "Operating System :: Microsoft :: Windows", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", ] @@ -40,7 +39,7 @@ dependencies = [ "ftfy>=6.0.3,<6.1.0", ] -requires-python = ">=3.7" +requires-python = ">=3.8" [project.urls] homepage = "https://hlasse.github.io/TextDescriptives/" @@ -72,10 +71,10 @@ tutorials = [ "seaborn", "matplotlib", "datasets>=2.8.0", - "scikit-learn>=1.0.1", + "scikit-learn>=1.1.1", ] sklearn = [ - "scikit-learn>=1.0.1", + "scikit-learn>=1.1.1", ] [project.readme]