From cb22a22f84fcbe1ffbdc95a399217d54c154bb50 Mon Sep 17 00:00:00 2001 From: Lasse Date: Mon, 16 Aug 2021 16:37:19 +0200 Subject: [PATCH 1/4] add extract_dict function --- README.md | 5 +- docs/usingthepackage.rst | 7 +- textdescriptives/__init__.py | 1 + textdescriptives/dataframe_extract.py | 78 +++++++++++++++---- ...taframe_extractor.py => test_extractor.py} | 16 ++++ 5 files changed, 89 insertions(+), 18 deletions(-) rename textdescriptives/tests/{test_dataframe_extractor.py => test_extractor.py} (70%) diff --git a/README.md b/README.md index 187ba2cd..8ce17f9b 100644 --- a/README.md +++ b/README.md @@ -35,10 +35,11 @@ doc._.readability doc._.token_length ``` -TextDescriptives includes a convenience function for extracting metrics to a Pandas DataFrame +TextDescriptives includes convenience functions for extracting metrics to a Pandas DataFrame or a dictionary. ```py td.extract_df(doc) +# td.extract(dict(doc)) ``` | | text | token_length_mean | token_length_median | token_length_std | sentence_length_mean | sentence_length_median | sentence_length_std | syllables_per_token_mean | syllables_per_token_median | syllables_per_token_std | n_tokens | n_unique_tokens | proportion_unique_tokens | n_characters | n_sentences | flesch_reading_ease | flesch_kincaid_grade | smog | gunning_fog | automated_readability_index | coleman_liau_index | lix | rix | dependency_distance_mean | dependency_distance_std | prop_adjacent_dependency_relation_mean | prop_adjacent_dependency_relation_std | 
|---:|:------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------:|----------------------:|-------------------:|-----------------------:|-------------------------:|----------------------:|---------------------------:|-----------------------------:|--------------------------:|-----------:|------------------:|---------------------------:|---------------:|--------------:|----------------------:|-----------------------:|--------:|--------------:|------------------------------:|---------------------:|--------:|------:|---------------------------:|--------------------------:|-----------------------------------------:|----------------------------------------:| @@ -46,7 +47,7 @@ td.extract_df(doc) Set which group(s) of metrics you want to extract using the `metrics` parameter (one or more of `readability`, `dependency_distance`, `descriptive_stats`, defaults to `all`) -If `extract_df` is called on an object created using `nlp.pipe` it will format the output with 1 row for each document and a column for each metric. +If `extract_df` is called on an object created using `nlp.pipe` it will format the output with 1 row for each document and a column for each metric. Similarly, `extract_dict` will have a key for each metric and values as a list of metrics (1 per doc). ```py docs = nlp.pipe(['The world is changed. I feel it in the water. I feel it in the earth. I smell it in the air. 
Much that once was is lost, for none now live who remember it.', 'He felt that his whole life was some kind of dream and he sometimes wondered whose it was and whether they were enjoying it.']) diff --git a/docs/usingthepackage.rst b/docs/usingthepackage.rst index 9efb38fb..1400b47e 100644 --- a/docs/usingthepackage.rst +++ b/docs/usingthepackage.rst @@ -16,12 +16,13 @@ Import the library and add the component to your pipeline using the string name doc._.readability doc._.token_length -The calculated metrics can be conveniently extracted to a Pandas DataFrame using the :code:`extract_df` function. +The calculated metrics can be conveniently extracted to a Pandas DataFrame using the :code:`extract_df` function, or to a dictionary using the :code:`extract_dict` function. .. code-block:: python td.extract_df(doc) + td.extract_dict(doc) You can control which measures to extract with the *metrics* argument. @@ -30,9 +31,9 @@ You can control which measures to extract with the *metrics* argument. td.extract_df(doc, metrics = ["descriptive_stats", "readability", "dependency_distance"]) .. note:: - By default, :code:`extract_df` adds a column containing the text. You can change this behaviour by setting :code:`include_text = False`. + By default, the :code:`extract_X` functions add a column (for :code:`extract_dict`, a :code:`text` key) containing the text. You can change this behaviour by setting :code:`include_text = False`. -:code:`extract_df` also works on objects created by :code:`nlp.pipe`. The output will be formatted with 1 row for each document and a column for each metric. +:code:`extract_df` and :code:`extract_dict` also work on objects created by :code:`nlp.pipe`. For :code:`extract_df`, the output will be formatted with 1 row for each document and a column for each metric; for :code:`extract_dict`, each metric key maps to a list with 1 value per document. .. 
code-block:: python diff --git a/textdescriptives/__init__.py b/textdescriptives/__init__.py index 2efbf45e..b966fa74 100644 --- a/textdescriptives/__init__.py +++ b/textdescriptives/__init__.py @@ -2,6 +2,7 @@ from .components import DescriptiveStatistics, Readability, DependencyDistance from .dataframe_extract import ( extract_df, + extract_dict, readability_cols, dependency_cols, descriptive_stats_cols, diff --git a/textdescriptives/dataframe_extract.py b/textdescriptives/dataframe_extract.py index aca8c916..9a66f49f 100644 --- a/textdescriptives/dataframe_extract.py +++ b/textdescriptives/dataframe_extract.py @@ -1,20 +1,23 @@ """Extract metrics as Pandas DataFrame""" from spacy.tokens import Doc +from functools import reduce +from collections import defaultdict from typing import Union, List import types import pandas as pd -class DataFrameExtractor: +class Extractor: def __init__( self, doc: Doc, metrics: Union[List[str], str] = "all", include_text: bool = True, + as_dict = False ): - """Utility class to extract specified metrics to a Pandas DataFrame + """Utility class to extract specified metrics to a Pandas DataFrame or dictionary Args: doc (Doc): a spaCy doc @@ -40,27 +43,33 @@ def __init__( f"'metrics' contained invalid metric.\nValid metrics are: ['all', 'descriptive_stats', 'readability', 'dependency_distance']" ) + self.include_text = include_text + self.as_dict = as_dict + if include_text: - df_list = [pd.DataFrame([doc.text], columns=["text"])] + extraction = self.__extract_text(doc) else: - df_list = [] + extraction = [] if "all" in metrics: if doc.has_extension("counts"): - df_list.append(self.__descriptive_stats(doc)) + extraction.append(self.__descriptive_stats(doc)) if doc.has_extension("readability"): - df_list.append(self.__readability(doc)) + extraction.append(self.__readability(doc)) if doc.has_extension("dependency_distance"): - df_list.append(self.__dependency_distance(doc)) + extraction.append(self.__dependency_distance(doc)) else: if 
"descriptive_stats" in metrics: - df_list.append(self.__descriptive_stats(doc)) + extraction.append(self.__descriptive_stats(doc)) if "readability" in metrics: - df_list.append(self.__readability(doc)) + extraction.append(self.__readability(doc)) if "dependency_distance" in metrics: - df_list.append(self.__dependency_distance(doc)) + extraction.append(self.__dependency_distance(doc)) - self.df = pd.concat(df_list, axis=1) + if self.as_dict: + self.out = reduce(lambda a, b: {**a, **b}, extraction) + else: + self.out = pd.concat(extraction, axis=1) def __descriptive_stats(self, doc: Doc) -> pd.DataFrame: descriptive_stats = { @@ -69,14 +78,26 @@ def __descriptive_stats(self, doc: Doc) -> pd.DataFrame: **doc._.syllables, **doc._.counts, } + if self.as_dict: + return descriptive_stats return pd.DataFrame.from_records([descriptive_stats]) def __readability(self, doc: Doc) -> pd.DataFrame: + if self.as_dict: + return doc._.readability return pd.DataFrame.from_records([doc._.readability]) def __dependency_distance(self, doc: Doc) -> pd.DataFrame: + if self.as_dict: + return doc._.dependency_distance return pd.DataFrame.from_records([doc._.dependency_distance]) + def __extract_text(self, doc: Doc) -> Union[pd.DataFrame, str]: + if self.as_dict: + return [{"text" : doc.text}] + return [pd.DataFrame([doc.text], columns=["text"])] + + def extract_df( doc: Doc, metrics: Union[List[str], str] = "all", include_text: bool = True @@ -97,10 +118,40 @@ def extract_df( if isinstance(doc, types.GeneratorType): rows = [] for d in doc: - metric_df = DataFrameExtractor(d, metrics, include_text).df + metric_df = Extractor(d, metrics, include_text).out rows.append(metric_df) return pd.concat(rows, axis=0, ignore_index=True) - return DataFrameExtractor(doc, metrics, include_text).df + return Extractor(doc, metrics, include_text).out + + +def extract_dict( + doc: Doc, metrics: Union[List[str], str] = "all", include_text: bool = True +) -> dict: + """Extract calculated metrics from a spaCy 
Doc object or a generator of Docs from + nlp.pipe to a dictionary + + Args: + doc (Doc): a spaCy doc or a generator of spaCy Docs + metrics (Union[list[str], str], optional): Which metrics to extract. + One or more of ["descriptive_stats", "readability", "dependency_distance", "all"]. + Defaults to "all". + include_text (bool, optional): Whether to add an entry containing the text. Defaults to True. + + Returns: + dict: Dictionary with a key for each metric. + """ + if isinstance(doc, types.GeneratorType): + dict_list = [] + for d in doc: + metric_dict = Extractor(d, metrics, include_text, as_dict=True).out + dict_list.append(metric_dict) + # concatenate values from each dict in list + out = defaultdict(list) + for d in (dict_list): + for key, value in d.items(): + out[key].append(value) + return dict(out) + return Extractor(doc, metrics, include_text, as_dict=True).out """Helpers to subset an extracted dataframe""" @@ -139,3 +190,4 @@ def extract_df( "n_sentences", "n_characters", ] + diff --git a/textdescriptives/tests/test_dataframe_extractor.py b/textdescriptives/tests/test_extractor.py similarity index 70% rename from textdescriptives/tests/test_dataframe_extractor.py rename to textdescriptives/tests/test_extractor.py index c45ce3aa..3c77b65b 100644 --- a/textdescriptives/tests/test_dataframe_extractor.py +++ b/textdescriptives/tests/test_extractor.py @@ -43,3 +43,19 @@ def test_extract_df_error(nlp): td.extract_df(doc, metrics="not a metric") with pytest.raises(Exception) as e_info: td.extract_df(doc, metrics=True) + + +def test_extract_dict_single_doc(nlp): + doc = nlp("This is just a cute little text. 
Actually, it's two sentences.") + td.extract_dict(doc) + for metric in ["descriptive_stats", "readability", "dependency_distance"]: + td.extract_dict(doc, metrics=metric) + + +def test_extract_df_pipe(nlp): + text = [ + "I wonder how well the function works on multiple documents", + "Very exciting to see, don't you think?", + ] + docs = nlp.pipe(text) + assert len(td.extract_dict(docs)["token_length_mean"]) == 2 \ No newline at end of file From 77928957986d2c0eb541d683f57fdf21292f6ab7 Mon Sep 17 00:00:00 2001 From: Lasse Date: Mon, 16 Aug 2021 16:40:34 +0200 Subject: [PATCH 2/4] minor --- textdescriptives/dataframe_extract.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/textdescriptives/dataframe_extract.py b/textdescriptives/dataframe_extract.py index 9a66f49f..b878c4eb 100644 --- a/textdescriptives/dataframe_extract.py +++ b/textdescriptives/dataframe_extract.py @@ -47,7 +47,7 @@ def __init__( self.as_dict = as_dict if include_text: - extraction = self.__extract_text(doc) + extraction = [self.__extract_text(doc)] else: extraction = [] @@ -94,8 +94,8 @@ def __dependency_distance(self, doc: Doc) -> pd.DataFrame: def __extract_text(self, doc: Doc) -> Union[pd.DataFrame, str]: if self.as_dict: - return [{"text" : doc.text}] - return [pd.DataFrame([doc.text], columns=["text"])] + return {"text" : doc.text} + return pd.DataFrame([doc.text], columns=["text"]) From e8b2fe4a630b7451c370eb32ee58bc6b2cf297d4 Mon Sep 17 00:00:00 2001 From: HLasse Date: Tue, 17 Aug 2021 12:38:58 +0200 Subject: [PATCH 3/4] Apply suggestions from code review Co-authored-by: Kenneth Enevoldsen --- README.md | 2 +- textdescriptives/dataframe_extract.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 8ce17f9b..95fe0432 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ TextDescriptives includes convenience functions for extracting metrics to a Pand ```py td.extract_df(doc) -# td.extract(dict(doc)) +# 
td.extract_dict(doc) ``` | | text | token_length_mean | token_length_median | token_length_std | sentence_length_mean | sentence_length_median | sentence_length_std | syllables_per_token_mean | syllables_per_token_median | syllables_per_token_std | n_tokens | n_unique_tokens | proportion_unique_tokens | n_characters | n_sentences | flesch_reading_ease | flesch_kincaid_grade | smog | gunning_fog | automated_readability_index | coleman_liau_index | lix | rix | dependency_distance_mean | dependency_distance_std | prop_adjacent_dependency_relation_mean | prop_adjacent_dependency_relation_std | |---:|:------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------:|----------------------:|-------------------:|-----------------------:|-------------------------:|----------------------:|---------------------------:|-----------------------------:|--------------------------:|-----------:|------------------:|---------------------------:|---------------:|--------------:|----------------------:|-----------------------:|--------:|--------------:|------------------------------:|---------------------:|--------:|------:|---------------------------:|--------------------------:|-----------------------------------------:|----------------------------------------:| diff --git a/textdescriptives/dataframe_extract.py b/textdescriptives/dataframe_extract.py index b878c4eb..f3f8e78e 100644 --- a/textdescriptives/dataframe_extract.py +++ b/textdescriptives/dataframe_extract.py @@ -190,4 +190,3 @@ def extract_dict( "n_sentences", "n_characters", ] - From 24afa64b1c27c53445a362cd029c1e0d04faf338 Mon Sep 17 00:00:00 2001 From: Lasse Date: Tue, 17 Aug 2021 12:39:46 +0200 Subject: [PATCH 4/4] fix subsetter test --- textdescriptives/tests/test_extractor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/textdescriptives/tests/test_extractor.py 
b/textdescriptives/tests/test_extractor.py index 3c77b65b..4aafa2ab 100644 --- a/textdescriptives/tests/test_extractor.py +++ b/textdescriptives/tests/test_extractor.py @@ -29,9 +29,9 @@ def test_extract_df_pipe(nlp): def test_extract_df_subsetters(nlp): doc = nlp("This is just a cute little text. Actually, it's two sentences.") df = td.extract_df(doc, include_text=False) - df[td.readability_cols] - df[td.dependency_cols] - df[td.descriptive_stats_cols] + assert "token_length_mean" not in df[td.readability_cols].columns + assert "token_length_mean" not in df[td.dependency_cols].columns + assert "lix" not in df[td.descriptive_stats_cols].columns def test_extract_df_error(nlp):