Merge pull request #5 from HLasse/dev
add extract_dict function
HLasse authored Aug 17, 2021
2 parents a7ad3d6 + a72ab59 commit 70f1494
Showing 5 changed files with 91 additions and 21 deletions.
5 changes: 3 additions & 2 deletions README.md
@@ -35,18 +35,19 @@ doc._.readability
doc._.token_length
```

TextDescriptives includes a convenience function for extracting metrics to a Pandas DataFrame
TextDescriptives includes convenience functions for extracting metrics to a Pandas DataFrame or a dictionary.

```py
td.extract_df(doc)
# td.extract_dict(doc)
```
| | text | token_length_mean | token_length_median | token_length_std | sentence_length_mean | sentence_length_median | sentence_length_std | syllables_per_token_mean | syllables_per_token_median | syllables_per_token_std | n_tokens | n_unique_tokens | proportion_unique_tokens | n_characters | n_sentences | flesch_reading_ease | flesch_kincaid_grade | smog | gunning_fog | automated_readability_index | coleman_liau_index | lix | rix | dependency_distance_mean | dependency_distance_std | prop_adjacent_dependency_relation_mean | prop_adjacent_dependency_relation_std |
|---:|:------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------:|----------------------:|-------------------:|-----------------------:|-------------------------:|----------------------:|---------------------------:|-----------------------------:|--------------------------:|-----------:|------------------:|---------------------------:|---------------:|--------------:|----------------------:|-----------------------:|--------:|--------------:|------------------------------:|---------------------:|--------:|------:|---------------------------:|--------------------------:|-----------------------------------------:|----------------------------------------:|
| 0 | The world (...) | 3.28571 | 3 | 1.54127 | 7 | 6 | 3.09839 | 1.08571 | 1 | 0.368117 | 35 | 23 | 0.657143 | 121 | 5 | 107.879 | -0.0485714 | 5.68392 | 3.94286 | -2.45429 | -0.708571 | 12.7143 | 0.4 | 1.69524 | 0.422282 | 0.44381 | 0.0863679 |

Set which group(s) of metrics you want to extract using the `metrics` parameter (one or more of `readability`, `dependency_distance`, `descriptive_stats`; defaults to `all`).
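
For example, to keep only the readability and dependency-distance metrics (a minimal sketch; `doc` is the Doc from the example above):

```py
td.extract_df(doc, metrics=["readability", "dependency_distance"])
```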

If `extract_df` is called on an object created using `nlp.pipe` it will format the output with 1 row for each document and a column for each metric.
If `extract_df` is called on an object created using `nlp.pipe`, it will format the output with 1 row for each document and a column for each metric. Similarly, `extract_dict` will return a dictionary with a key for each metric and a list of values (one per doc).
```py
docs = nlp.pipe(['The world is changed. I feel it in the water. I feel it in the earth. I smell it in the air. Much that once was is lost, for none now live who remember it.',
'He felt that his whole life was some kind of dream and he sometimes wondered whose it was and whether they were enjoying it.'])
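
A minimal sketch of the corresponding `extract_dict` behaviour on piped docs (illustrative texts, not the original example):

```py
docs = nlp.pipe(["The world is changed.", "I feel it in the water."])
metrics = td.extract_dict(docs)
len(metrics["token_length_mean"])  # 2, one value per document
```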
7 changes: 4 additions & 3 deletions docs/usingthepackage.rst
@@ -16,12 +16,13 @@ Import the library and add the component to your pipeline using the string name
doc._.readability
doc._.token_length
The calculated metrics can be conveniently extracted to a Pandas DataFrame using the :code:`extract_df` function.
The calculated metrics can be conveniently extracted to a Pandas DataFrame using the :code:`extract_df` function, or to a dictionary using the :code:`extract_dict` function.


.. code-block:: python
td.extract_df(doc)
td.extract_dict(doc)
You can control which measures to extract with the *metrics* argument.

@@ -30,9 +31,9 @@ You can control which measures to extract with the *metrics* argument.
td.extract_df(doc, metrics = ["descriptive_stats", "readability", "dependency_distance"])
.. note::
By default, :code:`extract_df` adds a column containing the text. You can change this behaviour by setting :code:`include_text = False`.
By default, the :code:`extract_X` functions add a column (or dictionary entry) containing the text. You can change this behaviour by setting :code:`include_text = False`.
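
For instance, a minimal sketch of disabling this (using the same :code:`doc` as above):

.. code-block:: python

   td.extract_df(doc, include_text = False)
   td.extract_dict(doc, include_text = False)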

:code:`extract_df` also works on objects created by :code:`nlp.pipe`. The output will be formatted with 1 row for each document and a column for each metric.
:code:`extract_df` and :code:`extract_dict` also work on objects created by :code:`nlp.pipe`. The output will be formatted with 1 row per document and a column for each metric (:code:`extract_dict` instead collects a list entry per document under each key).

.. code-block:: python
1 change: 1 addition & 0 deletions textdescriptives/__init__.py
@@ -2,6 +2,7 @@
from .components import DescriptiveStatistics, Readability, DependencyDistance
from .dataframe_extract import (
extract_df,
extract_dict,
readability_cols,
dependency_cols,
descriptive_stats_cols,
77 changes: 64 additions & 13 deletions textdescriptives/dataframe_extract.py
@@ -1,20 +1,23 @@
"""Extract metrics as Pandas DataFrame"""
from spacy.tokens import Doc

from functools import reduce
from collections import defaultdict
from typing import Union, List
import types

import pandas as pd


class DataFrameExtractor:
class Extractor:
def __init__(
self,
doc: Doc,
metrics: Union[List[str], str] = "all",
include_text: bool = True,
as_dict: bool = False,  # if True, collect the metrics into a dict instead of a DataFrame
):
"""Utility class to extract specified metrics to a Pandas DataFrame
"""Utility class to extract specified metrics to a Pandas DataFrame or dictionary
Args:
doc (Doc): a spaCy doc
@@ -40,27 +43,33 @@ def __init__(
f"'metrics' contained invalid metric.\nValid metrics are: ['all', 'descriptive_stats', 'readability', 'dependency_distance']"
)

self.include_text = include_text
self.as_dict = as_dict

if include_text:
df_list = [pd.DataFrame([doc.text], columns=["text"])]
extraction = [self.__extract_text(doc)]
else:
df_list = []
extraction = []

if "all" in metrics:
if doc.has_extension("counts"):
df_list.append(self.__descriptive_stats(doc))
extraction.append(self.__descriptive_stats(doc))
if doc.has_extension("readability"):
df_list.append(self.__readability(doc))
extraction.append(self.__readability(doc))
if doc.has_extension("dependency_distance"):
df_list.append(self.__dependency_distance(doc))
extraction.append(self.__dependency_distance(doc))
else:
if "descriptive_stats" in metrics:
df_list.append(self.__descriptive_stats(doc))
extraction.append(self.__descriptive_stats(doc))
if "readability" in metrics:
df_list.append(self.__readability(doc))
extraction.append(self.__readability(doc))
if "dependency_distance" in metrics:
df_list.append(self.__dependency_distance(doc))
extraction.append(self.__dependency_distance(doc))

self.df = pd.concat(df_list, axis=1)
if self.as_dict:
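# merge the list of per-metric dicts into one flat dict, e.g. [{'n_tokens': 35}, {'lix': 12.7}] -> {'n_tokens': 35, 'lix': 12.7}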
self.out = reduce(lambda a, b: {**a, **b}, extraction)
else:
self.out = pd.concat(extraction, axis=1)

def __descriptive_stats(self, doc: Doc) -> pd.DataFrame:
descriptive_stats = {
@@ -69,14 +78,26 @@ def __descriptive_stats(self, doc: Doc) -> pd.DataFrame:
**doc._.syllables,
**doc._.counts,
}
if self.as_dict:
return descriptive_stats
return pd.DataFrame.from_records([descriptive_stats])

def __readability(self, doc: Doc) -> pd.DataFrame:
if self.as_dict:
return doc._.readability
return pd.DataFrame.from_records([doc._.readability])

def __dependency_distance(self, doc: Doc) -> pd.DataFrame:
if self.as_dict:
return doc._.dependency_distance
return pd.DataFrame.from_records([doc._.dependency_distance])

def __extract_text(self, doc: Doc) -> Union[pd.DataFrame, str]:
if self.as_dict:
return {"text" : doc.text}
return pd.DataFrame([doc.text], columns=["text"])



def extract_df(
doc: Doc, metrics: Union[List[str], str] = "all", include_text: bool = True
@@ -97,10 +118,40 @@ def extract_df(
if isinstance(doc, types.GeneratorType):
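# doc is a generator of Docs from nlp.pipe: extract each Doc and stack the results row-wise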
rows = []
for d in doc:
metric_df = DataFrameExtractor(d, metrics, include_text).df
metric_df = Extractor(d, metrics, include_text).out
rows.append(metric_df)
return pd.concat(rows, axis=0, ignore_index=True)
return DataFrameExtractor(doc, metrics, include_text).df
return Extractor(doc, metrics, include_text).out


def extract_dict(
doc: Doc, metrics: Union[List[str], str] = "all", include_text: bool = True
) -> dict:
"""Extract calculated metrics from a spaCy Doc object or a generator of Docs from
nlp.pipe to a dictionary
Args:
doc (Doc): a spaCy doc or a generator of spaCy Docs
metrics (Union[list[str], str], optional): Which metrics to extract.
One or more of ["descriptive_stats", "readability", "dependency_distance", "all"].
Defaults to "all".
include_text (bool, optional): Whether to add an entry containing the text. Defaults to True.
Returns:
dict: Dictionary with a key for each metric.
"""
if isinstance(doc, types.GeneratorType):
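# doc is a generator of Docs from nlp.pipe: extract a dict per Doc, then merge them key-wise below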
dict_list = []
for d in doc:
metric_dict = Extractor(d, metrics, include_text, as_dict=True).out
dict_list.append(metric_dict)
# concatenate values from each dict in list
out = defaultdict(list)
for d in dict_list:
for key, value in d.items():
out[key].append(value)
return dict(out)
return Extractor(doc, metrics, include_text, as_dict=True).out


"""Helpers to subset an extracted dataframe"""
@@ -29,9 +29,9 @@ def test_extract_df_pipe(nlp):
def test_extract_df_subsetters(nlp):
doc = nlp("This is just a cute little text. Actually, it's two sentences.")
df = td.extract_df(doc, include_text=False)
df[td.readability_cols]
df[td.dependency_cols]
df[td.descriptive_stats_cols]
assert "token_length_mean" not in df[td.readability_cols].columns
assert "token_length_mean" not in df[td.dependency_cols].columns
assert "lix" not in df[td.descriptive_stats_cols].columns


def test_extract_df_error(nlp):
@@ -43,3 +43,19 @@ def test_extract_df_error(nlp):
td.extract_df(doc, metrics="not a metric")
with pytest.raises(Exception) as e_info:
td.extract_df(doc, metrics=True)


def test_extract_dict_single_doc(nlp):
doc = nlp("This is just a cute little text. Actually, it's two sentences.")
td.extract_dict(doc)
for metric in ["descriptive_stats", "readability", "dependency_distance"]:
td.extract_dict(doc, metrics=metric)


def test_extract_dict_pipe(nlp):
text = [
"I wonder how well the function works on multiple documents",
"Very exciting to see, don't you think?",
]
docs = nlp.pipe(text)
assert len(td.extract_dict(docs)["token_length_mean"]) == 2

1 comment on commit 70f1494

@github-actions

Coverage

Coverage Report
| File | Stmts | Miss | Cover | Missing |
|:--|--:|--:|--:|:--|
| textdescriptives/__init__.py | 4 | 0 | 100% | |
| textdescriptives/about.py | 3 | 0 | 100% | |
| textdescriptives/dataframe_extract.py | 78 | 5 | 94% | 119–123 |
| textdescriptives/load_components.py | 12 | 0 | 100% | |
| textdescriptives/components/__init__.py | 3 | 0 | 100% | |
| textdescriptives/components/dependency_distance.py | 32 | 0 | 100% | |
| textdescriptives/components/descriptive_stats.py | 51 | 1 | 98% | 105 |
| textdescriptives/components/readability.py | 72 | 0 | 100% | |
| textdescriptives/components/utils.py | 16 | 0 | 100% | |
| TOTAL | 271 | 6 | 98% | |
