Merge pull request #7 from HLasse/master

master to posstatistics
HLasse · Aug 31, 2021 · 64f9f17 · 64f9f17
2 parents a72ab59 + 555f091
commit 64f9f17
Show file tree

Hide file tree

Showing 8 changed files with 92 additions and 10 deletions.
diff --git a/textdescriptives/__init__.py b/textdescriptives/__init__.py
@@ -1,5 +1,5 @@
 from .load_components import TextDescriptives
-from .components import DescriptiveStatistics, Readability, DependencyDistance
+from .components import DescriptiveStatistics, Readability, DependencyDistance, POSStatistics
 from .dataframe_extract import (
     extract_df,
     extract_dict,

diff --git a/textdescriptives/about.py b/textdescriptives/about.py
@@ -1,3 +1,3 @@
 __title__ = "textdescriptives"
-__version__ = "1.0.2"  # the ONLY source of version ID
+__version__ = "1.0.3"  # the ONLY source of version ID
 __download_url__ = "https://github.com/HLasse/textdescriptives"
diff --git a/textdescriptives/components/__init__.py b/textdescriptives/components/__init__.py
@@ -1,3 +1,4 @@
 from .readability import Readability
 from .dependency_distance import DependencyDistance
 from .descriptive_stats import DescriptiveStatistics
+from .pos_stats import POSStatistics
diff --git a/textdescriptives/components/descriptive_stats.py b/textdescriptives/components/descriptive_stats.py
@@ -97,6 +97,13 @@ def syllables(self, doc: Doc):
         }
 
     def counts(self, doc: Union[Doc, Span], ignore_whitespace: bool = True):
+        """Returns:
+             Dict with keys:
+                n_tokens,
+                n_unique_tokens,
+                proportion_unique_tokens,
+                n_characters
+        """
         n_tokens = doc._._n_tokens
         n_types = len(set([tok.lower_ for tok in doc._._filtered_tokens]))
         if ignore_whitespace:
@@ -116,4 +123,4 @@ def counts(self, doc: Union[Doc, Span], ignore_whitespace: bool = True):
         }
         if isinstance(doc, Doc):
             out["n_sentences"] = doc._._n_sentences
-        return out
+        return out
diff --git a/textdescriptives/components/pos_stats.py b/textdescriptives/components/pos_stats.py
@@ -0,0 +1,44 @@
+"""Calculation of statistics that require a pos-tagger in the pipeline"""
+
+from spacy.tokens import Doc, Span
+from spacy.language import Language
+from typing import Counter
+
+from .utils import filtered_tokens
+
+@Language.factory("pos_stats")
+def create_pos_stats_component(nlp: Language, name: str):
+    """Allows POSStatistics to be added to a spaCy pipe using nlp.add_pipe("pos_stats").
+
+    If the pipe does not contain a "tagger" component, one is silently added
+    with nlp.add_pipe("tagger") before the component is created.
+
+    Args:
+        nlp (Language): the pipeline the component is added to.
+        name (str): part of the factory signature; not used by this function.
+
+    Returns:
+        POSStatistics: the component instance.
+    """
+    if "tagger" not in nlp.pipe_names:
+        nlp.add_pipe("tagger")  # add a tagger if not one in pipe
+    return POSStatistics(nlp)
+
+class POSStatistics:
+    """spaCy v.3.0 component that adds attributes for POS statistics to `Doc` and `Span` objects.
+
+    Registers the getter extension `pos_proportions`, available as
+    `doc._.pos_proportions` and `span._.pos_proportions`.
+    """
+
+    def __init__(self, nlp: Language):
+        """Register the `pos_proportions` extension on `Doc` and `Span`.
+
+        Registration is skipped for a class that already has the extension.
+        `nlp` is accepted so the factory can pass it in, but it is not used.
+        """
+        for ext_cls in (Doc, Span):
+            if not ext_cls.has_extension("pos_proportions"):
+                ext_cls.set_extension("pos_proportions", getter=self.pos_proportions)
+
+    def __call__(self, doc):
+        """Run the pipeline component.
+
+        The doc is returned unchanged; the proportions are computed by the
+        extension getter when the attribute is accessed.
+        """
+        return doc
+
+    def pos_proportions(self, doc: Doc) -> dict:
+        """Compute the proportion of each part-of-speech tag.
+
+        Args:
+            doc: the `Doc` (or a `Span`, when reached through the `Span`
+                extension) whose tokens are counted.
+
+        Returns:
+            Dict mapping each `token.tag_` value to its share of all tokens.
+            The dict is empty when there are no tokens.
+        """
+        # typing.Counter is an annotation alias; count with the concrete class.
+        from collections import Counter
+
+        pos_counts = Counter(token.tag_ for token in doc)
+        # The total is the same for every tag, so compute it once.
+        n_tokens = sum(pos_counts.values())
+
+        return {tag: count / n_tokens for tag, count in pos_counts.items()}
diff --git a/textdescriptives/dataframe_extract.py b/textdescriptives/dataframe_extract.py
@@ -22,15 +22,15 @@ def __init__(
         Args:
             doc (Doc): a spaCy doc
             metrics (Union[list[str], str], optional): Which metrics to extract.
-                One or more of ["descriptive_stats", "readability", "dependency_distance", "all"].
+                One or more of ["descriptive_stats", "readability", "dependency_distance", "pos_stats", "all"].
                 Defaults to "all".
             include_text (bool, optional): Whether to add a column containing the text. Defaults to True.
         """
         if not isinstance(doc, (Doc)):
             raise TypeError(f"doc should be a spaCy Doc object, not {type(doc)}.")
 
         valid_metrics = set(
-            ["descriptive_stats", "readability", "dependency_distance", "all"]
+            ["descriptive_stats", "readability", "dependency_distance", "pos_stats", "all"]
         )
         if isinstance(metrics, str):
             metrics = [metrics]
@@ -40,7 +40,7 @@ def __init__(
             )
         if not set(metrics).issubset(valid_metrics):
             raise ValueError(
-                f"'metrics' contained invalid metric.\nValid metrics are: ['all', 'descriptive_stats', 'readability', 'dependency_distance']"
+                f"'metrics' contained invalid metric.\nValid metrics are: ['all', 'descriptive_stats', 'readability', 'dependency_distance', 'pos_stats']"
             )
 
         self.include_text = include_text
@@ -58,13 +58,17 @@ def __init__(
                 extraction.append(self.__readability(doc))
             if doc.has_extension("dependency_distance"):
                 extraction.append(self.__dependency_distance(doc))
+            if doc.has_extension("pos_proportions"):
+                # __pos_stats is the extractor for the pos_proportions extension
+                extraction.append(self.__pos_stats(doc))
         else:
             if "descriptive_stats" in metrics:
                 extraction.append(self.__descriptive_stats(doc))
             if "readability" in metrics:
                 extraction.append(self.__readability(doc))
             if "dependency_distance" in metrics:
                 extraction.append(self.__dependency_distance(doc))
+            if "pos_stats" in metrics:
+                extraction.append(self.__pos_stats(doc))
 
         if self.as_dict:
             self.out = reduce(lambda a, b: {**a, **b}, extraction)
@@ -96,6 +100,11 @@ def __extract_text(self, doc: Doc) -> Union[pd.DataFrame, str]:
         if self.as_dict:
             return {"text" : doc.text}
         return pd.DataFrame([doc.text], columns=["text"])
+
+    def __pos_stats(self, doc: Doc) -> Union[pd.DataFrame, dict]:
+        """Extract the POS proportions stored in `doc._.pos_proportions`.
+
+        Returns:
+            The value of `doc._.pos_proportions` as-is when `self.as_dict`
+            is set, otherwise a DataFrame built from that single record.
+        """
+        if self.as_dict:
+            return doc._.pos_proportions
+        return pd.DataFrame.from_records([doc._.pos_proportions])
 
 
 
@@ -108,7 +117,7 @@ def extract_df(
     Args:
         doc (Doc): a spaCy doc or a generator of spaCy Docs
         metrics (Union[list[str], str], optional): Which metrics to extract.
-                One or more of ["descriptive_stats", "readability", "dependency_distance", "all"].
+                One or more of ["descriptive_stats", "readability", "dependency_distance", "pos_stats", "all"].
                 Defaults to "all".
         include_text (bool, optional): Whether to add a column containing the text. Defaults to True.
 
@@ -133,7 +142,7 @@ def extract_dict(
     Args:
         doc (Doc): a spaCy doc or a generator of spaCy Docs
         metrics (Union[list[str], str], optional): Which metrics to extract.
-                One or more of ["descriptive_stats", "readability", "dependency_distance", "all"].
+                One or more of ["descriptive_stats", "readability", "dependency_distance", "pos_stats", "all"].
                 Defaults to "all".
         include_text (bool, optional): Whether to add an entry containing the text. Defaults to True.
 

diff --git a/textdescriptives/tests/test_descriptive_stats.py b/textdescriptives/tests/test_descriptive_stats.py
@@ -74,11 +74,10 @@ def test_counts(nlp):
     assert doc[0:6]._.counts["proportion_unique_tokens"] == 1.0
     assert doc[0:6]._.counts["n_characters"] == 23
 
-
 @pytest.mark.parametrize("text", ["", "#"])
 def test_descriptive_edge(text, nlp):
     doc = nlp(text)
     assert doc._.token_length
     assert doc._.sentence_length
     assert doc._.syllables
-    assert doc._.counts
+    assert doc._.counts
diff --git a/textdescriptives/tests/test_pos_stats.py b/textdescriptives/tests/test_pos_stats.py
@@ -0,0 +1,22 @@
+import spacy
+from spacy.lang.en import English
+import pytest
+from textdescriptives.components import POSStatistics
+
+@pytest.fixture(scope="function")
+def nlp():
+    """Load en_core_web_sm without 'ner' and 'textcat' and append the pos_stats pipe."""
+    pipeline = spacy.load("en_core_web_sm", disable=('ner', 'textcat'))
+    pipeline.add_pipe("pos_stats")
+    return pipeline
+
+def test_pos_integrations(nlp):
+    """The pos_stats component should be the last pipe in the pipeline."""
+    last_pipe = nlp.pipe_names[-1]
+    assert last_pipe == "pos_stats"
+
+def test_pos_proportions(nlp):
+    """Check the tag proportions of a three-sentence example text."""
+    doc = nlp(
+        "Here is the first sentence. It was pretty short. Let's make another one that's slightly longer and more complex."
+    )
+
+    # Expected number of tokens per tag; the text has 24 tokens in total.
+    expected_counts = {
+        'RB': 3,
+        'VBZ': 2,
+        'DT': 2,
+        'JJ': 3,
+        'NN': 2,
+        '.': 3,
+        'PRP': 2,
+        'VBD': 1,
+        'VB': 2,
+        'WDT': 1,
+        'JJR': 1,
+        'CC': 1,
+        'RBR': 1,
+    }
+    n_tokens = sum(expected_counts.values())
+    expected = {tag: count / n_tokens for tag, count in expected_counts.items()}
+
+    # Proportions are floats from a division, so compare with a tolerance
+    # instead of exact equality.
+    assert doc._.pos_proportions == pytest.approx(expected)