Add support for series

HLasse · Sep 8, 2021 · ea54a46 · ea54a46
1 parent 555f091
commit ea54a46
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 6 deletions.
diff --git a/textdescriptives/components/pos_stats.py b/textdescriptives/components/pos_stats.py
@@ -2,7 +2,7 @@
 
 from spacy.tokens import Doc, Span
 from spacy.language import Language
-from typing import Counter
+from typing import Counter, Union
 
 from .utils import filtered_tokens
 
@@ -20,24 +20,27 @@ class POSStatistics:
     """spaCy v.3.0 component that adds attributes for POS statistics to `Doc` and `Span` objects.
     """
 
-    def __init__(self, nlp: Language):
+    def __init__(self, nlp: Language):  # Is the parameter hint incorrect? Should it be "model" instead?
         """Initialise components"""
         if not Doc.has_extension("pos_proportions"):
             Doc.set_extension("pos_proportions", getter=self.pos_proportions)
+
+        if not Span.has_extension("pos_proportions"):
+            Span.set_extension("pos_proportions", getter=self.pos_proportions)
 
 
     def __call__(self, doc):
         """Run the pipeline component"""
         return doc
 
-    def pos_proportions(self, doc: Doc) -> dict:
+    def pos_proportions(self, input: Union[Doc, Span]) -> dict:
         """
             Returns:
                 Dict with proportions of part-of-speech tag in doc.
         """
         pos_counts = Counter()
 
-        pos_counts.update([token.tag_ for token in doc])
+        pos_counts.update([token.tag_ for token in input])
 
         pos_proportions = {tag : pos_counts[tag] / sum(pos_counts.values()) for tag in pos_counts}
 

diff --git a/textdescriptives/tests/test_pos_stats.py b/textdescriptives/tests/test_pos_stats.py
@@ -14,9 +14,18 @@ def nlp():
 def test_pos_integrations(nlp):
     assert "pos_stats" == nlp.pipe_names[-1]
 
-def test_pos_proportions(nlp):
+def test_pos_proportions_doc(nlp):
     doc = nlp(
         "Here is the first sentence. It was pretty short. Let's make another one that's slightly longer and more complex."
     )
 
-    assert doc._.pos_proportions == {'RB': 0.125, 'VBZ': 0.08333333333333333, 'DT': 0.08333333333333333, 'JJ': 0.125, 'NN': 0.08333333333333333, '.': 0.125, 'PRP': 0.08333333333333333, 'VBD': 0.041666666666666664, 'VB': 0.08333333333333333, 'WDT': 0.041666666666666664, 'JJR': 0.041666666666666664, 'CC': 0.041666666666666664, 'RBR': 0.041666666666666664}
+    assert doc._.pos_proportions == {'RB': 0.125, 'VBZ': 0.08333333333333333, 'DT': 0.08333333333333333, 'JJ': 0.125, 'NN': 0.08333333333333333, '.': 0.125, 'PRP': 0.08333333333333333, 'VBD': 0.041666666666666664, 'VB': 0.08333333333333333, 'WDT': 0.041666666666666664, 'JJR': 0.041666666666666664, 'CC': 0.041666666666666664, 'RBR': 0.041666666666666664}
+
+def test_pos_proportions_span(nlp):
+    doc = nlp(
+        "Here is the first sentence. It was pretty short. Let's make another one that's slightly longer and more complex."
+    )
+
+    span = doc[0:]
+
+    assert span._.pos_proportions == {'RB': 0.125, 'VBZ': 0.08333333333333333, 'DT': 0.08333333333333333, 'JJ': 0.125, 'NN': 0.08333333333333333, '.': 0.125, 'PRP': 0.08333333333333333, 'VBD': 0.041666666666666664, 'VB': 0.08333333333333333, 'WDT': 0.041666666666666664, 'JJR': 0.041666666666666664, 'CC': 0.041666666666666664, 'RBR': 0.041666666666666664}