add utils and desc stats
HLasse committed Jul 23, 2021
1 parent 4c25726 commit 6fdcd3b
Showing 4 changed files with 180 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -127,3 +127,6 @@ dmypy.json

# Pyre type checker
.pyre/

# Mac stuff
.DS_Store
Empty file.
101 changes: 101 additions & 0 deletions spacy-textdescriptives/descriptive_stats.py
@@ -0,0 +1,101 @@
"""Calculation of descriptive statistics"""
from spacy.tokens import Doc
from spacy.language import Language

import numpy as np


@Language.factory("descriptive_stats")
def create_descriptive_statistics_component(nlp: Language, name: str):
return DescriptiveStatistics(nlp)
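
# Illustrative usage (mirrors the demo at the bottom of this file): the
# "utilities" component from utils.py must run first, since the getters
# below rely on doc._.filtered_tokens and doc._.n_syllables:
#
#     nlp.add_pipe("utilities", last=True)
#     nlp.add_pipe("descriptive_stats", last=True)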


class DescriptiveStatistics:
def __init__(self, nlp: Language):
"""Initialise components"""
if not Doc.has_extension("token_length"):
Doc.set_extension("token_length", getter=self.token_length)

if not Doc.has_extension("sentence_length"):
Doc.set_extension("sentence_length", getter=self.sentence_length)

if not Doc.has_extension("syllables"):
Doc.set_extension("syllables", getter=self.syllables)

if not Doc.has_extension("counts"):
Doc.set_extension("counts", getter=self.counts)

def __call__(self, doc):
"""Run the pipeline component"""
return doc

def token_length(self, doc):
"""Return dict with measures of token length"""
token_lengths = [len(token) for token in doc._.filtered_tokens]
return {
"token_length_mean": np.mean(token_lengths),
"token_length_median": np.median(token_lengths),
"token_length_std": np.std(token_lengths),
}

def sentence_length(self, doc):
"""Return dict with measures of sentence length"""
# get length of filtered tokens per sentence
tokenized_sentences = [
[
token.text
for token in sent
if not token.is_punct and "'" not in token.text
]
for sent in doc.sents
]
len_sentences = [len(sentence) for sentence in tokenized_sentences]
return {
"sentence_length_mean": np.mean(len_sentences),
"sentence_length_median": np.median(len_sentences),
"sentence_length_std": np.std(len_sentences),
}

def syllables(self, doc):
"""Return dict with measures of syllables per token"""
return {
"syllables_per_token_mean": np.mean(doc._.n_syllables),
"syllables_per_token_median": np.median(doc._.n_syllables),
"syllables_per_token_std": np.std(doc._.n_syllables),
}

    def counts(self, doc, ignore_whitespace=True):
        """Return dict with the number of tokens, unique tokens, sentences, and characters"""
n_tokens = len(doc._.filtered_tokens)
n_types = len(set([tok.lower_ for tok in doc._.filtered_tokens]))
if ignore_whitespace:
n_chars = len(doc.text.replace(" ", ""))
else:
n_chars = len(doc.text)
return {
"n_tokens": n_tokens,
"n_unique_tokens": n_types,
"percent_unique_tokens": n_types / n_tokens,
"n_sentences": doc._.n_sentences,
"n_characters": n_chars,
}
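
# Illustrative example: for the demo doc below, n_tokens is 10 and
# n_unique_tokens is 8 ("her" and "er" each occur twice after lowercasing),
# so percent_unique_tokens is 0.8 (assuming the Danish tokenizer splits the
# text as expected).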


"""
import spacy
from utils import create_utils_component
nlp = spacy.load('da_core_news_sm')
nlp.add_pipe("utilities", last=True)
nlp.add_pipe("descriptive_stats", last=True)
doc = nlp("Det her er en testsætning. Her er sætning nummer 2")
doc._.n_words
doc._.filtered_tokens
doc._.token_length
doc._.sentence_length
doc._.n_sentences
doc._.n_syllables
doc._.counts
"""
76 changes: 76 additions & 0 deletions spacy-textdescriptives/utils.py
@@ -0,0 +1,76 @@
"""Utility functions for calculating various text descriptives"""
from spacy.tokens import Doc
from spacy.language import Language
from pyphen import Pyphen


@Language.factory("utilities")
def create_utils_component(nlp: Language, name: str):
return Utils(nlp)


class Utils:
def __init__(self, nlp: Language):
"""Initialise components
Only calculate n_sentences, n_words, n_syllabes when needed using getters"""
if not Doc.has_extension("n_sentences"):
Doc.set_extension("n_sentences", getter=self.n_sentences)

if not Doc.has_extension("n_words"):
Doc.set_extension("n_words", getter=self.n_words)

if not Doc.has_extension("n_syllables"):
Doc.set_extension("n_syllables", getter=self.n_syllables)

if not Doc.has_extension("filtered_tokens"):
Doc.set_extension("filtered_tokens", default=[])
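        # Note: filtered_tokens has no getter; it is populated once per doc
        # in __call__ below, so the filtering is not recomputed on every
        # attribute access.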

def __call__(self, doc):
"""Run the pipeline component"""
doc._.filtered_tokens = self.filtered_tokens(doc)
return doc

def filtered_tokens(self, doc: Doc):
"""Return words in document.
Filters punctuation and words that start with an apostrophe (contractions)"""
filtered_tokens = [
word for word in doc if not word.is_punct and "'" not in word.text
]
return filtered_tokens

def n_sentences(self, doc: Doc):
"""Return number of sentences in the document"""
return len(list(doc.sents))

def n_words(self, doc: Doc):
"""Return number of words in the document."""
return len(doc._.filtered_tokens)

def n_syllables(self, doc):
"""
Return number of syllables per token
"""
dic = Pyphen(lang=doc.lang_)

def count_syl(token):
word_hyphenated = dic.inserted(token.lower_)
return max(1, word_hyphenated.count("-") + 1)

return [count_syl(token) for token in doc._.filtered_tokens]
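
# Illustrative Pyphen behaviour (from the pyphen docs; actual output depends
# on the installed hyphenation dictionaries):
#
#     Pyphen(lang="en_US").inserted("hyphenation")  # -> "hy-phen-ation"
#
# count_syl counts the resulting pieces (2 hyphens + 1 = 3 here), so
# hyphenation points serve as an approximation of syllable boundaries.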

"""
import spacy
nlp = spacy.load('da_core_news_sm')
nlp.add_pipe("utilities", last=True)
doc = nlp("Det her er en testsætning. Her er sætning nummer 2")
for sent_i, sent in enumerate(doc.sents):
for token in sent:
print(sent_i, token.i, token.text)
doc._.n_words
doc._.filtered_tokens
doc._.n_sentences
doc._.n_syllables
"""
