Commit 48e1b3c: add tests, minor fixes
HLasse committed Jul 27, 2021 (1 parent: 483c86e)

Showing 18 changed files with 990 additions and 34 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -130,3 +130,5 @@ dmypy.json
 
 # Mac stuff
 .DStore
+
+.vscode
1 change: 0 additions & 1 deletion README.md
@@ -103,7 +103,6 @@ The table below shows the metrics included in TextDescriptives and their attributes
 | `Doc._.readability` | `readability` | Dict containing Flesch Reading Ease, Flesch-Kincaid Grade, SMOG, Gunning-Fog, Automated Readability Index, Coleman-Liau Index, LIX, and RIX readability metrics for the Doc. |
 | `Doc._.dependency_distance` | `dependency_distance` | Dict containing the mean and standard deviation of the dependency distance and proportion adjacent dependency relations in the Doc.|
 | `Span._.token_length` | `descriptive_stats` | Dict containing mean, median, and std of token length in the span. |
-| `Span._.syllables` | `descriptive_stats` | Dict containing mean, median, and std of number of syllables per token in the span. |
 | `Span._.counts` | `descriptive_stats` | Dict containing the number of tokens, number of unique tokens, proportion unique tokens, and number of characters in the span. |
 | `Span._.dependency_distance` | `dependency_distance` | Dict containing the mean dependency distance and proportion adjacent dependency relations in the Doc.|
 | `Token._.dependency_distance` | `dependency_distance` | Dict containing the dependency distance and whether the head word is adjacent for a Token.|
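For orientation, a short usage sketch of the extensions listed above. The `en_core_web_sm` model and the combined `textdescriptives` factory name are assumptions for illustration, not confirmed by the visible diff:

```python
import spacy
import textdescriptives  # noqa: F401  (assumed to register the factories on import)

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textdescriptives")
doc = nlp("The world is changed. I feel it in the water.")

print(doc._.readability["flesch_reading_ease"])       # Doc-level metric
print(doc[0:4]._.counts["proportion_unique_tokens"])  # Span-level metric
```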
2 changes: 1 addition & 1 deletion setup.py
@@ -22,7 +22,7 @@
     author="Lasse Hansen",
     author_email="[email protected]",
     url="https://github.com/HLasse/textdescriptives",
-    packages=["spacy-textdescriptives"],
+    packages=["textdescriptives"],
     install_requires=[
         "spacy>=3.0.3",
         "numpy>=1.20.0",
3 changes: 2 additions & 1 deletion textdescriptives/__init__.py
@@ -1,2 +1,3 @@
 from .load_components import TextDescriptives
-from .extractor import extract_df
+from .components import DescriptiveStatistics, Readability, DependencyDistance
+from .extractor import extract_df, readability_cols, dependency_cols, descriptive_stats_cols
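A minimal sketch of the widened public API; that `extract_df` accepts a processed `Doc` and that the `*_cols` lists enumerate per-component column names are assumptions from the names alone:

```python
import spacy
import textdescriptives as td

nlp = spacy.blank("en")
nlp.add_pipe("descriptive_stats")  # auto-adds a sentencizer; see the change further down
doc = nlp("One sentence here. And a second, slightly longer one.")

df = td.extract_df(doc)  # presumably one row of metrics for the doc
print([c for c in df.columns if c in td.descriptive_stats_cols])
```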
6 changes: 3 additions & 3 deletions textdescriptives/components/__init__.py
@@ -1,3 +1,3 @@
-from .readability import create_readability_component
-from .dependency_distance import create_dependency_distance_component
-from .descriptive_stats import create_descriptive_stats_component
+from .readability import Readability
+from .dependency_distance import DependencyDistance
+from .descriptive_stats import DescriptiveStatistics
7 changes: 7 additions & 0 deletions textdescriptives/components/dependency_distance.py
@@ -46,6 +46,13 @@ def span_dependency(self, span: Span):

     def doc_dependency(self, doc: Doc):
         """Doc-level dependency distance aggregated on sentence level"""
+        if len(doc) == 0:
+            return {
+                "dependency_distance_mean": np.nan,
+                "dependency_distance_std": np.nan,
+                "prop_adjacent_dependency_relation_mean": np.nan,
+                "prop_adjacent_dependency_relation_std": np.nan,
+            }
         dep_dists, adj_deps = zip(
             *[sent._.dependency_distance.values() for sent in doc.sents]
         )
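A sketch of what the new guard buys; the `dependency_distance` factory name and the parsed `en_core_web_sm` pipeline are assumptions:

```python
import spacy
import textdescriptives  # noqa: F401

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("dependency_distance")

# An empty doc previously failed when unpacking zip(*[]) over zero
# sentences; with the guard it returns nan for every value.
print(nlp("")._.dependency_distance["dependency_distance_mean"])  # nan
```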
17 changes: 12 additions & 5 deletions textdescriptives/components/descriptive_stats.py
@@ -9,6 +9,9 @@

 @Language.factory("descriptive_stats")
 def create_descriptive_stats_component(nlp: Language, name: str):
+    sentencizers = set(["sentencizer", "parser"])
+    if not sentencizers.intersection(set(nlp.pipe_names)):
+        nlp.add_pipe("sentencizer")  # add a sentencizer if not one in pipe
     return DescriptiveStatistics(nlp)
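In effect, bare pipelines no longer need a manual sentencizer before this component. A small sketch, assuming importing textdescriptives registers the factory:

```python
import spacy
import textdescriptives  # noqa: F401

nlp = spacy.blank("en")            # no parser, no sentencizer
nlp.add_pipe("descriptive_stats")  # the factory inserts a sentencizer first
print(nlp.pipe_names)              # expected: ['sentencizer', 'descriptive_stats']
```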


@@ -35,7 +38,7 @@ def __init__(self, nlp: Language):
             self.counts,
         ]
         for ext, fun in zip(extensions, ext_funs):
-            if ext not in ["_n_sentences", "sentence_length"]:
+            if ext not in ["_n_sentences", "sentence_length", "syllables"]:
                 if not Span.has_extension(ext):
                     Span.set_extension(ext, getter=fun)
                 if not Doc.has_extension(ext):
@@ -60,7 +63,7 @@ def token_length(self, doc: Union[Doc, Span]):
"token_length_std": np.std(token_lengths),
}

def sentence_length(self, doc: Union[Doc, Span]):
def sentence_length(self, doc: Doc):
"""Return dict with measures of sentence length"""
# get length of filtered tokens per sentence
tokenized_sentences = [
@@ -78,7 +81,7 @@ def sentence_length(self, doc: Union[Doc, Span]):
"sentence_length_std": np.std(len_sentences),
}

def syllables(self, doc: Union[Doc, Span]):
def syllables(self, doc: Doc):
"""Return dict with measures of syllables per token"""
n_syllables = doc._._n_syllables
return {
@@ -95,12 +98,16 @@ def counts(self, doc: Union[Doc, Span], ignore_whitespace: bool = True):
         else:
             n_chars = len(doc.text)
 
+        if n_tokens == 0:
+            prop_unique_tokens = np.nan
+        else:
+            prop_unique_tokens = n_types / n_tokens
         out = {
             "n_tokens": n_tokens,
             "n_unique_tokens": n_types,
-            "percent_unique_tokens": n_types / n_tokens,
+            "proportion_unique_tokens": prop_unique_tokens,
             "n_characters": n_chars,
         }
         if type(doc) == Doc:
             out["n_sentences"] = doc._._n_sentences
-            return out
+        return out
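A sketch of the rename plus the divide-by-zero guard, under the same `descriptive_stats` setup as above:

```python
import spacy
import textdescriptives  # noqa: F401

nlp = spacy.blank("en")
nlp.add_pipe("descriptive_stats")

print(nlp("one two two")._.counts["proportion_unique_tokens"])  # 2 types / 3 tokens ≈ 0.67
print(nlp("")._.counts["proportion_unique_tokens"])             # nan instead of ZeroDivisionError
```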
52 changes: 35 additions & 17 deletions textdescriptives/components/readability.py
@@ -1,7 +1,10 @@
"""Calculation of various readability metrics"""
from textdescriptives.components.utils import n_sentences
from spacy.tokens import Doc
from spacy.language import Language

import numpy as np

from .descriptive_stats import create_descriptive_stats_component


@@ -47,48 +50,54 @@ def _flesch_reading_ease(self, doc: Doc):
         Higher = easier to read
         Works best for English
         """
-        score = (
-            206.835
-            - (1.015 * doc._.sentence_length["sentence_length_mean"])
-            - (84.6 * doc._.syllables["syllables_per_token_mean"])
-        )
+        avg_sentence_length = doc._.sentence_length["sentence_length_mean"]
+        avg_syl_per_word = doc._.syllables["syllables_per_token_mean"]
+        if avg_sentence_length == 0 or avg_syl_per_word == 0:
+            return np.nan
+        score = 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syl_per_word)
         return score
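For intuition, the refactored formula with illustrative numbers (not taken from the repo):

```python
# A doc averaging 15 tokens per sentence and 1.4 syllables per token:
score = 206.835 - 1.015 * 15 - 84.6 * 1.4
print(round(score, 2))  # 73.17, "fairly easy" on the usual Flesch scale
```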

     def _flesch_kincaid_grade(self, doc: Doc):
         """
         Score = grade required to read the text
         """
-        score = (
-            0.39 * doc._.sentence_length["sentence_length_mean"]
-            + 11.8 * doc._.syllables["syllables_per_token_mean"]
-            - 15.59
-        )
+        avg_sentence_length = doc._.sentence_length["sentence_length_mean"]
+        avg_syl_per_word = doc._.syllables["syllables_per_token_mean"]
+        if avg_sentence_length == 0 or avg_syl_per_word == 0:
+            return np.nan
+        score = 0.39 * avg_sentence_length + 11.8 * avg_syl_per_word - 15.59
         return score

     def _smog(self, doc: Doc, hard_words: int):
         """
         grade level = 1.043 * sqrt(30 * (hard words / n sentences)) + 3.1291
         Preferably need 30+ sentences. Will not work with less than 4
         """
-        if doc._._n_sentences >= 3:
-            smog = (1.043 * (30 * (hard_words / doc._._n_sentences)) ** 0.5) + 3.1291
+        n_sentences = doc._._n_sentences
+        if n_sentences >= 3:
+            smog = (1.043 * (30 * (hard_words / n_sentences)) ** 0.5) + 3.1291
             return smog
         else:
-            return 0.0
+            return np.nan

     def _gunning_fog(self, doc, hard_words: int):
         """
         Grade level = 0.4 * ((avg_sentence_length) + (percentage hard words))
         hard words = 3+ syllables
         """
+        n_tokens = doc._._n_tokens
+        if n_tokens == 0:
+            return np.nan
         avg_sent_len = doc._.sentence_length["sentence_length_mean"]
-        percent_hard_words = (hard_words / doc._._n_tokens) * 100
+        percent_hard_words = (hard_words / n_tokens) * 100
         return 0.4 * (avg_sent_len + percent_hard_words)

     def _automated_readability_index(self, doc: Doc):
         """
         Score = grade required to read the text
         """
+        if len(doc) == 0:
+            return np.nan
         score = (
             4.71 * doc._.token_length["token_length_mean"]
             + 0.5 * doc._.sentence_length["sentence_length_mean"]
Expand All @@ -102,17 +111,26 @@ def _coleman_liau_index(self, doc: Doc):
         0.296 * avg num of sents pr 100 words -15.8
         Score = grade required to read the text
         """
+        n_tokens = doc._._n_tokens
+        if n_tokens == 0:
+            return np.nan
         l = doc._.token_length["token_length_mean"] * 100
-        s = (doc._._n_sentences / doc._.sentence_length["sentence_length_mean"]) * 100
+        s = (doc._._n_sentences / n_tokens) * 100
         return 0.0588 * l - 0.296 * s - 15.8

     def _lix(self, doc: Doc, long_words: int):
         """
         (n_words / n_sentences) + (n_words longer than 6 letters * 100) / n_words
         """
-        percent_long_words = long_words / doc._._n_tokens * 100
+        n_tokens = doc._._n_tokens
+        if n_tokens == 0:
+            return np.nan
+        percent_long_words = long_words / n_tokens * 100
         return doc._.sentence_length["sentence_length_mean"] + percent_long_words

     def _rix(self, doc: Doc, long_words: int):
         """n_long_words / n_sentences"""
-        return long_words / doc._._n_sentences
+        n_sentences = doc._._n_sentences
+        if n_sentences == 0:
+            return np.nan
+        return long_words / n_sentences
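Taken together, these guards turn degenerate input into nan instead of exceptions (or SMOG's misleading 0.0). A sketch, where the `readability` factory name is an assumption:

```python
import math

import spacy
import textdescriptives  # noqa: F401

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("readability")
print(all(math.isnan(v) for v in nlp("")._.readability.values()))  # expected: True
```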
3 changes: 2 additions & 1 deletion textdescriptives/components/utils.py
@@ -24,10 +24,11 @@ def n_tokens(doc: Union[Doc, Span]):
     return len(doc._._filtered_tokens)
 
 
-def n_syllables(doc: Union[Doc, Span]):
+def n_syllables(doc: Doc):
     """
     Return number of syllables per token
     """
+
     dic = Pyphen(lang=doc.lang_)
 
     def count_syl(token: Token):
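For reference, the hyphenation trick `n_syllables` builds on: Pyphen's break points plus one approximate the syllable count. Illustrative only, not the exact repo code:

```python
from pyphen import Pyphen

dic = Pyphen(lang="en")
print(dic.inserted("descriptive"))            # e.g. "de-scrip-tive"
print(len(dic.positions("descriptive")) + 1)  # 3 with the hyphenation above
```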
2 changes: 1 addition & 1 deletion textdescriptives/extractor.py
@@ -135,7 +135,7 @@ def extract_df(
"syllables_per_token_std",
"n_tokens",
"n_unique_tokens",
"percent_unique_tokens",
"proportion_unique_tokens",
"n_sentences",
"n_characters",
]
8 changes: 4 additions & 4 deletions textdescriptives/load_components.py
@@ -1,8 +1,8 @@
"""Adds all components to a spaCy pipeline"""
from components import (
create_readability_component,
create_dependency_distance_component,
create_descriptive_stats_component,
from .components import (
Readability,
DependencyDistance,
DescriptiveStatistics,
)

from spacy.language import Language
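Finally, a sketch of the one-stop component this module wires up; the `textdescriptives` factory name is inferred from the docstring, not confirmed by the visible diff:

```python
import spacy
import textdescriptives  # noqa: F401

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textdescriptives")  # presumably adds all three components
doc = nlp("A short sanity check.")
print(sorted(doc._.readability))  # the metric names from the README table
```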
File renamed without changes.