Commit 48e1b3c: add tests, minor fixes
HLasse committed Jul 27, 2021 (1 parent: 483c86e)

Showing 18 changed files with 990 additions and 34 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -130,3 +130,5 @@ dmypy.json
 
 # Mac stuff
 .DStore
+
+.vscode
1 change: 0 additions & 1 deletion README.md
@@ -103,7 +103,6 @@ The table below shows the metrics included in TextDescriptives and their attributes
 | `Doc._.readability` | `readability` | Dict containing Flesch Reading Ease, Flesch-Kincaid Grade, SMOG, Gunning-Fog, Automated Readability Index, Coleman-Liau Index, LIX, and RIX readability metrics for the Doc. |
 | `Doc._.dependency_distance` | `dependency_distance` | Dict containing the mean and standard deviation of the dependency distance and proportion adjacent dependency relations in the Doc.|
 | `Span._.token_length` | `descriptive_stats` | Dict containing mean, median, and std of token length in the span. |
-| `Span._.syllables` | `descriptive_stats` | Dict containing mean, median, and std of number of syllables per token in the span. |
 | `Span._.counts` | `descriptive_stats` | Dict containing the number of tokens, number of unique tokens, proportion unique tokens, and number of characters in the span. |
 | `Span._.dependency_distance` | `dependency_distance` | Dict containing the mean dependency distance and proportion adjacent dependency relations in the Doc.|
 | `Token._.dependency_distance` | `dependency_distance` | Dict containing the dependency distance and whether the head word is adjacent for a Token.|
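For orientation, a short usage sketch of the extensions listed above. The `en_core_web_sm` model and the combined `textdescriptives` factory name are assumptions for illustration, not confirmed by the visible diff:

```python
import spacy
import textdescriptives  # noqa: F401  (assumed to register the factories on import)

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textdescriptives")
doc = nlp("The world is changed. I feel it in the water.")

print(doc._.readability["flesch_reading_ease"])       # Doc-level metric
print(doc[0:4]._.counts["proportion_unique_tokens"])  # Span-level metric
```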
2 changes: 1 addition & 1 deletion setup.py
@@ -22,7 +22,7 @@
     author="Lasse Hansen",
     author_email="[email protected]",
     url="https://github.com/HLasse/textdescriptives",
-    packages=["spacy-textdescriptives"],
+    packages=["textdescriptives"],
     install_requires=[
         "spacy>=3.0.3",
         "numpy>=1.20.0",
3 changes: 2 additions & 1 deletion textdescriptives/__init__.py
@@ -1,2 +1,3 @@
 from .load_components import TextDescriptives
-from .extractor import extract_df
+from .components import DescriptiveStatistics, Readability, DependencyDistance
+from .extractor import extract_df, readability_cols, dependency_cols, descriptive_stats_cols
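A minimal sketch of the widened public API; that `extract_df` accepts a processed `Doc` and that the `*_cols` lists enumerate per-component column names are assumptions from the names alone:

```python
import spacy
import textdescriptives as td

nlp = spacy.blank("en")
nlp.add_pipe("descriptive_stats")  # auto-adds a sentencizer; see the change further down
doc = nlp("One sentence here. And a second, slightly longer one.")

df = td.extract_df(doc)  # presumably one row of metrics for the doc
print([c for c in df.columns if c in td.descriptive_stats_cols])
```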
6 changes: 3 additions & 3 deletions textdescriptives/components/__init__.py
@@ -1,3 +1,3 @@
-from .readability import create_readability_component
-from .dependency_distance import create_dependency_distance_component
-from .descriptive_stats import create_descriptive_stats_component
+from .readability import Readability
+from .dependency_distance import DependencyDistance
+from .descriptive_stats import DescriptiveStatistics
7 changes: 7 additions & 0 deletions textdescriptives/components/dependency_distance.py
@@ -46,6 +46,13 @@ def span_dependency(self, span: Span):

     def doc_dependency(self, doc: Doc):
         """Doc-level dependency distance aggregated on sentence level"""
+        if len(doc) == 0:
+            return {
+                "dependency_distance_mean": np.nan,
+                "dependency_distance_std": np.nan,
+                "prop_adjacent_dependency_relation_mean": np.nan,
+                "prop_adjacent_dependency_relation_std": np.nan,
+            }
         dep_dists, adj_deps = zip(
             *[sent._.dependency_distance.values() for sent in doc.sents]
         )
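A sketch of what the new guard buys; the `dependency_distance` factory name and the parsed `en_core_web_sm` pipeline are assumptions:

```python
import spacy
import textdescriptives  # noqa: F401

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("dependency_distance")

# An empty doc previously failed when unpacking zip(*[]) over zero
# sentences; with the guard it returns nan for every value.
print(nlp("")._.dependency_distance["dependency_distance_mean"])  # nan
```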
17 changes: 12 additions & 5 deletions textdescriptives/components/descriptive_stats.py
@@ -9,6 +9,9 @@

 @Language.factory("descriptive_stats")
 def create_descriptive_stats_component(nlp: Language, name: str):
+    sentencizers = set(["sentencizer", "parser"])
+    if not sentencizers.intersection(set(nlp.pipe_names)):
+        nlp.add_pipe("sentencizer")  # add a sentencizer if not one in pipe
     return DescriptiveStatistics(nlp)
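In effect, bare pipelines no longer need a manual sentencizer before this component. A small sketch, assuming importing textdescriptives registers the factory:

```python
import spacy
import textdescriptives  # noqa: F401

nlp = spacy.blank("en")            # no parser, no sentencizer
nlp.add_pipe("descriptive_stats")  # the factory inserts a sentencizer first
print(nlp.pipe_names)              # expected: ['sentencizer', 'descriptive_stats']
```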


@@ -35,7 +38,7 @@ def __init__(self, nlp: Language):
             self.counts,
         ]
         for ext, fun in zip(extensions, ext_funs):
-            if ext not in ["_n_sentences", "sentence_length"]:
+            if ext not in ["_n_sentences", "sentence_length", "syllables"]:
                 if not Span.has_extension(ext):
                     Span.set_extension(ext, getter=fun)
                 if not Doc.has_extension(ext):
@@ -60,7 +63,7 @@ def token_length(self, doc: Union[Doc, Span]):
"token_length_std": np.std(token_lengths),
}

def sentence_length(self, doc: Union[Doc, Span]):
def sentence_length(self, doc: Doc):
"""Return dict with measures of sentence length"""
# get length of filtered tokens per sentence
tokenized_sentences = [
@@ -78,7 +81,7 @@ def sentence_length(self, doc: Union[Doc, Span]):
"sentence_length_std": np.std(len_sentences),
}

def syllables(self, doc: Union[Doc, Span]):
def syllables(self, doc: Doc):
"""Return dict with measures of syllables per token"""
n_syllables = doc._._n_syllables
return {
@@ -95,12 +98,16 @@ def counts(self, doc: Union[Doc, Span], ignore_whitespace: bool = True):
         else:
             n_chars = len(doc.text)
 
+        if n_tokens == 0:
+            prop_unique_tokens = np.nan
+        else:
+            prop_unique_tokens = n_types / n_tokens
         out = {
             "n_tokens": n_tokens,
             "n_unique_tokens": n_types,
-            "percent_unique_tokens": n_types / n_tokens,
+            "proportion_unique_tokens": prop_unique_tokens,
             "n_characters": n_chars,
         }
         if type(doc) == Doc:
             out["n_sentences"] = doc._._n_sentences
-            return out
+        return out
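A sketch of the rename plus the divide-by-zero guard, under the same `descriptive_stats` setup as above:

```python
import spacy
import textdescriptives  # noqa: F401

nlp = spacy.blank("en")
nlp.add_pipe("descriptive_stats")

print(nlp("one two two")._.counts["proportion_unique_tokens"])  # 2 types / 3 tokens ≈ 0.67
print(nlp("")._.counts["proportion_unique_tokens"])             # nan instead of ZeroDivisionError
```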
52 changes: 35 additions & 17 deletions textdescriptives/components/readability.py
@@ -1,7 +1,10 @@
"""Calculation of various readability metrics"""
from textdescriptives.components.utils import n_sentences
from spacy.tokens import Doc
from spacy.language import Language

import numpy as np

from .descriptive_stats import create_descriptive_stats_component


@@ -47,48 +50,54 @@ def _flesch_reading_ease(self, doc: Doc):
         Higher = easier to read
         Works best for English
         """
-        score = (
-            206.835
-            - (1.015 * doc._.sentence_length["sentence_length_mean"])
-            - (84.6 * doc._.syllables["syllables_per_token_mean"])
-        )
+        avg_sentence_length = doc._.sentence_length["sentence_length_mean"]
+        avg_syl_per_word = doc._.syllables["syllables_per_token_mean"]
+        if avg_sentence_length == 0 or avg_syl_per_word == 0:
+            return np.nan
+        score = 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syl_per_word)
         return score
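For intuition, the refactored formula with illustrative numbers (not taken from the repo):

```python
# A doc averaging 15 tokens per sentence and 1.4 syllables per token:
score = 206.835 - 1.015 * 15 - 84.6 * 1.4
print(round(score, 2))  # 73.17, "fairly easy" on the usual Flesch scale
```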

     def _flesch_kincaid_grade(self, doc: Doc):
         """
         Score = grade required to read the text
         """
-        score = (
-            0.39 * doc._.sentence_length["sentence_length_mean"]
-            + 11.8 * doc._.syllables["syllables_per_token_mean"]
-            - 15.59
-        )
+        avg_sentence_length = doc._.sentence_length["sentence_length_mean"]
+        avg_syl_per_word = doc._.syllables["syllables_per_token_mean"]
+        if avg_sentence_length == 0 or avg_syl_per_word == 0:
+            return np.nan
+        score = 0.39 * avg_sentence_length + 11.8 * avg_syl_per_word - 15.59
         return score

     def _smog(self, doc: Doc, hard_words: int):
         """
         grade level = 1.043 * sqrt(30 * (hard words / n sentences)) + 3.1291
         Preferably need 30+ sentences. Will not work with less than 4
         """
-        if doc._._n_sentences >= 3:
-            smog = (1.043 * (30 * (hard_words / doc._._n_sentences)) ** 0.5) + 3.1291
+        n_sentences = doc._._n_sentences
+        if n_sentences >= 3:
+            smog = (1.043 * (30 * (hard_words / n_sentences)) ** 0.5) + 3.1291
             return smog
         else:
-            return 0.0
+            return np.nan

     def _gunning_fog(self, doc, hard_words: int):
         """
         Grade level = 0.4 * ((avg_sentence_length) + (percentage hard words))
         hard words = 3+ syllables
         """
+        n_tokens = doc._._n_tokens
+        if n_tokens == 0:
+            return np.nan
         avg_sent_len = doc._.sentence_length["sentence_length_mean"]
-        percent_hard_words = (hard_words / doc._._n_tokens) * 100
+        percent_hard_words = (hard_words / n_tokens) * 100
         return 0.4 * (avg_sent_len + percent_hard_words)

     def _automated_readability_index(self, doc: Doc):
         """
         Score = grade required to read the text
         """
+        if len(doc) == 0:
+            return np.nan
         score = (
             4.71 * doc._.token_length["token_length_mean"]
             + 0.5 * doc._.sentence_length["sentence_length_mean"]
Expand All @@ -102,17 +111,26 @@ def _coleman_liau_index(self, doc: Doc):
         0.296 * avg num of sents pr 100 words -15.8
         Score = grade required to read the text
         """
+        n_tokens = doc._._n_tokens
+        if n_tokens == 0:
+            return np.nan
         l = doc._.token_length["token_length_mean"] * 100
-        s = (doc._._n_sentences / doc._.sentence_length["sentence_length_mean"]) * 100
+        s = (doc._._n_sentences / n_tokens) * 100
         return 0.0588 * l - 0.296 * s - 15.8

     def _lix(self, doc: Doc, long_words: int):
         """
         (n_words / n_sentences) + (n_words longer than 6 letters * 100) / n_words
         """
-        percent_long_words = long_words / doc._._n_tokens * 100
+        n_tokens = doc._._n_tokens
+        if n_tokens == 0:
+            return np.nan
+        percent_long_words = long_words / n_tokens * 100
         return doc._.sentence_length["sentence_length_mean"] + percent_long_words

     def _rix(self, doc: Doc, long_words: int):
         """n_long_words / n_sentences"""
-        return long_words / doc._._n_sentences
+        n_sentences = doc._._n_sentences
+        if n_sentences == 0:
+            return np.nan
+        return long_words / n_sentences
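Taken together, these guards turn degenerate input into nan instead of exceptions (or SMOG's misleading 0.0). A sketch, where the `readability` factory name is an assumption:

```python
import math

import spacy
import textdescriptives  # noqa: F401

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("readability")
print(all(math.isnan(v) for v in nlp("")._.readability.values()))  # expected: True
```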
3 changes: 2 additions & 1 deletion textdescriptives/components/utils.py
@@ -24,10 +24,11 @@ def n_tokens(doc: Union[Doc, Span]):
     return len(doc._._filtered_tokens)
 
 
-def n_syllables(doc: Union[Doc, Span]):
+def n_syllables(doc: Doc):
     """
     Return number of syllables per token
     """
+
     dic = Pyphen(lang=doc.lang_)
 
     def count_syl(token: Token):
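For reference, the hyphenation trick `n_syllables` builds on: Pyphen's break points plus one approximate the syllable count. Illustrative only, not the exact repo code:

```python
from pyphen import Pyphen

dic = Pyphen(lang="en")
print(dic.inserted("descriptive"))            # e.g. "de-scrip-tive"
print(len(dic.positions("descriptive")) + 1)  # 3 with the hyphenation above
```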
2 changes: 1 addition & 1 deletion textdescriptives/extractor.py
@@ -135,7 +135,7 @@ def extract_df(
"syllables_per_token_std",
"n_tokens",
"n_unique_tokens",
"percent_unique_tokens",
"proportion_unique_tokens",
"n_sentences",
"n_characters",
]
8 changes: 4 additions & 4 deletions textdescriptives/load_components.py
@@ -1,8 +1,8 @@
"""Adds all components to a spaCy pipeline"""
from components import (
create_readability_component,
create_dependency_distance_component,
create_descriptive_stats_component,
from .components import (
Readability,
DependencyDistance,
DescriptiveStatistics,
)

from spacy.language import Language
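Finally, a sketch of the one-stop component this module wires up; the `textdescriptives` factory name is inferred from the docstring, not confirmed by the visible diff:

```python
import spacy
import textdescriptives  # noqa: F401

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textdescriptives")  # presumably adds all three components
doc = nlp("A short sanity check.")
print(sorted(doc._.readability))  # the metric names from the README table
```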
File renamed without changes.