add utils and desc stats
HLasse committed Jul 23, 2021
1 parent 4c25726 commit 6fdcd3b
Showing 4 changed files with 180 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -127,3 +127,6 @@ dmypy.json

# Pyre type checker
.pyre/

# Mac stuff
.DS_Store
Empty file.
101 changes: 101 additions & 0 deletions spacy-textdescriptives/descriptive_stats.py
@@ -0,0 +1,101 @@
"""Calculation of descriptive statistics"""
from spacy.tokens import Doc
from spacy.language import Language

import numpy as np


@Language.factory("descriptive_stats")
def create_descriptive_statistics_component(nlp: Language, name: str):
return DescriptiveStatistics(nlp)
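
# Illustrative usage (mirrors the demo at the bottom of this file): the
# "utilities" component from utils.py must run first, since the getters
# below rely on doc._.filtered_tokens and doc._.n_syllables:
#
#     nlp.add_pipe("utilities", last=True)
#     nlp.add_pipe("descriptive_stats", last=True)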


class DescriptiveStatistics:
def __init__(self, nlp: Language):
"""Initialise components"""
if not Doc.has_extension("token_length"):
Doc.set_extension("token_length", getter=self.token_length)

if not Doc.has_extension("sentence_length"):
Doc.set_extension("sentence_length", getter=self.sentence_length)

if not Doc.has_extension("syllables"):
Doc.set_extension("syllables", getter=self.syllables)

if not Doc.has_extension("counts"):
Doc.set_extension("counts", getter=self.counts)

def __call__(self, doc):
"""Run the pipeline component"""
return doc

def token_length(self, doc):
"""Return dict with measures of token length"""
token_lengths = [len(token) for token in doc._.filtered_tokens]
return {
"token_length_mean": np.mean(token_lengths),
"token_length_median": np.median(token_lengths),
"token_length_std": np.std(token_lengths),
}

def sentence_length(self, doc):
"""Return dict with measures of sentence length"""
# get length of filtered tokens per sentence
tokenized_sentences = [
[
token.text
for token in sent
if not token.is_punct and "'" not in token.text
]
for sent in doc.sents
]
len_sentences = [len(sentence) for sentence in tokenized_sentences]
return {
"sentence_length_mean": np.mean(len_sentences),
"sentence_length_median": np.median(len_sentences),
"sentence_length_std": np.std(len_sentences),
}

def syllables(self, doc):
"""Return dict with measures of syllables per token"""
return {
"syllables_per_token_mean": np.mean(doc._.n_syllables),
"syllables_per_token_median": np.median(doc._.n_syllables),
"syllables_per_token_std": np.std(doc._.n_syllables),
}

    def counts(self, doc, ignore_whitespace=True):
        """Return dict with the number of tokens, unique tokens, sentences, and characters"""
n_tokens = len(doc._.filtered_tokens)
n_types = len(set([tok.lower_ for tok in doc._.filtered_tokens]))
if ignore_whitespace:
n_chars = len(doc.text.replace(" ", ""))
else:
n_chars = len(doc.text)
return {
"n_tokens": n_tokens,
"n_unique_tokens": n_types,
"percent_unique_tokens": n_types / n_tokens,
"n_sentences": doc._.n_sentences,
"n_characters": n_chars,
}
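
# Illustrative example: for the demo doc below, n_tokens is 10 and
# n_unique_tokens is 8 ("her" and "er" each occur twice after lowercasing),
# so percent_unique_tokens is 0.8 (assuming the Danish tokenizer splits the
# text as expected).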


"""
import spacy
from utils import create_utils_component
nlp = spacy.load('da_core_news_sm')
nlp.add_pipe("utilities", last=True)
nlp.add_pipe("descriptive_stats", last=True)
doc = nlp("Det her er en testsætning. Her er sætning nummer 2")
doc._.n_words
doc._.filtered_tokens
doc._.token_length
doc._.sentence_length
doc._.n_sentences
doc._.n_syllables
doc._.counts
"""
76 changes: 76 additions & 0 deletions spacy-textdescriptives/utils.py
@@ -0,0 +1,76 @@
"""Utility functions for calculating various text descriptives"""
from spacy.tokens import Doc
from spacy.language import Language
from pyphen import Pyphen


@Language.factory("utilities")
def create_utils_component(nlp: Language, name: str):
return Utils(nlp)


class Utils:
def __init__(self, nlp: Language):
"""Initialise components
Only calculate n_sentences, n_words, n_syllabes when needed using getters"""
if not Doc.has_extension("n_sentences"):
Doc.set_extension("n_sentences", getter=self.n_sentences)

if not Doc.has_extension("n_words"):
Doc.set_extension("n_words", getter=self.n_words)

if not Doc.has_extension("n_syllables"):
Doc.set_extension("n_syllables", getter=self.n_syllables)

if not Doc.has_extension("filtered_tokens"):
Doc.set_extension("filtered_tokens", default=[])
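        # Note: filtered_tokens has no getter; it is populated once per doc
        # in __call__ below, so the filtering is not recomputed on every
        # attribute access.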

def __call__(self, doc):
"""Run the pipeline component"""
doc._.filtered_tokens = self.filtered_tokens(doc)
return doc

def filtered_tokens(self, doc: Doc):
"""Return words in document.
Filters punctuation and words that start with an apostrophe (contractions)"""
filtered_tokens = [
word for word in doc if not word.is_punct and "'" not in word.text
]
return filtered_tokens

def n_sentences(self, doc: Doc):
"""Return number of sentences in the document"""
return len(list(doc.sents))

def n_words(self, doc: Doc):
"""Return number of words in the document."""
return len(doc._.filtered_tokens)

def n_syllables(self, doc):
"""
Return number of syllables per token
"""
dic = Pyphen(lang=doc.lang_)

def count_syl(token):
word_hyphenated = dic.inserted(token.lower_)
return max(1, word_hyphenated.count("-") + 1)

return [count_syl(token) for token in doc._.filtered_tokens]
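
# Illustrative Pyphen behaviour (from the pyphen docs; actual output depends
# on the installed hyphenation dictionaries):
#
#     Pyphen(lang="en_US").inserted("hyphenation")  # -> "hy-phen-ation"
#
# count_syl counts the resulting pieces (2 hyphens + 1 = 3 here), so
# hyphenation points serve as an approximation of syllable boundaries.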

"""
import spacy
nlp = spacy.load('da_core_news_sm')
nlp.add_pipe("utilities", last=True)
doc = nlp("Det her er en testsætning. Her er sætning nummer 2")
for sent_i, sent in enumerate(doc.sents):
for token in sent:
print(sent_i, token.i, token.text)
doc._.n_words
doc._.filtered_tokens
doc._.n_sentences
doc._.n_syllables
"""
