Showing 4 changed files with 180 additions and 0 deletions.
@@ -127,3 +127,6 @@ dmypy.json

# Pyre type checker
.pyre/

# Mac stuff
.DS_Store
Empty file.
@@ -0,0 +1,101 @@
"""Calculation of descriptive statistics"""
from spacy.tokens import Doc
from spacy.language import Language

import numpy as np


@Language.factory("descriptive_stats")
def create_descriptive_statistics_component(nlp: Language, name: str):
    return DescriptiveStatistics(nlp)


class DescriptiveStatistics:
    def __init__(self, nlp: Language):
        """Initialise components"""
        if not Doc.has_extension("token_length"):
            Doc.set_extension("token_length", getter=self.token_length)

        if not Doc.has_extension("sentence_length"):
            Doc.set_extension("sentence_length", getter=self.sentence_length)

        if not Doc.has_extension("syllables"):
            Doc.set_extension("syllables", getter=self.syllables)

        if not Doc.has_extension("counts"):
            Doc.set_extension("counts", getter=self.counts)

    def __call__(self, doc):
        """Run the pipeline component"""
        return doc

    def token_length(self, doc):
        """Return dict with measures of token length"""
        token_lengths = [len(token) for token in doc._.filtered_tokens]
        return {
            "token_length_mean": np.mean(token_lengths),
            "token_length_median": np.median(token_lengths),
            "token_length_std": np.std(token_lengths),
        }

    def sentence_length(self, doc):
        """Return dict with measures of sentence length"""
        # get length of filtered tokens per sentence
        tokenized_sentences = [
            [
                token.text
                for token in sent
                if not token.is_punct and "'" not in token.text
            ]
            for sent in doc.sents
        ]
        len_sentences = [len(sentence) for sentence in tokenized_sentences]
        return {
            "sentence_length_mean": np.mean(len_sentences),
            "sentence_length_median": np.median(len_sentences),
            "sentence_length_std": np.std(len_sentences),
        }

    def syllables(self, doc):
        """Return dict with measures of syllables per token"""
        return {
            "syllables_per_token_mean": np.mean(doc._.n_syllables),
            "syllables_per_token_median": np.median(doc._.n_syllables),
            "syllables_per_token_std": np.std(doc._.n_syllables),
        }

    def counts(self, doc, ignore_whitespace=True):
        """Return dict with token, type, sentence and character counts"""
        n_tokens = len(doc._.filtered_tokens)
        n_types = len(set([tok.lower_ for tok in doc._.filtered_tokens]))
        if ignore_whitespace:
            n_chars = len(doc.text.replace(" ", ""))
        else:
            n_chars = len(doc.text)
        return {
            "n_tokens": n_tokens,
            "n_unique_tokens": n_types,
            "percent_unique_tokens": n_types / n_tokens,
            "n_sentences": doc._.n_sentences,
            "n_characters": n_chars,
        }


"""
import spacy
from utils import create_utils_component
nlp = spacy.load('da_core_news_sm')
nlp.add_pipe("utilities", last=True)
nlp.add_pipe("descriptive_stats", last=True)
doc = nlp("Det her er en testsætning. Her er sætning nummer 2")
doc._.n_words
doc._.filtered_tokens
doc._.token_length
doc._.sentence_length
doc._.n_sentences
doc._.n_syllables
doc._.counts
"""
@@ -0,0 +1,76 @@
"""Utility functions for calculating various text descriptives"""
from spacy.tokens import Doc
from spacy.language import Language
from pyphen import Pyphen


@Language.factory("utilities")
def create_utils_component(nlp: Language, name: str):
    return Utils(nlp)


class Utils:
    def __init__(self, nlp: Language):
        """Initialise components.
        Only calculate n_sentences, n_words, n_syllables when needed using getters"""
        if not Doc.has_extension("n_sentences"):
            Doc.set_extension("n_sentences", getter=self.n_sentences)

        if not Doc.has_extension("n_words"):
            Doc.set_extension("n_words", getter=self.n_words)

        if not Doc.has_extension("n_syllables"):
            Doc.set_extension("n_syllables", getter=self.n_syllables)

        if not Doc.has_extension("filtered_tokens"):
            Doc.set_extension("filtered_tokens", default=[])

    def __call__(self, doc):
        """Run the pipeline component"""
        doc._.filtered_tokens = self.filtered_tokens(doc)
        return doc

    def filtered_tokens(self, doc: Doc):
        """Return words in the document.
        Filters out punctuation and words that contain an apostrophe (contractions)"""
        filtered_tokens = [
            word for word in doc if not word.is_punct and "'" not in word.text
        ]
        return filtered_tokens

    def n_sentences(self, doc: Doc):
        """Return number of sentences in the document"""
        return len(list(doc.sents))

    def n_words(self, doc: Doc):
        """Return number of words in the document"""
        return len(doc._.filtered_tokens)

    def n_syllables(self, doc):
        """Return number of syllables per token"""
        dic = Pyphen(lang=doc.lang_)

        def count_syl(token):
            word_hyphenated = dic.inserted(token.lower_)
            return max(1, word_hyphenated.count("-") + 1)

        return [count_syl(token) for token in doc._.filtered_tokens]


"""
import spacy
nlp = spacy.load('da_core_news_sm')
nlp.add_pipe("utilities", last=True)
doc = nlp("Det her er en testsætning. Her er sætning nummer 2")
for sent_i, sent in enumerate(doc.sents):
    for token in sent:
        print(sent_i, token.i, token.text)
doc._.n_words
doc._.filtered_tokens
doc._.n_sentences
doc._.n_syllables
"""