diff --git a/.gitignore b/.gitignore
index b6e47617..c17ef087 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,3 +127,6 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
+
+# Mac stuff
+.DS_Store
\ No newline at end of file
diff --git a/spacy-textdescriptives/__init__.py b/spacy-textdescriptives/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/spacy-textdescriptives/descriptive_stats.py b/spacy-textdescriptives/descriptive_stats.py
new file mode 100644
index 00000000..973d3338
--- /dev/null
+++ b/spacy-textdescriptives/descriptive_stats.py
@@ -0,0 +1,101 @@
+"""Calculation of descriptive statistics"""
+from spacy.tokens import Doc
+from spacy.language import Language
+
+import numpy as np
+
+
+@Language.factory("descriptive_stats")
+def create_descriptive_statistics_component(nlp: Language, name: str):
+    return DescriptiveStatistics(nlp)
+
+
+class DescriptiveStatistics:
+    def __init__(self, nlp: Language):
+        """Initialise components"""
+        if not Doc.has_extension("token_length"):
+            Doc.set_extension("token_length", getter=self.token_length)
+
+        if not Doc.has_extension("sentence_length"):
+            Doc.set_extension("sentence_length", getter=self.sentence_length)
+
+        if not Doc.has_extension("syllables"):
+            Doc.set_extension("syllables", getter=self.syllables)
+
+        if not Doc.has_extension("counts"):
+            Doc.set_extension("counts", getter=self.counts)
+
+    def __call__(self, doc):
+        """Run the pipeline component"""
+        return doc
+
+    def token_length(self, doc):
+        """Return dict with measures of token length"""
+        token_lengths = [len(token) for token in doc._.filtered_tokens]
+        return {
+            "token_length_mean": np.mean(token_lengths),
+            "token_length_median": np.median(token_lengths),
+            "token_length_std": np.std(token_lengths),
+        }
+
+    def sentence_length(self, doc):
+        """Return dict with measures of sentence length"""
+        # get length of filtered tokens per sentence
+        tokenized_sentences = [
+            [
+                token.text
+                for token in sent
+                if not token.is_punct and "'" not in token.text
+            ]
+            for sent in doc.sents
+        ]
+        len_sentences = [len(sentence) for sentence in tokenized_sentences]
+        return {
+            "sentence_length_mean": np.mean(len_sentences),
+            "sentence_length_median": np.median(len_sentences),
+            "sentence_length_std": np.std(len_sentences),
+        }
+
+    def syllables(self, doc):
+        """Return dict with measures of syllables per token"""
+        return {
+            "syllables_per_token_mean": np.mean(doc._.n_syllables),
+            "syllables_per_token_median": np.median(doc._.n_syllables),
+            "syllables_per_token_std": np.std(doc._.n_syllables),
+        }
+
+    def counts(self, doc, ignore_whitespace=True):
+        n_tokens = len(doc._.filtered_tokens)
+        n_types = len(set([tok.lower_ for tok in doc._.filtered_tokens]))
+        if ignore_whitespace:
+            n_chars = len(doc.text.replace(" ", ""))
+        else:
+            n_chars = len(doc.text)
+        return {
+            "n_tokens": n_tokens,
+            "n_unique_tokens": n_types,
+            "percent_unique_tokens": n_types / n_tokens,
+            "n_sentences": doc._.n_sentences,
+            "n_characters": n_chars,
+        }
+
+
+"""
+import spacy
+from utils import create_utils_component
+
+nlp = spacy.load('da_core_news_sm')
+nlp.add_pipe("utilities", last=True)
+nlp.add_pipe("descriptive_stats", last=True)
+
+doc = nlp("Det her er en testsætning. Her er sætning nummer 2")
Her er sætning nummer 2") + +doc._.n_words +doc._.filtered_tokens +doc._.token_length +doc._.sentence_length +doc._.n_sentences +doc._.n_syllables +doc._.counts + +""" diff --git a/spacy-textdescriptives/utils.py b/spacy-textdescriptives/utils.py new file mode 100644 index 00000000..8cf72678 --- /dev/null +++ b/spacy-textdescriptives/utils.py @@ -0,0 +1,76 @@ +"""Utility functions for calculating various text descriptives""" +from spacy.tokens import Doc +from spacy.language import Language +from pyphen import Pyphen + + +@Language.factory("utilities") +def create_utils_component(nlp: Language, name: str): + return Utils(nlp) + + +class Utils: + def __init__(self, nlp: Language): + """Initialise components + Only calculate n_sentences, n_words, n_syllabes when needed using getters""" + if not Doc.has_extension("n_sentences"): + Doc.set_extension("n_sentences", getter=self.n_sentences) + + if not Doc.has_extension("n_words"): + Doc.set_extension("n_words", getter=self.n_words) + + if not Doc.has_extension("n_syllables"): + Doc.set_extension("n_syllables", getter=self.n_syllables) + + if not Doc.has_extension("filtered_tokens"): + Doc.set_extension("filtered_tokens", default=[]) + + def __call__(self, doc): + """Run the pipeline component""" + doc._.filtered_tokens = self.filtered_tokens(doc) + return doc + + def filtered_tokens(self, doc: Doc): + """Return words in document. + Filters punctuation and words that start with an apostrophe (contractions)""" + filtered_tokens = [ + word for word in doc if not word.is_punct and "'" not in word.text + ] + return filtered_tokens + + def n_sentences(self, doc: Doc): + """Return number of sentences in the document""" + return len(list(doc.sents)) + + def n_words(self, doc: Doc): + """Return number of words in the document.""" + return len(doc._.filtered_tokens) + + def n_syllables(self, doc): + """ + Return number of syllables per token + """ + dic = Pyphen(lang=doc.lang_) + + def count_syl(token): + word_hyphenated = dic.inserted(token.lower_) + return max(1, word_hyphenated.count("-") + 1) + + return [count_syl(token) for token in doc._.filtered_tokens] + +""" +import spacy +nlp = spacy.load('da_core_news_sm') +nlp.add_pipe("utilities", last=True) + +doc = nlp("Det her er en testsætning. Her er sætning nummer 2") + +for sent_i, sent in enumerate(doc.sents): + for token in sent: + print(sent_i, token.i, token.text) +doc._.n_words +doc._.filtered_tokens +doc._.n_sentences +doc._.n_syllables + +"""