From 347ac8dc3e5fdc60ebe56c10c07bd6621d92b478 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Tue, 20 Sep 2022 16:34:13 +0200 Subject: [PATCH 01/11] feat: added quality estimation --- textdescriptives/components/quality.py | 383 +++++++++++++++++++++++++ textdescriptives/components/utils.py | 55 +++- 2 files changed, 432 insertions(+), 6 deletions(-) create mode 100644 textdescriptives/components/quality.py diff --git a/textdescriptives/components/quality.py b/textdescriptives/components/quality.py new file mode 100644 index 00000000..f603cfa7 --- /dev/null +++ b/textdescriptives/components/quality.py @@ -0,0 +1,383 @@ +"""Calculation of various readability metrics.""" +from collections import Counter, defaultdict +from functools import partial +from typing import Callable, Dict, List, Tuple, Union + +import numpy as np +from spacy.language import Language +from spacy.tokens import Doc, Span + +from textdescriptives.components.utils import span_getter_to_doc_getter + + +def n_stop_words(span: Span) -> int: + """Count the number of stop words in a document. + + Args: + span (Span): spaCy span object + + Returns: + int: number of stop words + """ + return len([t for t in span if t.is_stop]) + + +def mean_word_length(span: Span) -> float: + """Calculate the mean word length of a document. + + Args: + span (Span): spaCy span object + + Returns: + float: mean word length + """ + return np.mean([len(t) for t in span]) + + +def alpha_ratio(span: Span) -> float: + """The percentage of spacy tokens in this document which contain + at leat one alphabetic character. + + Args: + span (Span): spaCy span object + + Returns: + float: alpha ratio + """ + + def contains_alpha(token): + for char in token.text: + if char.isalpha(): + return True + return False + + return np.mean([contains_alpha(t) for t in span]) + + +def proportion_bullet_points( # pylint: disable=dangerous-default-value + span: Span, bullet_point: set = {"-", "*"} +) -> float: + """Calculate the proportion of lines which start with a bullet points in a span. + + Args: + span (Span): spaCy span object + + Returns: + float: proportion of bullet points + """ + lines = span._.lines + return np.mean([line.strip().startswith(bullet_point) for line in lines]) + + +def proportion_ellipsis( # pylint: disable=dangerous-default-value + span: Span, ellipsis: set = {"…", "..."} +) -> float: + """Calculate the proportion line which ends with an ellipsis in a span. + + Args: + span (Span): spaCy span object + + Returns: + float: proportion of ellipsis + """ + lines = span._.lines + return np.mean([line.strip().endswith(ellipsis) for line in lines]) + + +def duplicate_line_fraction(span: Span) -> float: + """Calculate the proportion of of characters within duplicate lines. + + Args: + span (Span): spaCy span object + + Returns: + float: proportion of characters within a duplicate lines + """ + lines = span._.lines + unique_lines = set(lines) + return 1 - len(unique_lines) / len(lines) + + +def duplicate_chr_fraction_getter(doc: Doc, attr: str) -> float: + """Calculate the character fraction of duplicates based on a counter object. + Args: + doc (Doc): + A spaCy Doc. + attr (str): + The document attribute to extract. + Returns: + float: + The fraction of duplicate characters. 
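+
+    Example:
+        >>> # illustrative sketch: if doc._.lines_counter were
+        >>> # Counter({"ab": 2, "c": 1}), the duplicate characters would
+        >>> # amount to len("ab") * (2 - 1) = 2, giving 2 / doc._.chr_len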
+ """ + counter = getattr(doc._, attr) + duplicate_chr = 0 + for t, c in counter.items(): + if c > 1: + duplicate_chr += len(t) * (c - 1) + frac = duplicate_chr / doc._.chr_len + return frac + + +def symbol_2_word_ratio(span: Span, symbol: str) -> float: + """Calculate the ratio of symbols to words in a span. + + Args: + span (Span): spaCy span object + ratio (float): ratio of symbols to words + symbol (str): symbol to count + + Returns: + float: ratio of symbols to words + """ + n_symbol = span.text.count(symbol) + return n_symbol / len([t for t in span if not (t.is_space or t.is_punct)]) + + +def duplicate_ngram_fraction( + span: Span, + ngram_range: Tuple[int, int], +) -> Dict[int, float]: + """calculates the character fraction of duplicate n-gram over the overall text, + taking care not to count overlapping n-grams twice. + + Args: + span (Span): spaCy span object + ngram_range (Tuple[int, int], optional): The n-gram range. + + Returns: + Dict[int, float]: the fraction of duplicate characters for each + n-gram size + """ + lower, upper = ngram_range + + # calcuate maximum chr. limits according to thresholds + ngrams = defaultdict(set) + duplicate_char = defaultdict(int) + minmax = defaultdict(lambda: [0, 0]) + max_len = len(span) + + for i, _ in enumerate(span): + for ngram_size in range(lower, upper + 1): + + min_, max_ = minmax[ngram_size] + end = i + ngram_size + + if end < max_len: + span = span[i:end] + ngram = span.text.lower() # create n-gram from span + + if ngram in ngrams[ngram_size]: + # if it doesn't overlap with other ngrams of the same size + # update + if span.start_char > max_: + duplicate_char[ngram_size] += max_ - min_ + minmax[ngram_size] = [span.start_char, span.end_char] + else: + # extend range of duplicates + minmax[ngram_size][1] = span.end_char + else: + ngrams[ngram_size].add(ngram) + + # empty buffer for of duplicate chr. which have yet to be added. + for ngram_size in range(lower, upper + 1): + min_, max_ = minmax[ngram_size] + duplicate_char[ngram_size] += max_ - min_ + + return duplicate_char + + +def n_gram_counter(span: Span, ngram_range: Tuple[int, int]) -> Dict[str, Counter]: + """Calculate the counts of n-grams in the specified range. + Args: + span (Span): spaCy span object + ngram_range (Tuple[int, int]): The n-gram range. + Returns: + Dict[str, Counter]: Dict with str keys and Counter values. A dictionary + containing the counts of n-grams for a specific n. + """ + max_len = len(span) + lower, upper = ngram_range + shingles_count = defaultdict(Counter) + for i, _ in enumerate(span): + for ngram_size in range(lower, upper + 1): + end = i + ngram_size + if end < max_len: + span = span[i:end] + shingles_count[ngram_size][span.text.lower()] += 1 + return shingles_count + + +def top_ngram_chr_fraction( + span: Span, + ngram_range: Tuple[int, int], +) -> float: + """Calculated whether the character fraction of the top n-grams is below the + given thresholds + + Args: + span (Span): spaCy span object + ngram_range (Tuple[int, int], optional): Range of n grams to examine. + + Returns: + float: The fraction of the top n-grams. + """ + ngram_counter = n_gram_counter(span, ngram_range=ngram_range) + top_ngram_chr_frac = {} + for n in zip(ngram_counter): + ngram, count = ngram_counter[n].most_common(1)[0] + top_ngram_chr_frac[n] = len(ngram) * count / span._.chr_len + top_ngram_chr_frac = len(ngram) * count / span._.chr_len + + +def contains_string(span: Span, string: str) -> bool: + """Check if a span contains a string. 
+ + Args: + span (Span): spaCy span object + string (str): string to check for + + Returns: + bool: True if span contains string + """ + return string in span.text + + +class Quality: + """spaCy component for adding text quality metrics to the `Doc` and `Span` objects. + Extracts metrics and returns them as a dictionary as the ._.quality attribute. + """ + + def __init__( # pylint: disable=dangerous-default-value + self, + nlp: Language, + name: str, + symbols: List[str] = ["#"], + contains=["lorem ipsum"], + duplicate_n_gram_fraction_range: Tuple[int] = [5, 10], + force: bool = False, + ): # noqa: D107 + """Initialise components""" + self.name = name + self.force = force + + duplicate_lines_chr_fraction = partial( + duplicate_chr_fraction_getter, attr="lines_counter" + ) + duplicate_paragraph_chr_fraction = partial( + duplicate_chr_fraction_getter, attr="paragraphs_counter" + ) + + self.extensions = { + "lines": lambda span: span.text.split("\n"), + "paragrahs": lambda span: span.text.split("\n\n"), + "lines_counter": lambda span: Counter(span._.lines), + "paragraphs_counter": lambda span: Counter(span._.paragraphs), + "chr_len": lambda span: len(span.text), + } + + self.getters = { + # heuristic quality filters + "n_stop_words": n_stop_words, + "alpha_ratio": alpha_ratio, + "mean_word_length": mean_word_length, + "proportion_ellipsis": proportion_ellipsis, + "proportion_bullet_points": proportion_bullet_points, + # text repetition + "duplicate_lines_chr_fraction": duplicate_lines_chr_fraction, + "duplicate_paragraph_chr_fraction": duplicate_paragraph_chr_fraction, + "duplicate_ngram_chr_fraction": partial( + duplicate_ngram_fraction, ngram_range=duplicate_n_gram_fraction_range + ), + } + # add symbol to word ratio + for symbol in symbols: + self.getters[f"symbol_{symbol}_2_word_ratio"] = partial( + symbol_2_word_ratio, symbol=symbol + ) + # add contains + for string in contains: + self.getters[f"contains_{string}"] = partial(contains_string, string=string) + + self.set_extensions() + + if not Span.has_extension("quality") or force: + Span.set_extension("quality", getter=self.quality_getter, force=force) + if not Doc.has_extension("quality") or force: + Doc.set_extension( + "quality", + getter=span_getter_to_doc_getter(self.quality_getter), + force=force, + ) + + def __call__(self, doc: Doc): + """Run the pipeline component""" + return doc + + def quality_getter(self, span: Span) -> Dict[str, Union[float, int, bool]]: + """Apply quality functions to doc + + Args: + span (Span): spaCy span object + + Returns: + Dict[str, Union[float, int, bool]]: dictionary of quality metrics + """ + quality = {} + for name, getter in self.getters.items(): + if name == "top_ngram_chr_fraction": + chr_frac = getter(span) + for n_gram, frac in chr_frac.items(): + quality[f"{n_gram}_gram_chr_fraction"] = frac + if name == "duplicate_ngram_chr_fraction": + chr_frac = getter(span) + for n_gram, frac in chr_frac.items(): + quality[f"{n_gram}_gram_duplicate_chr_fraction"] = frac + + quality[name] = getter(span) + return quality + + def set_extensions(self): + """Set required extensions.""" + + for ext_name, span_getter in self.extensions.items(): + doc_getter = span_getter_to_doc_getter(span_getter) + + if not Span.has_extension(ext_name) or self.force is True: + Span.set_extension(ext_name, getter=span_getter) + if not Doc.has_extension(ext_name) or self.force is True: + Doc.set_extension(ext_name, getter=doc_getter) + + +@Language.factory("quality") +def create_quality_component( + nlp: Language, name: str, 
force: bool = False +) -> Callable[[Doc], Doc]: + """Allows Quality to be added to a spaCy pipe using nlp.add_pipe("quality"). + + Set the following extensions: + - {Span/Doc}._.quality + - {Span/Doc}._.lines + - {Span/Doc}._.paragraphs + - {Span/Doc}._.lines_counter + - {Span/Doc}._.paragraphs_counter + - {Span/Doc}._.chr_len + + Where the last are used to calculate some of the quality metrics. The can be + overwritten if you e.g. wish lines to be split on "\\r\\n" instead of "\\n". + + A large part of the quality metrics were proposed by [1]. + + References: + [1] Rae, J. W., Borgeaud, S., Cai, T., Millican, K., Hoffmann, J., Song, F., ... & + Irving, G. (2021). Scaling language models: Methods, analysis & insights from + training gopher. arXiv preprint arXiv:2112.11446. + + + Args: + nlp (Language): spaCy language object + name (str): name of the component + + Returns: + Quality: the spaCy component + """ + return Quality(nlp, name=name, force=force) diff --git a/textdescriptives/components/utils.py b/textdescriptives/components/utils.py index 5dbe9730..d924dfbd 100644 --- a/textdescriptives/components/utils.py +++ b/textdescriptives/components/utils.py @@ -1,8 +1,10 @@ -"""Utility functions for calculating various text descriptives""" -from spacy.tokens import Doc, Span, Token -from pyphen import Pyphen +""" +Utility functions for calculating various text descriptives +""" +from typing import Any, Callable, Union -from typing import Union +from pyphen import Pyphen +from spacy.tokens import Doc, Span, Token def filtered_tokens(doc: Union[Doc, Span]): @@ -21,7 +23,7 @@ def n_sentences(doc: Union[Doc, Span]): def n_tokens(doc: Union[Doc, Span]): """Return number of words in the document.""" - return len(doc._._filtered_tokens) + return len(doc._._filtered_tokens) # pylint: disable=protected-access def n_syllables(doc: Doc): @@ -35,4 +37,45 @@ def count_syl(token: Token): word_hyphenated = dic.inserted(token.lower_) return max(1, word_hyphenated.count("-") + 1) - return [count_syl(token) for token in doc._._filtered_tokens] + return [ + count_syl(token) + for token in doc._._filtered_tokens # pylint: disable=protected-access + ] + + +def span_getter_to_token_getter( + span_getter: Callable[[Span], Any] +) -> Callable[[Token], Any]: + """Converts a span getter to a token getter. + + Args: + span_getter (Callable[[Span], Any]): + The span getter function. + + Returns: + Callable[[Token], Any]: The token getter function. + """ + + def token_getter(token): + return span_getter(token.doc[token.i : token.i + 1]) + + return token_getter + + +def span_getter_to_doc_getter( + span_getter: Callable[[Span], Any] +) -> Callable[[Doc], Any]: + """Converts a span getter to a document getter. + + Args: + span_getter (Callable[[Span], Any]): + The span getter function. + + Returns: + Callable[[Doc], Any]: The document getter function. 
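+
+    Example:
+        >>> # illustrative sketch: wrap a span-level getter for use on a Doc
+        >>> char_count = lambda span: len(span.text)
+        >>> doc_char_count = span_getter_to_doc_getter(char_count)
+        >>> # doc_char_count(doc) is then equivalent to char_count(doc[:])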
+ """ + + def doc_getter(doc): + return span_getter(doc[:]) + + return doc_getter From a1cdd3b166dfacec5524b1bd65de284e4a544bdb Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Tue, 20 Sep 2022 16:36:34 +0200 Subject: [PATCH 02/11] docs: minor additions to docs --- textdescriptives/components/quality.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/textdescriptives/components/quality.py b/textdescriptives/components/quality.py index f603cfa7..4592353f 100644 --- a/textdescriptives/components/quality.py +++ b/textdescriptives/components/quality.py @@ -365,13 +365,16 @@ def create_quality_component( Where the last are used to calculate some of the quality metrics. The can be overwritten if you e.g. wish lines to be split on "\\r\\n" instead of "\\n". - A large part of the quality metrics were proposed by [1]. + A large part of the quality metrics were proposed by [1] and [2] for filtering + out low quality text from large text corpora. References: [1] Rae, J. W., Borgeaud, S., Cai, T., Millican, K., Hoffmann, J., Song, F., ... & Irving, G. (2021). Scaling language models: Methods, analysis & insights from training gopher. arXiv preprint arXiv:2112.11446. - + [2] Raffel, C., Shazeer, N., Roberts, A., Lee, K., Narang, S., Matena, M., ... & + Liu, P. J. (2020). Exploring the limits of transfer learning with a unified + text-to-text transformer. J. Mach. Learn. Res., 21(140), 1-67. Args: nlp (Language): spaCy language object From 6231433b011e4047cf86590d62270928300e6a4f Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Wed, 21 Sep 2022 15:37:33 +0200 Subject: [PATCH 03/11] refactor: removed all but one set of defaults. --- textdescriptives/components/quality.py | 72 +++++++++++++++++++------- 1 file changed, 54 insertions(+), 18 deletions(-) diff --git a/textdescriptives/components/quality.py b/textdescriptives/components/quality.py index 4592353f..cc35ed33 100644 --- a/textdescriptives/components/quality.py +++ b/textdescriptives/components/quality.py @@ -1,4 +1,4 @@ -"""Calculation of various readability metrics.""" +"""Component for calculating quality metrics.""" from collections import Counter, defaultdict from functools import partial from typing import Callable, Dict, List, Tuple, Union @@ -251,9 +251,10 @@ def __init__( # pylint: disable=dangerous-default-value self, nlp: Language, name: str, - symbols: List[str] = ["#"], - contains=["lorem ipsum"], - duplicate_n_gram_fraction_range: Tuple[int] = [5, 10], + symbols: List[str], + contains: List[str], + top_ngram_range: Tuple[int, int], + duplicate_n_gram_fraction_range: Tuple[int, int], force: bool = False, ): # noqa: D107 """Initialise components""" @@ -268,11 +269,11 @@ def __init__( # pylint: disable=dangerous-default-value ) self.extensions = { - "lines": lambda span: span.text.split("\n"), - "paragrahs": lambda span: span.text.split("\n\n"), - "lines_counter": lambda span: Counter(span._.lines), - "paragraphs_counter": lambda span: Counter(span._.paragraphs), - "chr_len": lambda span: len(span.text), + "_lines": lambda span: span.text.split("\n"), + "_paragrahs": lambda span: span.text.split("\n\n"), + "_lines_counter": lambda span: Counter(span._.lines), + "_paragraphs_counter": lambda span: Counter(span._.paragraphs), + "_chr_len": lambda span: len(span.text), } self.getters = { @@ -288,6 +289,9 @@ def __init__( # pylint: disable=dangerous-default-value "duplicate_ngram_chr_fraction": partial( duplicate_ngram_fraction, ngram_range=duplicate_n_gram_fraction_range ), + "top_ngram_chr_fraction": 
partial( + top_ngram_chr_fraction, ngram_range=top_ngram_range + ), } # add symbol to word ratio for symbol in symbols: @@ -348,19 +352,34 @@ def set_extensions(self): Doc.set_extension(ext_name, getter=doc_getter) -@Language.factory("quality") +@Language.factory( + "quality", + default_config={ + "symbols": ["#"], + "contains": ["lorem ipsum"], + "top_ngram_range": [2, 4], + "duplicate_n_gram_fraction_range": [5, 10], + "force": False, + }, +) def create_quality_component( - nlp: Language, name: str, force: bool = False + nlp: Language, + name: str, + symbols: List[str], + contains: List[str], + top_ngram_range: Tuple[int, int], + duplicate_n_gram_fraction_range: Tuple[int, int], + force: bool, ) -> Callable[[Doc], Doc]: """Allows Quality to be added to a spaCy pipe using nlp.add_pipe("quality"). Set the following extensions: - {Span/Doc}._.quality - - {Span/Doc}._.lines - - {Span/Doc}._.paragraphs - - {Span/Doc}._.lines_counter - - {Span/Doc}._.paragraphs_counter - - {Span/Doc}._.chr_len + - {Span/Doc}._._lines + - {Span/Doc}._._paragraphs + - {Span/Doc}._._lines_counter + - {Span/Doc}._._paragraphs_counter + - {Span/Doc}._._chr_len Where the last are used to calculate some of the quality metrics. The can be overwritten if you e.g. wish lines to be split on "\\r\\n" instead of "\\n". @@ -379,8 +398,25 @@ def create_quality_component( Args: nlp (Language): spaCy language object name (str): name of the component + symbols (List[str]): list of symbols for which to calculate the + proportion the ratio of symbols to words. Defaults to ["#"]. + contains (List[str]): list of strings for which to check whether the + document contains them. Defaults to ["lorem ipsum"]. + top_ngram_range (Tuple[int]): range of n-grams to calculate the + proportion of the top n-gram. Defaults to [2, 4]. + duplicate_n_gram_fraction_range (Tuple[int]): range of n-grams to + calculate the proportion of duplicate n-grams. Defaults to [5, 10]. + force (bool): whether to overwrite existing extensions. Defaults to False. Returns: - Quality: the spaCy component + Callable[[Doc], Doc]: the spaCy component """ - return Quality(nlp, name=name, force=force) + return Quality( + nlp, + name=name, + symbols=symbols, + contains=contains, + top_ngram_range=top_ngram_range, + duplicate_n_gram_fraction_range=duplicate_n_gram_fraction_range, + force=force, + ) From 976c050e8a4db19a5e60bbb0e5fa2db3e656537e Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Thu, 22 Sep 2022 13:55:16 +0200 Subject: [PATCH 04/11] docs: added documentation for quality --- NEWS.md | 7 +++++-- README.md | 50 ++++++++++++++++++++++++------------------------ docs/quality.rst | 34 ++++++++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 27 deletions(-) create mode 100644 docs/quality.rst diff --git a/NEWS.md b/NEWS.md index 95d4f4fa..31adf6e4 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,10 @@ # News -## V1.0.7 - May 4, 2022 +## v1.1.0 - 21st of September, 2022 +- Added the new pipe; "quality". This pipe implements a series of metrics related to text quality, some of which were used by Rae et al. (2021) and Raffel et al. (2020) to filter large text corpora. + +## v1.0.7 - 4th May, 2022 - Some minor fixes and bells and whistles. -## V1.0.5 - October 4, 2021 +## v1.0.5 - 4th October, 2021 - POS proportions now use `pos_` instead of `tag_` by default. This behavior can be changed by setting `use_tag` to `False` when initialising the `pos_stats` module. 
\ No newline at end of file diff --git a/README.md b/README.md index 0a2da16a..b3a5a688 100644 --- a/README.md +++ b/README.md @@ -43,9 +43,9 @@ TextDescriptives includes convenience functions for extracting metrics to a Pand td.extract_df(doc) # td.extract_dict(doc) ``` -| | text | token_length_mean | token_length_median | token_length_std | sentence_length_mean | sentence_length_median | sentence_length_std | syllables_per_token_mean | syllables_per_token_median | syllables_per_token_std | n_tokens | n_unique_tokens | proportion_unique_tokens | n_characters | n_sentences | flesch_reading_ease | flesch_kincaid_grade | smog | gunning_fog | automated_readability_index | coleman_liau_index | lix | rix | dependency_distance_mean | dependency_distance_std | prop_adjacent_dependency_relation_mean | prop_adjacent_dependency_relation_std | pos_prop_DT | pos_prop_NN | pos_prop_VBZ | pos_prop_VBN | pos_prop_. | pos_prop_PRP | pos_prop_VBP | pos_prop_IN | pos_prop_RB | pos_prop_VBD | pos_prop_, | pos_prop_WP | -|---:|:------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------:|----------------------:|-------------------:|-----------------------:|-------------------------:|----------------------:|---------------------------:|-----------------------------:|--------------------------:|-----------:|------------------:|---------------------------:|---------------:|--------------:|----------------------:|-----------------------:|--------:|--------------:|------------------------------:|---------------------:|--------:|------:|---------------------------:|--------------------------:|-----------------------------------------:|----------------------------------------:|--------------:|--------------:|---------------:|---------------:|-------------:|---------------:|---------------:|--------------:|--------------:|---------------:|-------------:|--------------:| -| 0 | The world (...) | 3.28571 | 3 | 1.54127 | 7 | 6 | 3.09839 | 1.08571 | 1 | 0.368117 | 35 | 23 | 0.657143 | 121 | 5 | 107.879 | -0.0485714 | 5.68392 | 3.94286 | -2.45429 | -0.708571 | 12.7143 | 0.4 | 1.69524 | 0.422282 | 0.44381 | 0.0863679 | 0.097561 | 0.121951 | 0.0487805 | 0.0487805 | 0.121951 | 0.170732 | 0.121951 | 0.121951 | 0.0731707 | 0.0243902 | 0.0243902 | 0.0243902 | +| | text | token_length_mean | token_length_median | token_length_std | sentence_length_mean | sentence_length_median | sentence_length_std | syllables_per_token_mean | syllables_per_token_median | syllables_per_token_std | n_tokens | n_unique_tokens | proportion_unique_tokens | n_characters | n_sentences | flesch_reading_ease | flesch_kincaid_grade | smog | gunning_fog | automated_readability_index | coleman_liau_index | lix | rix | dependency_distance_mean | dependency_distance_std | prop_adjacent_dependency_relation_mean | prop_adjacent_dependency_relation_std | pos_prop_DT | pos_prop_NN | pos_prop_VBZ | pos_prop_VBN | pos_prop_. 
| pos_prop_PRP | pos_prop_VBP | pos_prop_IN | pos_prop_RB | pos_prop_VBD | pos_prop_, | pos_prop_WP | +| ---: | :--------------- | ----------------: | ------------------: | ---------------: | -------------------: | ---------------------: | ------------------: | -----------------------: | -------------------------: | ----------------------: | -------: | --------------: | -----------------------: | -----------: | ----------: | ------------------: | -------------------: | ------: | ----------: | --------------------------: | -----------------: | ------: | ---: | -----------------------: | ----------------------: | -------------------------------------: | ------------------------------------: | ----------: | ----------: | -----------: | -----------: | ---------: | -----------: | -----------: | ----------: | ----------: | -----------: | ---------: | ----------: | +| 0 | The world (...) | 3.28571 | 3 | 1.54127 | 7 | 6 | 3.09839 | 1.08571 | 1 | 0.368117 | 35 | 23 | 0.657143 | 121 | 5 | 107.879 | -0.0485714 | 5.68392 | 3.94286 | -2.45429 | -0.708571 | 12.7143 | 0.4 | 1.69524 | 0.422282 | 0.44381 | 0.0863679 | 0.097561 | 0.121951 | 0.0487805 | 0.0487805 | 0.121951 | 0.170732 | 0.121951 | 0.121951 | 0.0731707 | 0.0243902 | 0.0243902 | 0.0243902 | Set which group(s) of metrics you want to extract using the `metrics` parameter (one or more of `readability`, `dependency_distance`, `descriptive_stats`, `pos_stats`, defaults to `all`) @@ -56,10 +56,10 @@ docs = nlp.pipe(['The world is changed. I feel it in the water. I feel it in the td.extract_df(docs, metrics="dependency_distance") ``` -| | text | dependency_distance_mean | dependency_distance_std | prop_adjacent_dependency_relation_mean | prop_adjacent_dependency_relation_std | -|---:|:------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------:|--------------------------:|-----------------------------------------:|----------------------------------------:| -| 0 | The world (...) | 1.69524 | 0.422282 | 0.44381 | 0.0863679 | -| 1 | He felt (...) | 2.56 | 0 | 0.44 | 0 | +| | text | dependency_distance_mean | dependency_distance_std | prop_adjacent_dependency_relation_mean | prop_adjacent_dependency_relation_std | +| ---: | :-------------- | -----------------------: | ----------------------: | -------------------------------------: | ------------------------------------: | +| 0 | The world (...) | 1.69524 | 0.422282 | 0.44381 | 0.0863679 | +| 1 | He felt (...) | 2.56 | 0 | 0.44 | 0 | The `text` column can by exluded by setting `include_text` to `False`. @@ -76,28 +76,28 @@ docs = nlp.pipe(['Da jeg var atten, tog jeg patent på ild. 
Det skulle senere vi td.extract_df(docs, include_text = False) ``` -| | token_length_mean | token_length_median | token_length_std | sentence_length_mean | sentence_length_median | sentence_length_std | syllables_per_token_mean | syllables_per_token_median | syllables_per_token_std | n_tokens | n_unique_tokens | proportion_unique_tokens | n_characters | n_sentences | -|---:|--------------------:|----------------------:|-------------------:|-----------------------:|-------------------------:|----------------------:|---------------------------:|-----------------------------:|--------------------------:|-----------:|------------------:|---------------------------:|---------------:|--------------:| -| 0 | 4.4 | 3 | 2.59615 | 10 | 10 | 1 | 1.65 | 1 | 0.852936 | 20 | 19 | 0.95 | 90 | 2 | -| 1 | 4 | 3.5 | 2.44949 | 6 | 6 | 3 | 1.58333 | 1 | 0.862007 | 12 | 12 | 1 | 53 | 2 | +| | token_length_mean | token_length_median | token_length_std | sentence_length_mean | sentence_length_median | sentence_length_std | syllables_per_token_mean | syllables_per_token_median | syllables_per_token_std | n_tokens | n_unique_tokens | proportion_unique_tokens | n_characters | n_sentences | +| ---: | ----------------: | ------------------: | ---------------: | -------------------: | ---------------------: | ------------------: | -----------------------: | -------------------------: | ----------------------: | -------: | --------------: | -----------------------: | -----------: | ----------: | +| 0 | 4.4 | 3 | 2.59615 | 10 | 10 | 1 | 1.65 | 1 | 0.852936 | 20 | 19 | 0.95 | 90 | 2 | +| 1 | 4 | 3.5 | 2.44949 | 6 | 6 | 3 | 1.58333 | 1 | 0.862007 | 12 | 12 | 1 | 53 | 2 | ## Available attributes The table below shows the metrics included in TextDescriptives and their attributes on spaCy's `Doc`, `Span`, and `Token` objects. For more information, see the [docs](https://hlasse.github.io/TextDescriptives/). -| Attribute | Component | Description | -| -------------------- | -------------------------- | ------------------------------------------------------------- | -| `Doc._.token_length` | `descriptive_stats` | Dict containing mean, median, and std of token length. | -| `Doc._.sentence_length` | `descriptive_stats` | Dict containing mean, median, and std of sentence length. | -| `Doc._.syllables` | `descriptive_stats` | Dict containing mean, median, and std of number of syllables per token. | -| `Doc._.counts` | `descriptive_stats` | Dict containing the number of tokens, number of unique tokens, proportion unique tokens, and number of characters in the Doc.| -| `Doc._.pos_proportions` | `pos_stats` | Dict of `{pos_prop_POSTAG: proportion of all tokens tagged with POSTAG}`. Does not create a key if no tokens in the document fit the POSTAG. | -| `Doc._.readability` | `readability` | Dict containing Flesch Reading Ease, Flesch-Kincaid Grade, SMOG, Gunning-Fog, Automated Readability Index, Coleman-Liau Index, LIX, and RIX readability metrics for the Doc. | -| `Doc._.dependency_distance` | `dependency_distance` | Dict containing the mean and standard deviation of the dependency distance and proportion adjacent dependency relations in the Doc. | -| `Span._.token_length` | `descriptive_stats` | Dict containing mean, median, and std of token length in the span. | -| `Span._.counts` | `descriptive_stats` | Dict containing the number of tokens, number of unique tokens, proportion unique tokens, and number of characters in the span. 
| -| `Span._.pos_proportions` | `pos_stats` | Dict of `{pos_prop_POSTAG: proportion of all tokens tagged with POSTAG}`. Does not create a key if no tokens in the span fit the POSTAG. | -| `Span._.dependency_distance` | `dependency_distance` | Dict containing the mean dependency distance and proportion adjacent dependency relations in the Doc. | -| `Token._.dependency_distance` | `dependency_distance` | Dict containing the dependency distance and whether the head word is adjacent for a Token. | +| Attribute | Component | Description | +| ----------------------------------- | --------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `Doc._.sentence_length` | `descriptive_stats` | Dict containing mean, median, and std of sentence length. | +| `Doc._.syllables` | `descriptive_stats` | Dict containing mean, median, and std of number of syllables per token. | +| `Doc._.readability` | `readability` | Dict containing Flesch Reading Ease, Flesch-Kincaid Grade, SMOG, Gunning-Fog, Automated Readability Index, Coleman-Liau Index, LIX, and RIX readability metrics for the Doc. | +| `{Doc/Span}._.dependency_distance` | `dependency_distance` | Dict containing the mean and standard deviation of the dependency distance and proportion adjacent dependency relations in the Doc. | +| `{Doc/Span}._.counts` | `descriptive_stats` | Dict containing the number of tokens, number of unique tokens, proportion unique tokens, and number of characters in the Doc/Span. | +| `{Doc/Span}._.pos_proportions` | `pos_stats` | Dict of `{pos_prop_POSTAG: proportion of all tokens tagged with POSTAG}`. Does not create a key if no tokens in the document fit the POSTAG. | +| `{Doc/Span}._.token_length` | `descriptive_stats` | Dict containing mean, median, and std of token length. | +| `{Doc/Span}._.quality` | `quality` | Dict a series of heuristic metrics related to text quality. Targeted at filtering out low-quality text. | +| `{Doc/Span}._.passed_quality_check` | `quality` | Boolean on whether the document or span passed threshold sets for quality checks. | + + +| `Token._.dependency_distance` | `dependency_distance` | Dict containing the dependency distance and whether the head word is adjacent for a Token. | diff --git a/docs/quality.rst b/docs/quality.rst new file mode 100644 index 00000000..1ef2a5f1 --- /dev/null +++ b/docs/quality.rst @@ -0,0 +1,34 @@ +Quality +-------------------- + +The :code:`quality` component adds the following quality metrics under the +:code:`._.quality`` attribute to :code:`Doc` and :code:`Span` objects. + +Heuristic quality metrics: + +* Number of stop words (:code:`n_stop_words``): The number of stop words in the document. +* Alpha Ratio (:code:`alpha_ratio`): Ratio of words containing at least one alphabetic characters. +* Mean word length (:code:`mean_word_length`): Mean/average word length. +* Proportion of ellipsis (:code:`proportion_ellipsis`): Proportion of lines in a documents which end with an ellipsis. +* Proportion of bullet points (:code:`proportion_bullet_points`): Proportion of lines in a documents which start with a bullet point. +* Symbol to word ratio (:code:`symbol_{symbol}_2_word_ratio`): Ratio of specified symbols to words, could e.g. include ratio of hashtags or curly brackets. +* Contains string (:code:`contains_{string}`): Whether the document contains a specified string. For instance documents containing the string "lorem ipsum". 
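+
+For example, a minimal sketch of enabling the component and reading one of the
+metrics (assuming an installed spaCy English pipeline; the :code:`contains`
+value shown is the component's default):
+
+.. code-block:: python
+
+   import spacy
+   import textdescriptives  # noqa: F401
+
+   nlp = spacy.load("en_core_web_sm")
+   nlp.add_pipe("quality", config={"contains": ["lorem ipsum"]})
+   doc = nlp("lorem ipsum dolor sit amet")
+   doc._.quality["contains_lorem ipsum"]  # True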
+
+Repetitious text metrics:
+
+* Duplicate lines character fraction (:code:`duplicate_lines_chr_fraction`): Fraction of characters in a document which are contained within duplicate lines.
+* Duplicate paragraphs character fraction (:code:`duplicate_paragraph_chr_fraction`): Fraction of characters in a document which are contained within duplicate paragraphs.
+* Duplicate n-gram character fraction (:code:`duplicate_{n}_gram_chr_fraction`): Fraction of characters in a document which are contained within duplicate n-grams, for a specified n-gram range.
+* Top n-gram character fraction (:code:`top_{n}_gram_chr_fraction`): Fraction of characters in a document which are contained within the top n-grams, for a specified n-gram range.
+
+
+These quality metrics were for example used by
+`Rae et al. (2021) <https://arxiv.org/abs/2112.11446>`__ and
+`Raffel et al. (2020) <https://arxiv.org/abs/1910.10683>`__ to filter large text
+corpora for pre-training language models.
+
+
+Quality Component
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: textdescriptives.components.quality.create_quality_component

From 9e2940d7ceed68a45e641ab27f487e5c4a65ebd9 Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen
Date: Sun, 25 Sep 2022 16:16:09 +0200
Subject: [PATCH 05/11] style: Added style dependencies to requirements.txt

---
 requirements.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index ec06e650..48ed166f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,3 +8,8 @@ ftfy>=6.0.3,<6.2.0
 pytest>=7.1.3,<7.2.0
 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz
 en_core_web_sm==3.2.0
+
+# style
+flake8
+black
+isort
\ No newline at end of file

From b70f85088f25437971a9c6bef6fe082675ed90f5 Mon Sep 17 00:00:00 2001
From: Kenneth Enevoldsen
Date: Sun, 25 Sep 2022 16:16:42 +0200
Subject: [PATCH 06/11] test: Added tests to quality filter

---
 textdescriptives/__init__.py            |  18 +-
 textdescriptives/components/__init__.py |   9 +-
 textdescriptives/components/quality.py  | 402 ++++++++++++++++--------
 textdescriptives/tests/test_quality.py  | 251 +++++++++++++++
 4 files changed, 542 insertions(+), 138 deletions(-)
 create mode 100644 textdescriptives/tests/test_quality.py

diff --git a/textdescriptives/__init__.py b/textdescriptives/__init__.py
index a11e0955..994e3bfe 100644
--- a/textdescriptives/__init__.py
+++ b/textdescriptives/__init__.py
@@ -1,16 +1,16 @@
-from .load_components import TextDescriptives
-from .components import (
-    DescriptiveStatistics,
-    Readability,
+from .about import __title__, __version__  # noqa: F401
+from .components import (  # noqa: F401
     DependencyDistance,
+    DescriptiveStatistics,
     POSStatistics,
+    Quality,
+    Readability,
 )
-from .dataframe_extract import (
+from .dataframe_extract import (  # noqa: F401
+    dependency_cols,
+    descriptive_stats_cols,
     extract_df,
     extract_dict,
     readability_cols,
-    dependency_cols,
-    descriptive_stats_cols,
 )
-
-from .about import __version__, __title__
+from .load_components import TextDescriptives  # noqa: F401

diff --git a/textdescriptives/components/__init__.py b/textdescriptives/components/__init__.py
index 4647fd11..e48e9620 100644
--- a/textdescriptives/components/__init__.py
+++ b/textdescriptives/components/__init__.py
@@ -1,4 +1,5 @@
-from .readability import Readability
-from .dependency_distance import DependencyDistance
-from .descriptive_stats import DescriptiveStatistics
-from .pos_stats import POSStatistics
+from .dependency_distance import DependencyDistance  # noqa: F401
+from .descriptive_stats import
DescriptiveStatistics # noqa: F401 +from .pos_stats import POSStatistics # noqa: F401 +from .quality import Quality # noqa: F401 +from .readability import Readability # noqa: F401 diff --git a/textdescriptives/components/quality.py b/textdescriptives/components/quality.py index cc35ed33..3fc74810 100644 --- a/textdescriptives/components/quality.py +++ b/textdescriptives/components/quality.py @@ -1,7 +1,7 @@ """Component for calculating quality metrics.""" from collections import Counter, defaultdict from functools import partial -from typing import Callable, Dict, List, Tuple, Union +from typing import Callable, Dict, List, Optional, Tuple, Union import numpy as np from spacy.language import Language @@ -9,6 +9,28 @@ from textdescriptives.components.utils import span_getter_to_doc_getter +DEFAULT_QUALITY_THRESHOLDS = { + "n_stop_words": (2, None), + "alpha_ratio": (0.8, None), + "mean_word_length": (3, 10), + "doc_length": (10, 100_000), + "symbol_#_2_word_ratio": (None, 0.1), + "proportion_ellipsis": (None, 0.3), + "proportion_bullet_points": (None, 0.8), + "duplicate_line_chr_fraction": (None, 0.2), + "duplicate_paragraph_chr_fraction": (None, 0.2), + "duplicate_5-gram_chr_fraction": (None, 0.15), + "duplicate_6-gram_chr_fraction": (None, 0.14), + "duplicate_7-gram_chr_fraction": (None, 0.13), + "duplicate_8-gram_chr_fraction": (None, 0.12), + "duplicate_9-gram_chr_fraction": (None, 0.11), + "duplicate_10-gram_chr_fraction": (None, 0.1), + "top_2-gram_chr_fraction": (None, 0.20), + "top_3-gram_chr_fraction": (None, 0.18), + "top_4-gram_chr_fraction": (None, 0.16), + "contains_lorem ipsum": False, +} + def n_stop_words(span: Span) -> int: """Count the number of stop words in a document. @@ -19,7 +41,7 @@ def n_stop_words(span: Span) -> int: Returns: int: number of stop words """ - return len([t for t in span if t.is_stop]) + return sum(t.is_stop for t in span) def mean_word_length(span: Span) -> float: @@ -31,7 +53,10 @@ def mean_word_length(span: Span) -> float: Returns: float: mean word length """ - return np.mean([len(t) for t in span]) + tokens_lengths = [len(t) for t in span] + if tokens_lengths: + return float(np.mean(tokens_lengths)) + return 0.0 def alpha_ratio(span: Span) -> float: @@ -51,7 +76,10 @@ def contains_alpha(token): return True return False - return np.mean([contains_alpha(t) for t in span]) + token_contains_alpha = [contains_alpha(token) for token in span] + if token_contains_alpha: + return float(np.mean(token_contains_alpha)) + return 0.0 def proportion_bullet_points( # pylint: disable=dangerous-default-value @@ -65,8 +93,17 @@ def proportion_bullet_points( # pylint: disable=dangerous-default-value Returns: float: proportion of bullet points """ - lines = span._.lines - return np.mean([line.strip().startswith(bullet_point) for line in lines]) + # check if has extension _lines + if not hasattr(span._, "lines"): + lines = span.text.split("\n") + else: + lines = span._.lines + line_starts_with_bullet = [ + line.strip().startswith(tuple(bullet_point)) for line in lines + ] + if line_starts_with_bullet: + return float(np.mean(line_starts_with_bullet)) + return 0.0 def proportion_ellipsis( # pylint: disable=dangerous-default-value @@ -76,45 +113,93 @@ def proportion_ellipsis( # pylint: disable=dangerous-default-value Args: span (Span): spaCy span object + ellipsis (set): set of ellipsis Returns: float: proportion of ellipsis """ - lines = span._.lines - return np.mean([line.strip().endswith(ellipsis) for line in lines]) + if not hasattr(span._, "lines"): + lines = 
span.text.split("\n") + else: + lines = span._.lines + + line_ends_with_ellipsis = [line.strip().endswith(tuple(ellipsis)) for line in lines] + if line_ends_with_ellipsis: + return float(np.mean(line_ends_with_ellipsis)) + return 0.0 -def duplicate_line_fraction(span: Span) -> float: - """Calculate the proportion of of characters within duplicate lines. +def get_ranges(arr: np.ndarray) -> List[Tuple[int, int]]: + """Get true ranges from boolean array, i.e. + + Example: + >>> get_ranges(np.array([0, 1, 1, 0, 0, 1, 1])) + [(1, 3), (5, 7)] + """ + ranges = [] + start = None + for i, val in enumerate(arr): + if (val and start) is None: + start = i + elif not val and start is not None: + ranges.append((start, i)) + start = None + if start is not None: + ranges.append((start, len(arr))) + return ranges + + +def duplicate_paragraph_chr_fraction(span: Span) -> float: + """Calculate the character fraction of duplicate paragraphs. Args: span (Span): spaCy span object Returns: - float: proportion of characters within a duplicate lines + float: The fraction of duplicate characters. """ - lines = span._.lines - unique_lines = set(lines) - return 1 - len(unique_lines) / len(lines) + chr_len = len(span.text) + if chr_len == 0: + return 0.0 + + if not hasattr(span._, "paragraphs"): + paragraphs = span.text.split("\n\n") + else: + paragraphs = span._.paragraphs + paragraph_counter = Counter(paragraphs) + duplicate_chr = 0 + for t, c in paragraph_counter.items(): + if c > 1: + duplicate_chr += len(t) * (c - 1) + frac = duplicate_chr / chr_len + return frac + + +def duplicate_line_chr_fraction(span: Span) -> float: + """Calculate the character fraction of duplicate lines. -def duplicate_chr_fraction_getter(doc: Doc, attr: str) -> float: - """Calculate the character fraction of duplicates based on a counter object. Args: - doc (Doc): - A spaCy Doc. - attr (str): - The document attribute to extract. + span (Span): spaCy span object + Returns: - float: - The fraction of duplicate characters. + float: The fraction of duplicate characters. """ - counter = getattr(doc._, attr) + chr_len = len(span.text) + if chr_len == 0: + return 0.0 + + if not hasattr(span._, "lines"): + lines = span.text.split("\n") + else: + lines = span._.lines + line_counter = Counter(lines) + duplicate_chr = 0 - for t, c in counter.items(): + for t, c in line_counter.items(): if c > 1: duplicate_chr += len(t) * (c - 1) - frac = duplicate_chr / doc._.chr_len + frac = duplicate_chr / chr_len return frac @@ -130,86 +215,86 @@ def symbol_2_word_ratio(span: Span, symbol: str) -> float: float: ratio of symbols to words """ n_symbol = span.text.count(symbol) - return n_symbol / len([t for t in span if not (t.is_space or t.is_punct)]) + n_words = sum(not (t.is_space or t.is_punct) for t in span) + if n_words: + return n_symbol / n_words + return 0.0 -def duplicate_ngram_fraction( - span: Span, - ngram_range: Tuple[int, int], -) -> Dict[int, float]: - """calculates the character fraction of duplicate n-gram over the overall text, - taking care not to count overlapping n-grams twice. +def span_ngrams(span: Span, ngram_range: Tuple[int, int]) -> Dict[str, Counter]: + """Calculate the counts of n-grams in the specified range. Args: span (Span): spaCy span object - ngram_range (Tuple[int, int], optional): The n-gram range. + ngram_range (Tuple[int, int]): The n-gram range. 
Returns: - Dict[int, float]: the fraction of duplicate characters for each - n-gram size + Dict[int, Dict[str, int, List[Span]]]: A dictionary that for each n in the ngram + range contains the counts of the n-grams as well as the spans of the + n-grams. """ - lower, upper = ngram_range - - # calcuate maximum chr. limits according to thresholds - ngrams = defaultdict(set) - duplicate_char = defaultdict(int) - minmax = defaultdict(lambda: [0, 0]) max_len = len(span) - + lower, upper = ngram_range + shingles_count = { + n: defaultdict(lambda: {"count": 0, "span": []}) + for n in range(lower, upper + 1) + } for i, _ in enumerate(span): for ngram_size in range(lower, upper + 1): - - min_, max_ = minmax[ngram_size] end = i + ngram_size + if not end > max_len: + ngram_span = span[i:end] + ngram = ngram_span.text + shingles_count[ngram_size][ngram]["count"] += 1 + shingles_count[ngram_size][ngram]["span"].append(ngram_span) + return shingles_count - if end < max_len: - span = span[i:end] - ngram = span.text.lower() # create n-gram from span - - if ngram in ngrams[ngram_size]: - # if it doesn't overlap with other ngrams of the same size - # update - if span.start_char > max_: - duplicate_char[ngram_size] += max_ - min_ - minmax[ngram_size] = [span.start_char, span.end_char] - else: - # extend range of duplicates - minmax[ngram_size][1] = span.end_char - else: - ngrams[ngram_size].add(ngram) - - # empty buffer for of duplicate chr. which have yet to be added. - for ngram_size in range(lower, upper + 1): - min_, max_ = minmax[ngram_size] - duplicate_char[ngram_size] += max_ - min_ - - return duplicate_char +def duplicate_ngram_fraction( + span: Span, + ngram_range: Tuple[int, int], +) -> Dict[int, float]: + """calculates the character fraction of duplicate n-gram over the overall text, + taking care not to count overlapping n-grams twice. This does not include spaces + between the n-grams. -def n_gram_counter(span: Span, ngram_range: Tuple[int, int]) -> Dict[str, Counter]: - """Calculate the counts of n-grams in the specified range. Args: span (Span): spaCy span object - ngram_range (Tuple[int, int]): The n-gram range. + ngram_range (Tuple[int, int], optional): The n-gram range. + Returns: - Dict[str, Counter]: Dict with str keys and Counter values. A dictionary - containing the counts of n-grams for a specific n. 
+ Dict[int, float]: the fraction of duplicate characters for each + n-gram size """ max_len = len(span) - lower, upper = ngram_range - shingles_count = defaultdict(Counter) - for i, _ in enumerate(span): - for ngram_size in range(lower, upper + 1): - end = i + ngram_size - if end < max_len: - span = span[i:end] - shingles_count[ngram_size][span.text.lower()] += 1 - return shingles_count + chr_len = len(span.text) + if chr_len == 0: + return {n: 0.0 for n in range(ngram_range[0], ngram_range[1] + 1)} + shingles_count = span_ngrams(span, ngram_range) + duplicate_chr_fraction = {} + for ngram_size, ngrams in shingles_count.items(): + # create a boolean array of the same length as the text + # where True indicates that the token is a duplicate + is_duplicate = np.zeros(max_len, dtype=bool) + # set duplicate tokens to True + for ngram, count in ngrams.items(): + if count["count"] > 1: + for ngram_span in count["span"]: + is_duplicate[ngram_span.start : ngram_span.end] = True + + duplicate_chars = 0 + # get duplicate ranges from boolean array + for start, end in get_ranges(is_duplicate): + _span = span[start:end] + duplicate_chars += _span.end_char - _span.start_char + duplicate_chr_fraction[ngram_size] = duplicate_chars / chr_len + return duplicate_chr_fraction def top_ngram_chr_fraction( span: Span, ngram_range: Tuple[int, int], + min_count: int = 0, ) -> float: """Calculated whether the character fraction of the top n-grams is below the given thresholds @@ -217,16 +302,29 @@ def top_ngram_chr_fraction( Args: span (Span): spaCy span object ngram_range (Tuple[int, int], optional): Range of n grams to examine. + min_count (int): Minimum count of n-grams to before an n-gram is considered + a top n-gram. Defaults to 0. Returns: float: The fraction of the top n-grams. 
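+
+    Example:
+        >>> # illustrative: in "This is a test. This is a test. This is a test."
+        >>> # the top 2-gram is "This is" with 3 occurrences, so its character
+        >>> # fraction is len("This is") * 3 / 47 ≈ 0.45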
""" - ngram_counter = n_gram_counter(span, ngram_range=ngram_range) + chr_len = len(span.text) + if chr_len == 0: + return {n: 0.0 for n in range(ngram_range[0], ngram_range[1] + 1)} + + ngram_counter = span_ngrams(span, ngram_range=ngram_range) top_ngram_chr_frac = {} - for n in zip(ngram_counter): - ngram, count = ngram_counter[n].most_common(1)[0] - top_ngram_chr_frac[n] = len(ngram) * count / span._.chr_len - top_ngram_chr_frac = len(ngram) * count / span._.chr_len + for n in ngram_counter: + # find the top n-gram + ngram, count_span = max(ngram_counter[n].items(), key=lambda x: x[1]["count"]) + count = count_span["count"] + if count >= min_count: + # calculate the fraction of the top n-gram + top_ngram_chr_frac[n] = len(ngram) / chr_len + else: + top_ngram_chr_frac[n] = 0.0 + + return top_ngram_chr_frac def contains_string(span: Span, string: str) -> bool: @@ -254,43 +352,43 @@ def __init__( # pylint: disable=dangerous-default-value symbols: List[str], contains: List[str], top_ngram_range: Tuple[int, int], + top_ngram_min_count: int, duplicate_n_gram_fraction_range: Tuple[int, int], + quality_thresholds: Optional[ + Dict[str, Union[bool, Tuple[Optional[float], Optional[float]]]] + ] = None, force: bool = False, ): # noqa: D107 """Initialise components""" self.name = name self.force = force - - duplicate_lines_chr_fraction = partial( - duplicate_chr_fraction_getter, attr="lines_counter" - ) - duplicate_paragraph_chr_fraction = partial( - duplicate_chr_fraction_getter, attr="paragraphs_counter" - ) - - self.extensions = { - "_lines": lambda span: span.text.split("\n"), - "_paragrahs": lambda span: span.text.split("\n\n"), - "_lines_counter": lambda span: Counter(span._.lines), - "_paragraphs_counter": lambda span: Counter(span._.paragraphs), - "_chr_len": lambda span: len(span.text), - } + self.symbols = symbols + self.contains = contains + self.top_ngram_range = top_ngram_range + self.top_ngram_min_count = top_ngram_min_count + self.duplicate_n_gram_fraction_range = duplicate_n_gram_fraction_range + if quality_thresholds is None: + quality_thresholds = DEFAULT_QUALITY_THRESHOLDS + self.quality_thresholds = quality_thresholds self.getters = { # heuristic quality filters "n_stop_words": n_stop_words, "alpha_ratio": alpha_ratio, "mean_word_length": mean_word_length, + "doc_length": len, "proportion_ellipsis": proportion_ellipsis, "proportion_bullet_points": proportion_bullet_points, # text repetition - "duplicate_lines_chr_fraction": duplicate_lines_chr_fraction, + "duplicate_line_chr_fraction": duplicate_line_chr_fraction, "duplicate_paragraph_chr_fraction": duplicate_paragraph_chr_fraction, "duplicate_ngram_chr_fraction": partial( duplicate_ngram_fraction, ngram_range=duplicate_n_gram_fraction_range ), "top_ngram_chr_fraction": partial( - top_ngram_chr_fraction, ngram_range=top_ngram_range + top_ngram_chr_fraction, + ngram_range=top_ngram_range, + min_count=top_ngram_min_count, ), } # add symbol to word ratio @@ -302,16 +400,12 @@ def __init__( # pylint: disable=dangerous-default-value for string in contains: self.getters[f"contains_{string}"] = partial(contains_string, string=string) - self.set_extensions() + self.extensions = { + "passed_quality_check": self.passed_quality_thresholds, + "quality": self.quality_getter, + } - if not Span.has_extension("quality") or force: - Span.set_extension("quality", getter=self.quality_getter, force=force) - if not Doc.has_extension("quality") or force: - Doc.set_extension( - "quality", - getter=span_getter_to_doc_getter(self.quality_getter), - 
force=force, - ) + self.set_extensions() def __call__(self, doc: Doc): """Run the pipeline component""" @@ -331,15 +425,39 @@ def quality_getter(self, span: Span) -> Dict[str, Union[float, int, bool]]: if name == "top_ngram_chr_fraction": chr_frac = getter(span) for n_gram, frac in chr_frac.items(): - quality[f"{n_gram}_gram_chr_fraction"] = frac - if name == "duplicate_ngram_chr_fraction": + quality[f"top_{n_gram}-gram_chr_fraction"] = frac + elif name == "duplicate_ngram_chr_fraction": chr_frac = getter(span) for n_gram, frac in chr_frac.items(): - quality[f"{n_gram}_gram_duplicate_chr_fraction"] = frac - - quality[name] = getter(span) + quality[f"duplicate_{n_gram}-gram_chr_fraction"] = frac + else: + quality[name] = getter(span) return quality + def passed_quality_thresholds(self, span: Span) -> bool: + """ + Checks whether a span passed the quality thresholds + """ + quality = span._.quality + for name, threshold in self.quality_thresholds.items(): + if name not in quality: + raise KeyError(f"Quality metric {name} not found in doc._.quality") + if isinstance(threshold, bool): + if quality[name] != threshold: + return False + elif isinstance(threshold, tuple) and len(threshold) == 2: + if threshold[0] is not None and quality[name] < threshold[0]: + return False + if threshold[1] is not None and quality[name] > threshold[1]: + return False + else: + raise ValueError( + f"Quality threshold {name} is not a bool, or " + + f"Tuple of length 2, but {type(threshold)}." + ) + + return True + def set_extensions(self): """Set required extensions.""" @@ -358,31 +476,37 @@ def set_extensions(self): "symbols": ["#"], "contains": ["lorem ipsum"], "top_ngram_range": [2, 4], + "top_ngram_min_count": 3, "duplicate_n_gram_fraction_range": [5, 10], "force": False, + "quality_thresholds": None, }, ) -def create_quality_component( +def create_quality_component( # pylint: disable=dangerous-default-value nlp: Language, name: str, symbols: List[str], contains: List[str], top_ngram_range: Tuple[int, int], + top_ngram_min_count: int, duplicate_n_gram_fraction_range: Tuple[int, int], - force: bool, + quality_thresholds: Optional[ + Dict[str, Union[bool, Tuple[Optional[float], Optional[float]]]] + ] = None, + force: bool = False, ) -> Callable[[Doc], Doc]: """Allows Quality to be added to a spaCy pipe using nlp.add_pipe("quality"). Set the following extensions: - {Span/Doc}._.quality - - {Span/Doc}._._lines - - {Span/Doc}._._paragraphs - - {Span/Doc}._._lines_counter - - {Span/Doc}._._paragraphs_counter - - {Span/Doc}._._chr_len + - {Span/Doc}._.passed_quality_check + + It is also possible to optionally set the following extensions: + - {Span/Doc}._.lines + - {Span/Doc}._.paragraphs - Where the last are used to calculate some of the quality metrics. The can be - overwritten if you e.g. wish lines to be split on "\\r\\n" instead of "\\n". + These are used to calculate some of the quality metrics. They can be overwritten if + you e.g. wish lines to be split on "\\r\\n" instead of "\\n". A large part of the quality metrics were proposed by [1] and [2] for filtering out low quality text from large text corpora. @@ -404,12 +528,38 @@ def create_quality_component( document contains them. Defaults to ["lorem ipsum"]. top_ngram_range (Tuple[int]): range of n-grams to calculate the proportion of the top n-gram. Defaults to [2, 4]. + top_ngram_min_count (int): minimum number of times a n-gram must occur to + be considered a top n-gram. Defaults to 3. 
duplicate_n_gram_fraction_range (Tuple[int]): range of n-grams to calculate the proportion of duplicate n-grams. Defaults to [5, 10]. force (bool): whether to overwrite existing extensions. Defaults to False. + quality_thresholds (Dict[str, Union[bool, Tuple[Union[int, float, None], + Union[int, float, None]]]]): A dictionary of quality thresholds indicated by + either a range (Tuple), wherein the first value is the lower bound and the + second value is the upper bound. Lower and upper bounds can be None, in + which case they are not checked. Alternatively, a boolean can be provided, + checking if the quality metric is boolean. For example, if you don't want + documents containing `lorem ipsum`, to pass the quality check, you can set + `quality_thresholds={"contains_lorem_ipsum": False}`. Similar if you want to + set a upper bound on the `duplicate_5-gram_chr_fraction`, you can set + `quality_thresholds={"duplicate_5-gram_chr_fraction": (None, 0.15)}`. + Default values are set in + `textdescriptives.constants.DEFAULT_QUALITY_THRESHOLDS`. + Returns: Callable[[Doc], Doc]: the spaCy component + + Example: + >>> import spacy + >>> from spacy_quality import Quality + >>> nlp = spacy.blank(("en_core_web_sm") + >>> nlp.add_pipe("quality") + >>> doc = nlp("This is a test") + >>> # extract quality metrics + >>> doc._.quality + >>> # check whether the document passed the quality thresholds + >>> doc._.passed_quality_check """ return Quality( nlp, @@ -417,6 +567,8 @@ def create_quality_component( symbols=symbols, contains=contains, top_ngram_range=top_ngram_range, + top_ngram_min_count=top_ngram_min_count, duplicate_n_gram_fraction_range=duplicate_n_gram_fraction_range, + quality_thresholds=quality_thresholds, force=force, ) diff --git a/textdescriptives/tests/test_quality.py b/textdescriptives/tests/test_quality.py new file mode 100644 index 00000000..947b2ef9 --- /dev/null +++ b/textdescriptives/tests/test_quality.py @@ -0,0 +1,251 @@ +""" +Tests for the quality module. +""" + +from typing import List, Tuple + +import pytest +import spacy + +from textdescriptives.components.quality import ( + alpha_ratio, + duplicate_ngram_fraction, + mean_word_length, + n_stop_words, + proportion_bullet_points, + proportion_ellipsis, + symbol_2_word_ratio, + top_ngram_chr_fraction, +) + + +@pytest.fixture +def nlp(): + """Load a blank English model.""" + return spacy.blank("en") + + +@pytest.mark.parametrize( + "text, stop_words", + [ + ("", 0), + ("This is a test.", 3), + ("This is a test. This is a test.", 6), + ], +) +def test_n_stop_words(text: str, stop_words: int, nlp: spacy.Language): + """Test the n_stop_words function.""" + doc = nlp(text) + assert n_stop_words(doc) == stop_words + + +# test mean word length +@pytest.mark.parametrize( + "text, mean_length", + [ + ("", 0), + ("This is a test.", 2.4), + ("This is a test. This is a test.", 2.4), + ], +) +def test_mean_word_length(text: str, mean_length: float, nlp: spacy.Language): + """Test the mean_word_length function.""" + doc = nlp(text) + assert mean_word_length(doc) == mean_length + + +# test alpha ratio +@pytest.mark.parametrize( + "text, alpha", + [ + ("", 0), + ("This is a test.", 0.8), + ("This,, is a test. 12355 is €%&/( a <3.10 + assert abs(d[i] - j) < 0.01 + + +# test top ngram chr fraction +@pytest.mark.parametrize( + "text, top_ngram_chr_frac, ngram_range", + [ + ("", (0, 0), (2, 3)), + ("This is a test.", (0.466, 0.6), (2, 3)), + ("This is a test. This is a monkey", (0.437, 0.562, 0.437), (2, 4)), + ( + "This is a test. This is a monkey. 
This is a star.", + (0.428, 0.551, 0.449), + (2, 4), + ), + ], +) +def test_top_ngram_chr_fraction( + text: str, + top_ngram_chr_frac: List[float], + ngram_range: Tuple[int, int], + nlp: spacy.Language, +): + """Test the top_ngram_chr_fraction function.""" + doc = nlp(text) + top_ngram_fractions = top_ngram_chr_fraction(doc, ngram_range=ngram_range) + for i, j in zip(top_ngram_fractions.values(), top_ngram_chr_frac): + assert abs(i - j) < 0.01 + + +def test_quality_component(nlp: spacy.Language): + """Test the quality component.""" + nlp.add_pipe("quality") + doc = nlp("This is a test. This is a test. This is a test.") + assert doc._.quality["n_stop_words"] == 9 + assert doc._.quality["mean_word_length"] == 2.4 + assert doc._.quality["alpha_ratio"] == 0.8 + assert doc._.quality["proportion_bullet_points"] == 0 + assert doc._.quality["proportion_ellipsis"] == 0 + assert doc._.quality["symbol_#_2_word_ratio"] == 0 + assert doc._.quality["duplicate_5-gram_chr_fraction"] == 1 + assert abs(doc._.quality["top_2-gram_chr_fraction"] - 0.44) < 0.01 + assert doc._.passed_quality_check is False + + +def test_quality_component_with_config(nlp: spacy.Language): + """Test the quality component with config.""" + + quality_thresholds = { + "n_stop_words": (3, None), + "alpha_ratio": (None, 0.8), + "mean_word_length": (1, 10), + "doc_length": (10, 100_000), + "symbol_._2_word_ratio": (None, 0.3), + "proportion_ellipsis": (None, 0.3), + "proportion_bullet_points": (None, 0.8), + "duplicate_line_chr_fraction": (None, 0.2), + "duplicate_paragraph_chr_fraction": (None, 0.2), + "top_2-gram_chr_fraction": (None, 0.6), + "top_3-gram_chr_fraction": (None, 0.6), + "contains_lorem ipsum": False, + } + d = nlp.add_pipe( + "quality", config={"symbols": ["."], "quality_thresholds": quality_thresholds} + ) + + doc = nlp("This is a test. This is a test. This is a test.") + assert doc._.quality["n_stop_words"] == 9 + assert doc._.quality["mean_word_length"] == 2.4 + assert doc._.quality["alpha_ratio"] == 0.8 + assert doc._.quality["proportion_bullet_points"] == 0 + assert doc._.quality["proportion_ellipsis"] == 0 + assert doc._.quality["symbol_._2_word_ratio"] == 0.25 + assert doc._.quality["duplicate_5-gram_chr_fraction"] == 1 + assert doc._.quality["duplicate_8-gram_chr_fraction"] == 1 + assert abs(doc._.quality["top_3-gram_chr_fraction"] - 0.57) < 0.01 + assert doc._.passed_quality_check is True + + +@pytest.mark.parametrize( + "text, passed", + [ + ("", False), + ( + "This is a reasonable text, which has a very good sentence structure and " + + "will therefore pass the quality check.", + True, + ), + ( + "This is repitious text, This is repitious text, This is repitious text.", + False, + ), + ("This test has many symobls #!@#$%^&*()_+.", False), + ("- this is a text of \n - bullet points", False), + ], +) +def test_passed_quality_check(text: str, passed: bool, nlp: spacy.Language): + """Test the passed_quality_check attribute.""" + nlp.add_pipe("quality") + doc = nlp(text) + assert doc._.passed_quality_check == passed From b47d60273e6fa6d8c29b15a7860aabd3b9da2537 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sun, 25 Sep 2022 16:28:06 +0200 Subject: [PATCH 07/11] fix: Fixed erros in test due by forcing the extension to be set. 
--- textdescriptives/components/quality.py | 6 +++--- textdescriptives/tests/test_quality.py | 11 ++++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/textdescriptives/components/quality.py b/textdescriptives/components/quality.py index 3fc74810..dfd49208 100644 --- a/textdescriptives/components/quality.py +++ b/textdescriptives/components/quality.py @@ -320,7 +320,7 @@ def top_ngram_chr_fraction( count = count_span["count"] if count >= min_count: # calculate the fraction of the top n-gram - top_ngram_chr_frac[n] = len(ngram) / chr_len + top_ngram_chr_frac[n] = (len(ngram) * count) / chr_len else: top_ngram_chr_frac[n] = 0.0 @@ -465,9 +465,9 @@ def set_extensions(self): doc_getter = span_getter_to_doc_getter(span_getter) if not Span.has_extension(ext_name) or self.force is True: - Span.set_extension(ext_name, getter=span_getter) + Span.set_extension(ext_name, getter=span_getter, force=True) if not Doc.has_extension(ext_name) or self.force is True: - Doc.set_extension(ext_name, getter=doc_getter) + Doc.set_extension(ext_name, getter=doc_getter, force=True) @Language.factory( diff --git a/textdescriptives/tests/test_quality.py b/textdescriptives/tests/test_quality.py index 947b2ef9..6d510a88 100644 --- a/textdescriptives/tests/test_quality.py +++ b/textdescriptives/tests/test_quality.py @@ -180,7 +180,7 @@ def test_top_ngram_chr_fraction( def test_quality_component(nlp: spacy.Language): """Test the quality component.""" - nlp.add_pipe("quality") + nlp.add_pipe("quality", config={"force": True}) doc = nlp("This is a test. This is a test. This is a test.") assert doc._.quality["n_stop_words"] == 9 assert doc._.quality["mean_word_length"] == 2.4 @@ -211,7 +211,12 @@ def test_quality_component_with_config(nlp: spacy.Language): "contains_lorem ipsum": False, } d = nlp.add_pipe( - "quality", config={"symbols": ["."], "quality_thresholds": quality_thresholds} + "quality", + config={ + "symbols": ["."], + "quality_thresholds": quality_thresholds, + "force": True, + }, ) doc = nlp("This is a test. This is a test. This is a test.") @@ -246,6 +251,6 @@ def test_quality_component_with_config(nlp: spacy.Language): ) def test_passed_quality_check(text: str, passed: bool, nlp: spacy.Language): """Test the passed_quality_check attribute.""" - nlp.add_pipe("quality") + nlp.add_pipe("quality", config={"force": True}) doc = nlp(text) assert doc._.passed_quality_check == passed From 2aadd118882e35f390d49dddb0e99a4501e49ebb Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Sun, 25 Sep 2022 16:29:20 +0200 Subject: [PATCH 08/11] refactor: changed default of force to True --- textdescriptives/components/quality.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/textdescriptives/components/quality.py b/textdescriptives/components/quality.py index dfd49208..fa53d066 100644 --- a/textdescriptives/components/quality.py +++ b/textdescriptives/components/quality.py @@ -478,7 +478,7 @@ def set_extensions(self): "top_ngram_range": [2, 4], "top_ngram_min_count": 3, "duplicate_n_gram_fraction_range": [5, 10], - "force": False, + "force": True, "quality_thresholds": None, }, ) @@ -493,7 +493,7 @@ def create_quality_component( # pylint: disable=dangerous-default-value quality_thresholds: Optional[ Dict[str, Union[bool, Tuple[Optional[float], Optional[float]]]] ] = None, - force: bool = False, + force: bool = True, ) -> Callable[[Doc], Doc]: """Allows Quality to be added to a spaCy pipe using nlp.add_pipe("quality"). 
@@ -532,7 +532,6 @@ def create_quality_component(  # pylint: disable=dangerous-default-value
         be considered a top n-gram. Defaults to 3.
     duplicate_n_gram_fraction_range (Tuple[int]): range of n-grams to calculate
         the proportion of duplicate n-grams. Defaults to [5, 10].
-    force (bool): whether to overwrite existing extensions. Defaults to False.
     quality_thresholds (Dict[str, Union[bool, Tuple[Union[int, float, None],
         Union[int, float, None]]]]): A dictionary of quality thresholds indicated by
         either a range (Tuple), wherein the first value is the lower bound and the
@@ -545,6 +544,7 @@ def create_quality_component(  # pylint: disable=dangerous-default-value
         set an upper bound on the `duplicate_5-gram_chr_fraction`, you can set
         `quality_thresholds={"duplicate_5-gram_chr_fraction": (None, 0.15)}`.
         Default values are set in
         `textdescriptives.constants.DEFAULT_QUALITY_THRESHOLDS`.
+    force (bool): whether to overwrite existing extensions. Defaults to True.
 
 
     Returns:
 
From 92a70e87f0a366b74a3a978139b89cc3fea65c60 Mon Sep 17 00:00:00 2001
From: Lasse
Date: Mon, 26 Sep 2022 11:20:58 +0200
Subject: [PATCH 09/11] feat: add quality metrics to `Extractor`

---
 README.md                                 |  3 +-
 docs/quality.rst                          |  2 +-
 textdescriptives/components/quality.py    |  3 +-
 textdescriptives/dataframe_extract.py     | 60 +++++++++----------
 textdescriptives/load_components.py       |  2 +
 textdescriptives/tests/test_extractor.py  |  2 +-
 .../tests/test_load_components.py          |  1 +
 7 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/README.md b/README.md
index b3a5a688..660ecc0f 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,7 @@ A Python library for calculating a large variety of statistics from text(s) usin
 # 📰 News
 
+* New component: `quality` which implements a number of metrics for checking the quality of a document. See the [news](https://github.com/HLasse/TextDescriptives/blob/master/NEWS.md) for further information.
 * TextDescriptives has been completely re-implemented using spaCy v.3.0. The stanza implementation can be found in the `stanza_version` branch and will no longer be maintained.
 * Check out the brand new documentation [here](https://hlasse.github.io/TextDescriptives/)!
 
 See [NEWS.md](https://github.com/HLasse/TextDescriptives/blob/master/NEWS.md) for release notes (v. 1.0.5 and onwards)
@@ -93,7 +94,7 @@ The table below shows the metrics included in TextDescriptives and their attribu
 | `{Doc/Span}._.counts` | `descriptive_stats` | Dict containing the number of tokens, number of unique tokens, proportion unique tokens, and number of characters in the Doc/Span. |
 | `{Doc/Span}._.pos_proportions` | `pos_stats` | Dict of `{pos_prop_POSTAG: proportion of all tokens tagged with POSTAG}`. Does not create a key if no tokens in the document fit the POSTAG. |
 | `{Doc/Span}._.token_length` | `descriptive_stats` | Dict containing mean, median, and std of token length. |
-| `{Doc/Span}._.quality` | `quality` | Dict a series of heuristic metrics related to text quality. Targeted at filtering out low-quality text. |
+| `{Doc/Span}._.quality` | `quality` | Dict containing a number of heuristic metrics related to text quality. Targeted at filtering out low-quality text. |
 | `{Doc/Span}._.passed_quality_check` | `quality` | Boolean on whether the document or span passed threshold sets for quality checks. |
 
 
diff --git a/docs/quality.rst b/docs/quality.rst
index 1ef2a5f1..4ed79c9a 100644
--- a/docs/quality.rst
+++ b/docs/quality.rst
@@ -14,7 +14,7 @@ Heuristic quality metrics:
 * Symbol to word ratio (:code:`symbol_{symbol}_2_word_ratio`): Ratio of specified symbols to words, could e.g. include ratio of hashtags or curly brackets.
 * Contains string (:code:`contains_{string}`): Whether the document contains a specified string. For instance documents containing the string "lorem ipsum".
 
-Repititious text metrics:
+Repetitious text metrics:
 
 * Duplicate lines character fraction (:code:`duplicate_lines_chr_fraction`): Fraction of characters in a document which are contained within duplicate lines.
 * Duplicate paragraphs character fraction (:code:`duplicate_paragraphs_chr_fraction`): Fraction of characters in a document which are contained within duplicate paragraphs.
diff --git a/textdescriptives/components/quality.py b/textdescriptives/components/quality.py
index fa53d066..1d80b854 100644
--- a/textdescriptives/components/quality.py
+++ b/textdescriptives/components/quality.py
@@ -89,6 +89,7 @@ def proportion_bullet_points(  # pylint: disable=dangerous-default-value
 
     Args:
         span (Span): spaCy span object
+        bullet_point (set): set of bullet points
 
     Returns:
         float: proportion of bullet points
@@ -543,7 +544,7 @@ def create_quality_component(  # pylint: disable=dangerous-default-value
         set an upper bound on the `duplicate_5-gram_chr_fraction`, you can set
         `quality_thresholds={"duplicate_5-gram_chr_fraction": (None, 0.15)}`.
         Default values are set in
-        `textdescriptives.constants.DEFAULT_QUALITY_THRESHOLDS`.
+        `textdescriptives.components.quality.DEFAULT_QUALITY_THRESHOLDS`.
         force (bool): whether to overwrite existing extensions. Defaults to True.
 
 
diff --git a/textdescriptives/dataframe_extract.py b/textdescriptives/dataframe_extract.py
index 6af7b102..4104bf35 100644
--- a/textdescriptives/dataframe_extract.py
+++ b/textdescriptives/dataframe_extract.py
@@ -35,6 +35,7 @@ def __init__(
             "readability",
             "dependency_distance",
             "pos_stats",
+            "quality",
             "all",
         ]
     )
@@ -58,60 +59,55 @@ def __init__(
         extraction = []
 
         if "all" in metrics:
-            if doc.has_extension("counts"):
-                extraction.append(self.__descriptive_stats(doc))
-            if doc.has_extension("readability"):
-                extraction.append(self.__readability(doc))
-            if doc.has_extension("dependency_distance"):
-                extraction.append(self.__dependency_distance(doc))
-            if doc.has_extension("pos_proportions"):
-                extraction.append(self.__pos_proportions(doc))
+            for component in valid_metrics - set(["all"]):
+                if doc.has_extension(component) or component == "descriptive_stats":
+                    extraction.append(self.__unpack_extension(doc, component))
+
         else:
-            if "descriptive_stats" in metrics:
-                extraction.append(self.__descriptive_stats(doc))
-            if "readability" in metrics:
-                extraction.append(self.__readability(doc))
-            if "dependency_distance" in metrics:
-                extraction.append(self.__dependency_distance(doc))
-            if "pos_stats" in metrics:
-                extraction.append(self.__pos_proportins(doc))
+            for component in metrics:
+                if doc.has_extension(component) or component == "descriptive_stats":
+                    extraction.append(self.__unpack_extension(doc, component))
 
         if self.as_dict:
             self.out = reduce(lambda a, b: {**a, **b}, extraction)
         else:
             self.out = pd.concat(extraction, axis=1)
 
-    def __descriptive_stats(self, doc: Doc) -> pd.DataFrame:
+    def __get_descriptive_stats_dict(self, doc: Doc) -> dict:
         descriptive_stats = {
             **doc._.token_length,
             **doc._.sentence_length,
             **doc._.syllables,
             **doc._.counts,
         }
-        if self.as_dict:
-            return descriptive_stats
-        return pd.DataFrame.from_records([descriptive_stats])
+        return descriptive_stats
 
-    def __readability(self, doc: Doc) -> pd.DataFrame:
-        if self.as_dict:
-            return doc._.readability
-        return pd.DataFrame.from_records([doc._.readability])
+    def __unpack_extension(self, doc: Doc, extension: str) -> Union[dict, pd.DataFrame]:
+        """Unpack the values from the extension into a dict or DataFrame.
+
+        Args:
+            doc (Doc): Document to extract from
+            extension (str): Extension to extract
+
+        Returns:
+            Union[dict, pd.DataFrame]: the extension values
+        """
+        # doc.get_extension returns a tuple of (default, method, getter, setter);
+        # we only need the getter
+        if extension == "descriptive_stats":
+            values = self.__get_descriptive_stats_dict(doc)
+        else:
+            values = doc.get_extension(extension)[2](doc)
 
-    def __dependency_distance(self, doc: Doc) -> pd.DataFrame:
         if self.as_dict:
-            return doc._.dependency_distance
-        return pd.DataFrame.from_records([doc._.dependency_distance])
+            return values
+        return pd.DataFrame.from_records([values])
 
     def __extract_text(self, doc: Doc) -> Union[pd.DataFrame, str]:
         if self.as_dict:
             return {"text": doc.text}
         return pd.DataFrame([doc.text], columns=["text"])
 
-    def __pos_proportions(self, doc: Doc) -> pd.DataFrame:
-        if self.as_dict:
-            return doc._.pos_proportions
-        return pd.DataFrame.from_records([doc._.pos_proportions])
-
 
 def extract_df(
     doc: Doc, metrics: Union[List[str], str] = "all", include_text: bool = True
diff --git a/textdescriptives/load_components.py b/textdescriptives/load_components.py
index a19a5322..ffde8a96 100644
--- a/textdescriptives/load_components.py
+++ b/textdescriptives/load_components.py
@@ -4,6 +4,7 @@
     DependencyDistance,
     DescriptiveStatistics,
     POSStatistics,
+    Quality,
 )
 
 from spacy.language import Language
@@ -17,6 +18,7 @@ def create_textdescriptives_component(nlp: Language, name: str):
         "readability",
         "dependency_distance",
         "pos_stats",
+        "quality",
     ]:
         nlp.add_pipe(component, last=True)
     return TextDescriptives(nlp)
diff --git a/textdescriptives/tests/test_extractor.py b/textdescriptives/tests/test_extractor.py
index 5c204c57..9299cfe4 100644
--- a/textdescriptives/tests/test_extractor.py
+++ b/textdescriptives/tests/test_extractor.py
@@ -13,7 +13,7 @@ def nlp():
 def test_extract_df_single_doc(nlp):
     doc = nlp("This is just a cute little text. Actually, it's two sentences.")
     td.extract_df(doc)
-    for metric in ["descriptive_stats", "readability", "dependency_distance"]:
+    for metric in ["descriptive_stats", "readability", "dependency_distance", "quality"]:
         td.extract_df(doc, metrics=metric)
 
diff --git a/textdescriptives/tests/test_load_components.py b/textdescriptives/tests/test_load_components.py
index 587baa42..7ecd8ad8 100644
--- a/textdescriptives/tests/test_load_components.py
+++ b/textdescriptives/tests/test_load_components.py
@@ -17,6 +17,7 @@ def test_integration(nlp):
         "readability",
         "dependency_distance",
         "textdescriptives",
+        "quality",
     ]:
         assert component in nlp.pipe_names
 
From 03b9d84b7ea77ee57370fae7a0b46a7f57a6f757 Mon Sep 17 00:00:00 2001
From: Lasse
Date: Mon, 26 Sep 2022 11:21:42 +0200
Subject: [PATCH 10/11] docs: minor update to news

---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 31adf6e4..c44cdbdd 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,7 +1,7 @@
 # News
 
 ## v1.1.0 - 21st of September, 2022
-- Added the new pipe; "quality". This pipe implements a series of metrics related to text quality, some of which were used by Rae et al. (2021) and Raffel et al. (2020) to filter large text corpora.
+- Added the new pipe; "quality". This pipe implements a series of metrics related to text quality, some of which were used by Rae et al. (2021) and Raffel et al. (2020) to filter large text corpora. See the documentation for examples.
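+
+  A minimal usage sketch (assuming the `quality` factory is registered when `textdescriptives` is imported, as in the tests):
+
+      import spacy
+      import textdescriptives  # registers the "quality" component factory
+
+      nlp = spacy.blank("en")
+      nlp.add_pipe("quality")
+      doc = nlp("This is a test. This is a test. This is a test.")
+      doc._.quality  # dict of heuristic quality metrics
+      doc._.passed_quality_check  # False for this highly repetitive text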
 ## v1.0.7 - 4th May, 2022
 - Some minor fixes and bells and whistles.
 
From 15785993eee366bc2e8951f53fccbe02cbd4bf9c Mon Sep 17 00:00:00 2001
From: Lasse
Date: Mon, 26 Sep 2022 11:25:57 +0200
Subject: [PATCH 11/11] docs: add reference to DFM

---
 docs/quality.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/quality.rst b/docs/quality.rst
index 4ed79c9a..b9c816f6 100644
--- a/docs/quality.rst
+++ b/docs/quality.rst
@@ -27,6 +27,11 @@ These quality metrics were for example used by `Raffel et al. (2020)
 <https://arxiv.org/abs/1910.10683>`__ and `Rae et al. (2021) <https://arxiv.org/abs/2112.11446>`__` to filter large text corpora for
 pre-training language models.
 
+Note: this implementation is not optimized for speed, but rather for usability, simplicity, and spaCy integration.
+If you need to run quality filters on a large corpus, you should consider using the implementation from
+`Danish Foundation Models <https://github.com/centre-for-humanities-computing/danish-foundation-models>`__, which also
+includes a number of other quality filters and deduplication strategies.
+
 Quality Component
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~