ci: Added mypy and refactored quality thresholds

HLasse · Dec 22, 2022 · 69ea0fd · 69ea0fd
1 parent defb5fc
commit 69ea0fd
Show file tree

Hide file tree

Showing 8 changed files with 157 additions and 80 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -33,3 +33,8 @@ repos:
     hooks:
       - id: ruff
         args: [--fix]
+
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v0.991
+    hooks:
+      - id: mypy
diff --git a/requirements.txt b/requirements.txt
@@ -9,6 +9,8 @@ pytest>=7.1.3,<7.3.0
 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz
 en_core_web_sm==3.2.0
 
-# style
+# style, development
 black==22.8.0
-pre-commit
+pre-commit==2.20.0
+ruff==0.0.191
+mypy==0.991
diff --git a/setup.cfg b/setup.cfg
@@ -33,7 +33,7 @@ zip_safe = false
 install_requires =
     spacy>=3.1.0<3.3.0
     numpy>=1.20.0,<1.24.0
-    pandas>=1.0.0,<1.5.0
+    pandas>=1.0.0,<1.6.0
     pyphen>=0.11.0,<0.12.0
     ftfy>=6.0.3,<6.1.0
 setup_requires =

diff --git a/textdescriptives/__init__.py b/textdescriptives/__init__.py
@@ -1,9 +1,4 @@
 from .about import __title__, __version__  # noqa: F401
-from .components import DependencyDistance  # noqa: F401
-from .components import DescriptiveStatistics  # noqa: F401
-from .components import POSProportions  # noqa: F401
-from .components import Quality  # noqa: F401
-from .components import Readability  # noqa: F401
 from .extract import extract_df, extract_dict  # noqa: F401
 from .load_components import TextDescriptives  # noqa: F401
-from .utils import get_columns, get_valid_metrics  # noqa: F401
+from .utils import get_assigns, get_valid_metrics  # noqa: F401
diff --git a/textdescriptives/components/quality.py b/textdescriptives/components/quality.py
@@ -4,30 +4,119 @@
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
 import numpy as np
+from pydantic import BaseModel, Field
 from spacy.language import Language
 from spacy.tokens import Doc, Span
 
-DEFAULT_QUALITY_THRESHOLDS = {
-    "n_stop_words": (2, None),
-    "alpha_ratio": (0.8, None),
-    "mean_word_length": (3, 10),
-    "doc_length": (10, 100_000),
-    "symbol_#_2_word_ratio": (None, 0.1),
-    "proportion_ellipsis": (None, 0.3),
-    "proportion_bullet_points": (None, 0.8),
-    "duplicate_line_chr_fraction": (None, 0.2),
-    "duplicate_paragraph_chr_fraction": (None, 0.2),
-    "duplicate_5-gram_chr_fraction": (None, 0.15),
-    "duplicate_6-gram_chr_fraction": (None, 0.14),
-    "duplicate_7-gram_chr_fraction": (None, 0.13),
-    "duplicate_8-gram_chr_fraction": (None, 0.12),
-    "duplicate_9-gram_chr_fraction": (None, 0.11),
-    "duplicate_10-gram_chr_fraction": (None, 0.1),
-    "top_2-gram_chr_fraction": (None, 0.20),
-    "top_3-gram_chr_fraction": (None, 0.18),
-    "top_4-gram_chr_fraction": (None, 0.16),
-    "contains_lorem ipsum": False,
-}
+RangeType = Tuple[Optional[float], Optional[float]]
+
+
+class QualityThresholds(BaseModel):
+    n_stop_words: RangeType = Field(
+        (2, None),
+        description="A Range for the number of stop words. Default: (2, None), i.e. "
+        + "at least 2 stop words, but no upper limit.",
+    )
+    alpha_ratio: RangeType = Field(
+        (0.8, None),
+        description="A Range for the alpha ratio. Default: (0.8, None), i.e. at "
+        + "least 80% of tokens contain at least one alphabetic character, but no "
+        + "upper limit.",
+    )
+    mean_word_length: RangeType = Field(
+        (3, 10),
+        description="A Range for the mean word length. Default: (3, 10), i.e. between"
+        + " 3 and 10 characters.",
+    )
+    doc_length: RangeType = Field(
+        (10, 100_000),
+        description="A Range for the document length. Default: (10, 100_000), i.e."
+        + " between 10 and 100_000 characters.",
+    )
+    symbol_hashtag_to_word_ratio: RangeType = Field(
+        (None, 0.1),
+        description="A Range for the symbol hashtag to word ratio. Default: "
+        + "(None, 0.1), i.e. no lower limit, but at most 10% of tokens are hashtags.",
+    )
+    proportion_ellipsis: RangeType = Field(
+        (None, 0.3),
+        description="A Range for the proportion of ellipsis. Default: (None, 0.3), "
+        + "i.e. no lower limit, but at most 30% of lines end with an ellipsis.",
+    )
+    proportion_bullet_points: RangeType = Field(
+        (None, 0.8),
+        description="A Range for the proportion of bullet points. Default: "
+        + "(None, 0.8), i.e. no lower limit, but at most 80% of lines start with a"
+        + " bullet point.",
+    )
+    duplicate_line_chr_fraction: RangeType = Field(
+        (None, 0.2),
+        description="A Range for the duplicate line character fraction. Default: "
+        + "(None, 0.2), i.e. no lower limit, but at most 20% of characters are"
+        + " duplicates.",
+    )
+    duplicate_paragraph_chr_fraction: RangeType = Field(
+        (None, 0.2),
+        description="A Range for the duplicate paragraph character fraction. Default:"
+        + " (None, 0.2), i.e. no lower limit, but at most 20% of characters are "
+        + "duplicates.",
+    )
+    duplicate_5gram_chr_fraction: RangeType = Field(
+        (None, 0.15),
+        description="A Range for the duplicate 5-gram character fraction. Default: "
+        + "(None, 0.15), i.e. no lower limit, but at most 15% of characters are "
+        + "duplicates.",
+    )
+    duplicate_6gram_chr_fraction: RangeType = Field(
+        (None, 0.14),
+        description="A Range for the duplicate 6-gram character fraction. Default: "
+        + "(None, 0.14), i.e. no lower limit, but at most 14% of characters are "
+        + "duplicates.",
+    )
+    duplicate_7gram_chr_fraction: RangeType = Field(
+        (None, 0.13),
+        description="A Range for the duplicate 7-gram character fraction. Default: "
+        + "(None, 0.13), i.e. no lower limit, but at most 13% of characters are "
+        + "duplicates.",
+    )
+    duplicate_8gram_chr_fraction: RangeType = Field(
+        (None, 0.12),
+        description="A Range for the duplicate 8-gram character fraction. Default: "
+        + "(None, 0.12), i.e. no lower limit, but at most 12% of characters are "
+        + "duplicates.",
+    )
+    duplicate_9gram_chr_fraction: RangeType = Field(
+        (None, 0.11),
+        description="A Range for the duplicate 9-gram character fraction. Default: "
+        + "(None, 0.11), i.e. no lower limit, but at most 11% of characters are "
+        + "duplicates.",
+    )
+    duplicate_10gram_chr_fraction: RangeType = Field(
+        (None, 0.1),
+        description="A Range for the duplicate 10-gram character fraction. Default:"
+        + " (None, 0.1), i.e. no lower limit, but at most 10% of characters are "
+        + "duplicates.",
+    )
+    top_2gram_chr_fraction: RangeType = Field(
+        (None, 0.20),
+        description="A Range for the top 2-gram character fraction. Default: (None,"
+        + " 0.20), i.e. no lower limit, but at most 20% of characters are duplicates.",
+    )
+    top_3gram_chr_fraction: RangeType = Field(
+        (None, 0.18),
+        description="A Range for the top 3-gram character fraction. Default: (None,"
+        + " 0.18), i.e. no lower limit, but at most 18% of characters are duplicates.",
+    )
+    top_4gram_chr_fraction: RangeType = Field(
+        (None, 0.16),
+        description="A Range for the top 4-gram character fraction. Default: (None, "
+        + "0.16), i.e. no lower limit, but at most 16% of characters are duplicates.",
+    )
+    contains_lorem_ipsum: bool = Field(
+        False,
+        description="Whether the document contains the string 'lorem ipsum'. Default: "
+        + "False.",
+    )
 
 
 def n_stop_words(span: Span) -> int:
@@ -223,21 +312,24 @@ def symbol_2_word_ratio(span: Span, symbol: str) -> float:
     return 0.0
 
 
-def span_ngrams(span: Span, ngram_range: Tuple[int, int]) -> Dict[str, Counter]:
+def span_ngrams(
+    span: Span,
+    ngram_range: Tuple[int, int],
+) -> Dict[int, Dict[str, Union[int, List[Span]]]]:
     """Calculates the counts of n-grams in the specified range.
 
     Args:
         span (Span): spaCy span object
         ngram_range (Tuple[int, int]): The n-gram range.
 
     Returns:
-        Dict[int, Dict[str, int, List[Span]]]: A dictionary that for each n in the ngram
-            range contains the counts of the n-grams as well as the spans of the
-            n-grams.
+        Dict[int, Dict[str, Union[int, List[Span]]]]: A dictionary that for each n in
+            the ngram range contains the counts of the n-grams as well as the spans of
+            the n-grams.
     """
     max_len = len(span)
     lower, upper = ngram_range
-    shingles_count = {
+    shingles_count = {  # type: ignore
         n: defaultdict(lambda: {"count": 0, "span": []})
         for n in range(lower, upper + 1)
     }
@@ -247,9 +339,11 @@ def span_ngrams(span: Span, ngram_range: Tuple[int, int]) -> Dict[str, Counter]:
             if not end > max_len:
                 ngram_span = span[i:end]
                 ngram = ngram_span.text
-                shingles_count[ngram_size][ngram]["count"] += 1
-                shingles_count[ngram_size][ngram]["span"].append(ngram_span)
-    return shingles_count
+                shingles_count[ngram_size][ngram]["count"] += 1  # type: ignore
+                shingles_count[ngram_size][ngram]["span"].append(  # type: ignore
+                    ngram_span,
+                )
+    return shingles_count  # type: ignore
 
 
 def duplicate_ngram_fraction(
@@ -280,8 +374,8 @@ def duplicate_ngram_fraction(
         is_duplicate = np.zeros(max_len, dtype=bool)
         # set duplicate tokens to True
         for ngram, count in ngrams.items():
-            if count["count"] > 1:
-                for ngram_span in count["span"]:
+            if count["count"] > 1:  # type: ignore
+                for ngram_span in count["span"]:  # type: ignore
                     is_duplicate[ngram_span.start : ngram_span.end] = True
 
         duplicate_chars = 0
@@ -297,7 +391,7 @@ def top_ngram_chr_fraction(
     span: Span,
     ngram_range: Tuple[int, int],
     min_count: int = 0,
-) -> float:
+) -> Dict[int, float]:
     """Calculates the character fraction of the top ngrams.
 
     Args:
@@ -307,7 +401,8 @@ def top_ngram_chr_fraction(
             a top n-gram. Defaults to 0.
 
     Returns:
-        float: The fraction of the top n-grams.
+        Dict[int, float]: for each n-gram size, the fraction of characters
+            covered by the most frequent n-gram of that size
     """
     # check if span has enough tokens within the range
 
@@ -322,9 +417,9 @@ def top_ngram_chr_fraction(
         if ngram_counter[n]:
             ngram, count_span = max(
                 ngram_counter[n].items(),
-                key=lambda x: x[1]["count"],
+                key=lambda x: x[1]["count"],  # type: ignore
             )
-            count = count_span["count"]
+            count = count_span["count"]  # type: ignore
             if count >= min_count:
                 # calculate the fraction of the top n-gram
                 top_ngram_chr_frac[n] = (len(ngram) * count) / chr_len
@@ -366,9 +461,7 @@ def __init__(  # pylint: disable=dangerous-default-value
         top_ngram_range: Tuple[int, int],
         top_ngram_min_count: int,
         duplicate_n_gram_fraction_range: Tuple[int, int],
-        quality_thresholds: Optional[
-            Dict[str, Union[bool, Tuple[Optional[float], Optional[float]]]]
-        ] = None,
+        quality_thresholds: Optional[QualityThresholds] = None,
         force: bool = False,
     ):  # noqa: D107
         """Initialise components."""
@@ -380,7 +473,7 @@ def __init__(  # pylint: disable=dangerous-default-value
         self.top_ngram_min_count = top_ngram_min_count
         self.duplicate_n_gram_fraction_range = duplicate_n_gram_fraction_range
         if quality_thresholds is None:
-            quality_thresholds: dict = DEFAULT_QUALITY_THRESHOLDS
+            quality_thresholds = QualityThresholds()
         self.quality_thresholds = quality_thresholds
 
         self.getters = {
@@ -433,21 +526,21 @@ def quality_getter(self, span: Span) -> Dict[str, Union[float, int, bool]]:
         quality = {}
         for name, getter in self.getters.items():
             if name == "top_ngram_chr_fraction":
-                chr_frac = getter(span)
+                chr_frac = getter(span)  # type: ignore
                 for n_gram, frac in chr_frac.items():
                     quality[f"top_{n_gram}-gram_chr_fraction"] = frac
             elif name == "duplicate_ngram_chr_fraction":
-                chr_frac = getter(span)
+                chr_frac = getter(span)  # type: ignore
                 for n_gram, frac in chr_frac.items():
                     quality[f"duplicate_{n_gram}-gram_chr_fraction"] = frac
             else:
-                quality[name] = getter(span)
+                quality[name] = getter(span)  # type: ignore
         return quality
 
     def passed_quality_thresholds(self, span: Span) -> bool:
         """Checks whether a span passed the quality thresholds."""
         quality = span._.quality
-        for name, threshold in self.quality_thresholds.items():
+        for name, threshold in self.quality_thresholds.dict().items():
             if name not in quality:
                 raise KeyError(f"Quality metric {name} not found in doc._.quality")
             if isinstance(threshold, bool):
@@ -508,9 +601,7 @@ def create_quality_component(  # pylint: disable=dangerous-default-value
     top_ngram_range: Tuple[int, int],
     top_ngram_min_count: int,
     duplicate_n_gram_fraction_range: Tuple[int, int],
-    quality_thresholds: Optional[
-        Dict[str, Union[bool, Tuple[Optional[float], Optional[float]]]]
-    ] = None,
+    quality_thresholds: Optional[QualityThresholds] = None,
     force: bool = True,
 ) -> Callable[[Doc], Doc]:
     """Allows Quality to be added to a spaCy pipe using
@@ -551,18 +642,11 @@ def create_quality_component(  # pylint: disable=dangerous-default-value
             be considered a top n-gram. Defaults to 3.
         duplicate_n_gram_fraction_range (Tuple[int]): range of n-grams to
             calculate the proportion of duplicate n-grams. Defaults to [5, 10].
-        quality_thresholds (Dict[str, Union[bool, Tuple[Union[int, float, None],
-            Union[int, float, None]]]]): A dictionary of quality thresholds indicated by
-            either a range (Tuple), wherein the first value is the lower bound and the
-            second value is the upper bound. Lower and upper bounds can be None, in
-            which case they are not checked. Alternatively, a boolean can be provided,
-            checking if the quality metric is boolean. For example, if you  don't want
-            documents containing `lorem ipsum`, to pass the quality check, you can set
-            `quality_thresholds={"contains_lorem_ipsum": False}`. Similar if you want to
-            set a upper bound on the `duplicate_5-gram_chr_fraction`, you can set
-            `quality_thresholds={"duplicate_5-gram_chr_fraction": (None, 0.15)}`.
-            Default values are set in
-            `textdescriptives.components..quality.DEFAULT_QUALITY_THRESHOLDS`.
+        quality_thresholds (Optional[QualityThresholds]): A QualityThresholds object
+            whose fields are each either a range (Tuple), wherein the first value is
+            the lower bound and the second value is the upper bound, or a boolean.
+            Defaults to None, in which case a QualityThresholds object with its
+            default values is used.
         force (bool): whether to overwrite existing extensions. Defaults to True.
 
 

diff --git a/textdescriptives/extract.py b/textdescriptives/extract.py
@@ -1,19 +1,14 @@
 """Extract metrics as Pandas DataFrame."""
-import types
-from collections import defaultdict
-from functools import reduce
 from typing import Any, Dict, Iterable, List, Union
 
 import pandas as pd
-from spacy import Language
 from spacy.tokens import Doc
 
 from textdescriptives.utils import get_valid_metrics
 
 
 def __get_quality(doc: Doc) -> dict:
-    """Get quality metrics as well as boolean indicator for passing
-    filters."""
+    """Get quality metrics as well as boolean indicator for passing filters."""
     return {**doc._.quality, "passed_quality_check": doc._.passed_quality_check}
 
 
@@ -63,7 +58,6 @@ def extract_dict(
 
     # extract textdescriptive metrics from the list of spacy Language factory
     valid_metrics = get_valid_metrics()
-    valid_metrics.update({"all"})
 
     if isinstance(metrics, str):
         metrics = [metrics]
@@ -108,7 +102,4 @@ def extract_df(
     Returns:
         pd.DataFrame: DataFrame with a row for each doc and column for each metric.
     """
-    if isinstance(docs, Doc):
-        docs = [docs]
-
     return pd.DataFrame(extract_dict(docs, metrics, include_text))