feat: add quality metrics to Extractor

HLasse · Sep 26, 2022 · 92a70e8 · 92a70e8
1 parent 2aadd11
commit 92a70e8
Show file tree

Hide file tree

Showing 7 changed files with 37 additions and 36 deletions.
diff --git a/README.md b/README.md
@@ -16,6 +16,7 @@ A Python library for calculating a large variety of statistics from text(s) usin
 
 # 📰 News
 
+* New component: `quality` which implements a bunch of metrics for checking the quality of a document. See the [news](https://github.com/HLasse/TextDescriptives/blob/master/NEWS.md) for further information. 
 * TextDescriptives has been completely re-implemented using spaCy v.3.0. The stanza implementation can be found in the `stanza_version` branch and will no longer be maintained. 
 * Check out the brand new documentation [here](https://hlasse.github.io/TextDescriptives/)!
 See [NEWS.md](https://github.com/HLasse/TextDescriptives/blob/master/NEWS.md) for release notes (v. 1.0.5 and onwards)
@@ -93,7 +94,7 @@ The table below shows the metrics included in TextDescriptives and their attribu
 | `{Doc/Span}._.counts`               | `descriptive_stats`   | Dict containing the number of tokens, number of unique tokens, proportion unique tokens, and number of characters in the Doc/Span.                                           |
 | `{Doc/Span}._.pos_proportions`      | `pos_stats`           | Dict of `{pos_prop_POSTAG: proportion of all tokens tagged with POSTAG}`. Does not create a key if no tokens in the document fit the POSTAG.                                 |
 | `{Doc/Span}._.token_length`         | `descriptive_stats`   | Dict containing mean, median, and std of token length.                                                                                                                       |
-| `{Doc/Span}._.quality`              | `quality`             | Dict a series of heuristic metrics related to text quality. Targeted at filtering out low-quality text.                                                                      |
+| `{Doc/Span}._.quality`              | `quality`             | Dict containing a number of heuristic metrics related to text quality. Targeted at filtering out low-quality text.                                                                      |
 | `{Doc/Span}._.passed_quality_check` | `quality`             | Boolean on whether the document or span passed threshold sets for quality checks.                                                                                            |
 
 

diff --git a/docs/quality.rst b/docs/quality.rst
@@ -14,7 +14,7 @@ Heuristic quality metrics:
 * Symbol to word ratio (:code:`symbol_{symbol}_2_word_ratio`): Ratio of specified symbols to words, could e.g. include ratio of hashtags or curly brackets.
 * Contains string (:code:`contains_{string}`): Whether the document contains a specified string. For instance documents containing the string "lorem ipsum".
 
-Repititious text metrics:
+Repetitious text metrics:
 
 * Duplicate lines character fraction (:code:`duplicate_lines_chr_fraction`): Fraction of characters in a document which are contained within duplicate lines.
 * Duplicate paragraphs character fraction (:code:`duplicate_paragraphs_chr_fraction`): Fraction of characters in a document which are contained within duplicate paragraphs.

diff --git a/textdescriptives/components/quality.py b/textdescriptives/components/quality.py
@@ -89,6 +89,7 @@ def proportion_bullet_points(  # pylint: disable=dangerous-default-value
 
     Args:
         span (Span): spaCy span object
+        bullet_point (set): set of bullet points
 
     Returns:
         float: proportion of bullet points
@@ -543,7 +544,7 @@ def create_quality_component(  # pylint: disable=dangerous-default-value
             set a upper bound on the `duplicate_5-gram_chr_fraction`, you can set
             `quality_thresholds={"duplicate_5-gram_chr_fraction": (None, 0.15)}`.
             Default values are set in
-            `textdescriptives.constants.DEFAULT_QUALITY_THRESHOLDS`.
+            `textdescriptives.components.quality.DEFAULT_QUALITY_THRESHOLDS`.
         force (bool): whether to overwrite existing extensions. Defaults to True.
 
 

diff --git a/textdescriptives/dataframe_extract.py b/textdescriptives/dataframe_extract.py
@@ -35,6 +35,7 @@ def __init__(
                 "readability",
                 "dependency_distance",
                 "pos_stats",
+                "quality",
                 "all",
             ]
         )
@@ -58,60 +59,55 @@ def __init__(
             extraction = []
 
         if "all" in metrics:
-            if doc.has_extension("counts"):
-                extraction.append(self.__descriptive_stats(doc))
-            if doc.has_extension("readability"):
-                extraction.append(self.__readability(doc))
-            if doc.has_extension("dependency_distance"):
-                extraction.append(self.__dependency_distance(doc))
-            if doc.has_extension("pos_proportions"):
-                extraction.append(self.__pos_proportions(doc))
+            for component in valid_metrics - set(["all"]):
+                if doc.has_extension(component) or component == "descriptive_stats":
+                    extraction.append(self.__unpack_extension(doc, component))
+
         else:
-            if "descriptive_stats" in metrics:
-                extraction.append(self.__descriptive_stats(doc))
-            if "readability" in metrics:
-                extraction.append(self.__readability(doc))
-            if "dependency_distance" in metrics:
-                extraction.append(self.__dependency_distance(doc))
-            if "pos_stats" in metrics:
-                extraction.append(self.__pos_proportins(doc))
+            for component in metrics:
+                if doc.has_extension(component) or component == "descriptive_stats":
+                    extraction.append(self.__unpack_extension(doc, component))
 
         if self.as_dict:
             self.out = reduce(lambda a, b: {**a, **b}, extraction)
         else:
             self.out = pd.concat(extraction, axis=1)
 
-    def __descriptive_stats(self, doc: Doc) -> pd.DataFrame:
+    def __get_descriptive_stats_dict(self, doc: Doc) -> pd.DataFrame:
         descriptive_stats = {
             **doc._.token_length,
             **doc._.sentence_length,
             **doc._.syllables,
             **doc._.counts,
         }
-        if self.as_dict:
-            return descriptive_stats
-        return pd.DataFrame.from_records([descriptive_stats])
+        return descriptive_stats
 
-    def __readability(self, doc: Doc) -> pd.DataFrame:
-        if self.as_dict:
-            return doc._.readability
-        return pd.DataFrame.from_records([doc._.readability])
+    def __unpack_extension(self, doc: Doc, extension: str) -> pd.DataFrame:
+        """Unpacks the values from the extension to a dict or dataframe
+
+        Args:
+            doc (Doc): Document to extract from
+            extension (str): Extension to extract
+
+        Returns:
+            Union[dict, pd.DataFrame]: dict if `self.as_dict` is set, else a one-row DataFrame
+        """
+        # doc.get_extension returns a tuple of (default, method, getter, setter)
+        # we only need the getter
+        if extension == "descriptive_stats":
+            values = self.__get_descriptive_stats_dict(doc)
+        else:
+            values = doc.get_extension(extension)[2](doc)
 
-    def __dependency_distance(self, doc: Doc) -> pd.DataFrame:
         if self.as_dict:
-            return doc._.dependency_distance
-        return pd.DataFrame.from_records([doc._.dependency_distance])
+            return values
+        return pd.DataFrame.from_records([values])
 
     def __extract_text(self, doc: Doc) -> Union[pd.DataFrame, str]:
         if self.as_dict:
             return {"text": doc.text}
         return pd.DataFrame([doc.text], columns=["text"])
 
-    def __pos_proportions(self, doc: Doc) -> pd.DataFrame:
-        if self.as_dict:
-            return doc._.pos_proportions
-        return pd.DataFrame.from_records([doc._.pos_proportions])
-
 
 def extract_df(
     doc: Doc, metrics: Union[List[str], str] = "all", include_text: bool = True

diff --git a/textdescriptives/load_components.py b/textdescriptives/load_components.py
@@ -4,6 +4,7 @@
     DependencyDistance,
     DescriptiveStatistics,
     POSStatistics,
+    Quality
 )
 
 from spacy.language import Language
@@ -17,6 +18,7 @@ def create_textdescriptives_component(nlp: Language, name: str):
         "readability",
         "dependency_distance",
         "pos_stats",
+        "quality"
     ]:
         nlp.add_pipe(component, last=True)
     return TextDescriptives(nlp)

diff --git a/textdescriptives/tests/test_extractor.py b/textdescriptives/tests/test_extractor.py
@@ -13,7 +13,7 @@ def nlp():
 def test_extract_df_single_doc(nlp):
     doc = nlp("This is just a cute little text. Actually, it's two sentences.")
     td.extract_df(doc)
-    for metric in ["descriptive_stats", "readability", "dependency_distance"]:
+    for metric in ["descriptive_stats", "readability", "dependency_distance", "quality"]:
         td.extract_df(doc, metrics=metric)
 
 

diff --git a/textdescriptives/tests/test_load_components.py b/textdescriptives/tests/test_load_components.py
@@ -17,6 +17,7 @@ def test_integration(nlp):
         "readability",
         "dependency_distance",
         "textdescriptives",
+        "quality"
     ]:
         assert component in nlp.pipe_names