Skip to content

Commit

Permalink
feat: add quality metrics to Extractor
Browse files Browse the repository at this point in the history
  • Loading branch information
HLasse committed Sep 26, 2022
1 parent 2aadd11 commit 92a70e8
Show file tree
Hide file tree
Showing 7 changed files with 37 additions and 36 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ A Python library for calculating a large variety of statistics from text(s) usin

# 📰 News

* New component: `quality` which implements a bunch of metrics for checking the quality of a document. See the [news](https://github.com/HLasse/TextDescriptives/blob/master/NEWS.md) for further information.
* TextDescriptives has been completely re-implemented using spaCy v.3.0. The stanza implementation can be found in the `stanza_version` branch and will no longer be maintained.
* Check out the brand new documentation [here](https://hlasse.github.io/TextDescriptives/)!
See [NEWS.md](https://github.com/HLasse/TextDescriptives/blob/master/NEWS.md) for release notes (v. 1.0.5 and onwards)
Expand Down Expand Up @@ -93,7 +94,7 @@ The table below shows the metrics included in TextDescriptives and their attribu
| `{Doc/Span}._.counts` | `descriptive_stats` | Dict containing the number of tokens, number of unique tokens, proportion unique tokens, and number of characters in the Doc/Span. |
| `{Doc/Span}._.pos_proportions` | `pos_stats` | Dict of `{pos_prop_POSTAG: proportion of all tokens tagged with POSTAG}`. Does not create a key if no tokens in the document fit the POSTAG. |
| `{Doc/Span}._.token_length` | `descriptive_stats` | Dict containing mean, median, and std of token length. |
| `{Doc/Span}._.quality` | `quality` | Dict a series of heuristic metrics related to text quality. Targeted at filtering out low-quality text. |
| `{Doc/Span}._.quality` | `quality` | Dict containing a number of heuristic metrics related to text quality. Targeted at filtering out low-quality text. |
| `{Doc/Span}._.passed_quality_check` | `quality` | Boolean indicating whether the document or span passed the thresholds set for the quality checks. |


Expand Down
2 changes: 1 addition & 1 deletion docs/quality.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Heuristic quality metrics:
* Symbol to word ratio (:code:`symbol_{symbol}_2_word_ratio`): Ratio of specified symbols to words, could e.g. include ratio of hashtags or curly brackets.
* Contains string (:code:`contains_{string}`): Whether the document contains a specified string. For instance documents containing the string "lorem ipsum".

Repititious text metrics:
Repetitious text metrics:

* Duplicate lines character fraction (:code:`duplicate_lines_chr_fraction`): Fraction of characters in a document which are contained within duplicate lines.
* Duplicate paragraphs character fraction (:code:`duplicate_paragraphs_chr_fraction`): Fraction of characters in a document which are contained within duplicate paragraphs.
Expand Down
3 changes: 2 additions & 1 deletion textdescriptives/components/quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ def proportion_bullet_points( # pylint: disable=dangerous-default-value
Args:
span (Span): spaCy span object
bullet_point (set): set of bullet points
Returns:
float: proportion of bullet points
Expand Down Expand Up @@ -543,7 +544,7 @@ def create_quality_component( # pylint: disable=dangerous-default-value
set an upper bound on the `duplicate_5-gram_chr_fraction`, you can set
`quality_thresholds={"duplicate_5-gram_chr_fraction": (None, 0.15)}`.
Default values are set in
`textdescriptives.constants.DEFAULT_QUALITY_THRESHOLDS`.
`textdescriptives.components.quality.DEFAULT_QUALITY_THRESHOLDS`.
force (bool): whether to overwrite existing extensions. Defaults to True.
Expand Down
60 changes: 28 additions & 32 deletions textdescriptives/dataframe_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def __init__(
"readability",
"dependency_distance",
"pos_stats",
"quality",
"all",
]
)
Expand All @@ -58,60 +59,55 @@ def __init__(
extraction = []

if "all" in metrics:
if doc.has_extension("counts"):
extraction.append(self.__descriptive_stats(doc))
if doc.has_extension("readability"):
extraction.append(self.__readability(doc))
if doc.has_extension("dependency_distance"):
extraction.append(self.__dependency_distance(doc))
if doc.has_extension("pos_proportions"):
extraction.append(self.__pos_proportions(doc))
for component in valid_metrics - set(["all"]):
if doc.has_extension(component) or component == "descriptive_stats":
extraction.append(self.__unpack_extension(doc, component))

else:
if "descriptive_stats" in metrics:
extraction.append(self.__descriptive_stats(doc))
if "readability" in metrics:
extraction.append(self.__readability(doc))
if "dependency_distance" in metrics:
extraction.append(self.__dependency_distance(doc))
if "pos_stats" in metrics:
extraction.append(self.__pos_proportins(doc))
for component in metrics:
if doc.has_extension(component) or component == "descriptive_stats":
extraction.append(self.__unpack_extension(doc, component))

if self.as_dict:
self.out = reduce(lambda a, b: {**a, **b}, extraction)
else:
self.out = pd.concat(extraction, axis=1)

def __descriptive_stats(self, doc: Doc) -> pd.DataFrame:
def __get_descriptive_stats_dict(self, doc: Doc) -> pd.DataFrame:
descriptive_stats = {
**doc._.token_length,
**doc._.sentence_length,
**doc._.syllables,
**doc._.counts,
}
if self.as_dict:
return descriptive_stats
return pd.DataFrame.from_records([descriptive_stats])
return descriptive_stats

def __readability(self, doc: Doc) -> pd.DataFrame:
if self.as_dict:
return doc._.readability
return pd.DataFrame.from_records([doc._.readability])
def __unpack_extension(self, doc: Doc, extension: str) -> pd.DataFrame:
"""Unpacks the values from the extension to a dict or dataframe
Args:
doc (Doc): Document to extract from
extension (str): Extension to extract
Returns:
pd.DataFrame: DataFrame with extension values
"""
# doc.get_extension returns a tuple of (default, method, getter, setter)
# we only need the getter
if extension == "descriptive_stats":
values = self.__get_descriptive_stats_dict(doc)
else:
values = doc.get_extension(extension)[2](doc)

def __dependency_distance(self, doc: Doc) -> pd.DataFrame:
if self.as_dict:
return doc._.dependency_distance
return pd.DataFrame.from_records([doc._.dependency_distance])
return values
return pd.DataFrame.from_records([values])

def __extract_text(self, doc: Doc) -> Union[pd.DataFrame, str]:
if self.as_dict:
return {"text": doc.text}
return pd.DataFrame([doc.text], columns=["text"])

def __pos_proportions(self, doc: Doc) -> pd.DataFrame:
if self.as_dict:
return doc._.pos_proportions
return pd.DataFrame.from_records([doc._.pos_proportions])


def extract_df(
doc: Doc, metrics: Union[List[str], str] = "all", include_text: bool = True
Expand Down
2 changes: 2 additions & 0 deletions textdescriptives/load_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
DependencyDistance,
DescriptiveStatistics,
POSStatistics,
Quality
)

from spacy.language import Language
Expand All @@ -17,6 +18,7 @@ def create_textdescriptives_component(nlp: Language, name: str):
"readability",
"dependency_distance",
"pos_stats",
"quality"
]:
nlp.add_pipe(component, last=True)
return TextDescriptives(nlp)
Expand Down
2 changes: 1 addition & 1 deletion textdescriptives/tests/test_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ def nlp():
def test_extract_df_single_doc(nlp):
doc = nlp("This is just a cute little text. Actually, it's two sentences.")
td.extract_df(doc)
for metric in ["descriptive_stats", "readability", "dependency_distance"]:
for metric in ["descriptive_stats", "readability", "dependency_distance", "quality"]:
td.extract_df(doc, metrics=metric)


Expand Down
1 change: 1 addition & 0 deletions textdescriptives/tests/test_load_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def test_integration(nlp):
"readability",
"dependency_distance",
"textdescriptives",
"quality"
]:
assert component in nlp.pipe_names

Expand Down

0 comments on commit 92a70e8

Please sign in to comment.