Skip to content

Commit

Permalink
chore: pre-commit
Browse files Browse the repository at this point in the history
  • Loading branch information
HLasse committed Feb 14, 2023
1 parent 4f37b6b commit 2e4d069
Show file tree
Hide file tree
Showing 15 changed files with 93 additions and 107 deletions.
3 changes: 1 addition & 2 deletions src/textdescriptives/about.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
"""About textdescriptives, version number is specified in the setup.cfg
file."""
""" About textdescriptives, version number is specified in the setup.cfg file."""

# if python >= 3.8, use importlib.metadata otherwise use pkg_resources
try:
Expand Down
17 changes: 8 additions & 9 deletions src/textdescriptives/components/coherence.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ def n_order_coherence(doc: Doc, order: int) -> List[float]:


class Coherence:
"""Spacy v.3.0 component that adds attributes with coherence to `Doc` and
`Span` objects."""
"""Spacy v.3.0 component that adds attributes with coherence to `Doc` and `Span`
objects."""

def __init__(self, nlp: Language):
"""Initialise component."""
Expand All @@ -60,23 +60,22 @@ def __init__(self, nlp: Language):

@staticmethod
def _first_order_coherence(doc: Doc) -> List[float]:
"""Calculate first order coherence for a `Doc`, i.e. the semantic
similarity between consecutive sentences."""
"""Calculate first order coherence for a `Doc`, i.e. the semantic similarity
between consecutive sentences."""
return n_order_coherence(doc=doc, order=1)

@staticmethod
def _second_order_coherence(doc: Doc) -> List[float]:
"""Calculate second order coherence for a `Doc`, i.e. the semantic
similarity between sentences that are two sentences apart."""
"""Calculate second order coherence for a `Doc`, i.e. the semantic similarity
between sentences that are two sentences apart."""
return n_order_coherence(doc, order=2)

def coherence(self, doc: Doc) -> None:
"""Calculate mean semantic coherence for a `Doc` and set the coherence
attribute.
Coherence is calculated by taking the mean of the similarity
between sentence embeddings. See the documentation for more
details.
Coherence is calculated by taking the mean of the similarity between sentence
embeddings. See the documentation for more details.
"""
first_order_coherence = self._first_order_coherence(doc)
second_order_coherence = self._second_order_coherence(doc)
Expand Down
36 changes: 17 additions & 19 deletions src/textdescriptives/components/dependency_distance.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Calculation of statistics related to dependency distance."""
""" Calculation of statistics related to dependency distance."""
from typing import Callable

import numpy as np
Expand All @@ -7,13 +7,12 @@


class DependencyDistance:
"""spaCy v.3.0 component that adds attributes to `Doc`, `Span`, and `Token`
objects relating to dependency distance.
"""spaCy v.3.0 component that adds attributes to `Doc`, `Span`, and `Token` objects
relating to dependency distance.
Dependency distance can be used as a measure of syntactic
complexity, and measures the distance from a word to its head word.
For `Doc` objects, dependency distance is calculated on the sentence
level.
Dependency distance can be used as a measure of syntactic complexity, and measures
the distance from a word to its head word. For `Doc` objects, dependency distance is
calculated on the sentence level.
"""

def __init__(self, nlp: Language):
Expand All @@ -26,9 +25,9 @@ def __init__(self, nlp: Language):
Doc.set_extension("dependency_distance", getter=self.doc_dependency)

def token_dependency(self, token: Token) -> dict:
"""Calculate token level dependency distance, i.e. the distance from a
token to its head token. Also returns a boolean indicating whether the
dependency relation is adjacent to the token.
"""Calculate token level dependency distance, i.e. the distance from a token to
its head token. Also returns a boolean indicating whether the dependency
relation is adjacent to the token.
Returns:
dict: Dictionary with the following keys:
Expand All @@ -45,9 +44,9 @@ def token_dependency(self, token: Token) -> dict:
return {"dependency_distance": dep_dist, "adjacent_dependency": ajd_dep}

def span_dependency(self, span: Span) -> dict:
"""Aggregates token level dependency distance on the span level by
taking the mean of the dependency distance and the proportion of
adjacent dependency relations.
"""Aggregates token level dependency distance on the span level by taking the
mean of the dependency distance and the proportion of adjacent dependency
relations.
Returns:
dict: Dictionary with the following keys: dependency_distance_mean:
Expand All @@ -63,9 +62,9 @@ def span_dependency(self, span: Span) -> dict:
}

def doc_dependency(self, doc: Doc) -> dict:
"""Aggregates token level dependency distance on the document level by
taking the mean of the dependency distance and the proportion of
adjacent dependency relations on the sentence level.
"""Aggregates token level dependency distance on the document level by taking
the mean of the dependency distance and the proportion of adjacent dependency
relations on the sentence level.
Returns:
dict: Dictionary with the following keys:
Expand Down Expand Up @@ -112,9 +111,8 @@ def create_dependency_distance_component(
nlp: Language,
name: str,
) -> Callable[[Doc], Doc]:
"""Create spaCy language factory that allows DependencyDistance attributes
to be added to a pipe using
nlp.add_pipe("textdescriptives/dependency_distance")
"""Create spaCy language factory that allows DependencyDistance attributes to be
added to a pipe using nlp.add_pipe("textdescriptives/dependency_distance")
Adding this component to a pipeline sets the following attributes:
- `token._.dependency_distance`
Expand Down
21 changes: 10 additions & 11 deletions src/textdescriptives/components/descriptive_stats.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Calculation of descriptive statistics."""
""" Calculation of descriptive statistics."""
from typing import Callable, Dict, Union

import numpy as np
Expand All @@ -9,11 +9,11 @@


class DescriptiveStatistics:
"""spaCy v.3.0 component that adds attributes with descriptive statistics to
`Doc` and `Span` objects.
"""spaCy v.3.0 component that adds attributes with descriptive statistics to `Doc`
and `Span` objects.
The attributes relate to token and sentence length, number of
syllables, and counts of tokens and sentences.
The attributes relate to token and sentence length, number of syllables, and counts
of tokens and sentences.
"""

def __init__(self, nlp: Language):
Expand All @@ -40,8 +40,7 @@ def __init__(self, nlp: Language):
Doc.set_extension(extension_name, getter=getter_fun)

def token_length(self, doc: Union[Doc, Span]) -> dict:
"""Calculate mean, median and std of token length for a `Doc` or
`Span`.
"""Calculate mean, median and std of token length for a `Doc` or `Span`.
Returns:
dict: token_length_mean, token_length_median, token_length_std
Expand Down Expand Up @@ -88,8 +87,8 @@ def sentence_length(self, doc: Doc) -> dict:
}

def syllables(self, doc: Doc) -> dict:
"""Calculate mean, median and std of syllables per token for a `Doc`.
Uses `Pyphen` for hyphenation.
"""Calculate mean, median and std of syllables per token for a `Doc`. Uses
`Pyphen` for hyphenation.
Returns:
dict: syllables_per_token_mean, syllables_per_token_median,
Expand All @@ -109,8 +108,8 @@ def syllables(self, doc: Doc) -> dict:
}

def counts(self, doc: Union[Doc, Span], ignore_whitespace: bool = True) -> dict:
"""Calculate counts of tokens, unique tokens, and characters for a
`Doc` or `Span`. Adds number of sentences for `Doc` objects.
"""Calculate counts of tokens, unique tokens, and characters for a `Doc` or
`Span`. Adds number of sentences for `Doc` objects.
Args:
ignore_whitespace: if True, whitespace is not counted as a character when
Expand Down
14 changes: 7 additions & 7 deletions src/textdescriptives/components/information_theory.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Calculate the entropy and perplexity of a corpus."""
""" Calculate the entropy and perplexity of a corpus."""

from typing import Callable, Dict, Union

Expand Down Expand Up @@ -31,8 +31,8 @@ def entropy(log_probs=np.ndarray) -> float:
def perplexity(entropy: float) -> float:
"""Calculates the perplexity.
Calculated as exp(H(p)), where H is the entropy using a base e and p
is the probabilities of a given word.
Calculated as exp(H(p)), where H is the entropy using a base e and p is the
probabilities of a given word.
"""
return np.exp(entropy)

Expand All @@ -46,8 +46,8 @@ def entropy_getter(doc: Union[Doc, Span], log_prob_attr: str = "prob") -> float:
def perplexity_getter(doc: Union[Doc, Span]) -> float:
"""Calculates the perplexity of a doc.
Calculated as exp(H(p)), where H is the entropy using a base e and p
is the probabilities of a given word.
Calculated as exp(H(p)), where H is the entropy using a base e and p is the
probabilities of a given word.
"""
# check if it has the attribute entropy
if hasattr(doc._, "entropy"):
Expand Down Expand Up @@ -82,8 +82,8 @@ def set_docspan_extension(


class InformationTheory:
"""SpaCy component for adding information theoretic metrics such as entropy
and perplexity."""
"""SpaCy component for adding information theoretic metrics such as entropy and
perplexity."""

def __init__(self, nlp: Language, name: str, force: bool) -> None:
self.name = name
Expand Down
10 changes: 5 additions & 5 deletions src/textdescriptives/components/pos_proportions.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Calculation of statistics that require a pos-tagger in the pipeline."""
""" Calculation of statistics that require a pos-tagger in the pipeline."""

from typing import Callable, Counter, Union

Expand All @@ -7,8 +7,8 @@


class POSProportions:
"""spaCy v.3.0 component that adds attributes for POS statistics to `Doc`
and `Span` objects."""
"""spaCy v.3.0 component that adds attributes for POS statistics to `Doc` and
`Span` objects."""

def __init__(self, nlp: Language, use_pos: bool):
"""Initialise components.
Expand All @@ -26,8 +26,8 @@ def __init__(self, nlp: Language, use_pos: bool):
Span.set_extension("pos_proportions", getter=self.pos_proportions)

def pos_proportions(self, text: Union[Doc, Span]) -> dict:
"""Calculates the proportion of tokens in a `Doc`|`Span` that are
tagged with each POS tag.
"""Calculates the proportion of tokens in a `Doc`|`Span` that are tagged with
each POS tag.
Returns:
Dict containing {pos_prop_POSTAG: proportion of all tokens tagged with
Expand Down
21 changes: 9 additions & 12 deletions src/textdescriptives/components/quality.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Component for calculating quality metrics."""
""" Component for calculating quality metrics."""
from collections import Counter, defaultdict
from typing import Callable, Dict, List, Mapping, Optional, Tuple, Union

Expand Down Expand Up @@ -37,8 +37,8 @@ def mean_word_length(span: Union[Doc, Span]) -> float:


def alpha_ratio(span: Union[Doc, Span]) -> float:
"""The percentage of spacy tokens in this document which contain at least
one alphabetic character.
"""The percentage of spacy tokens in this document which contain at least one
alphabetic character.
Args:
span (Union[Doc, Span]): A spaCy Doc or Span object
Expand All @@ -63,8 +63,7 @@ def proportion_bullet_points( # pylint: disable=dangerous-default-value
span: Union[Doc, Span],
bullet_point: set = {"-", "*"},
) -> float:
"""Calculate the proportion of lines which start with a bullet point in a
span.
"""Calculate the proportion of lines which start with a bullet point in a span.
Args:
span (Union[Doc, Span]): A spaCy Doc or Span object
Expand Down Expand Up @@ -240,9 +239,9 @@ def duplicate_ngram_fraction(
span: Union[Span, Doc],
ngram_range: Tuple[int, int],
) -> Dict[int, float]:
"""Calculates the character fraction of duplicate n-gram over the overall
text, taking care not to count overlapping n-grams twice. This does not
include spaces between the n-grams.
"""Calculates the character fraction of duplicate n-gram over the overall text,
taking care not to count overlapping n-grams twice. This does not include spaces
between the n-grams.
Args:
span (Union[Span, Doc]): A spaCy Span or Doc object.
Expand Down Expand Up @@ -352,11 +351,9 @@ def oov_ratio(span: Union[Span, Doc], vocab: Optional[Mapping] = None) -> float:


class Quality:
"""spaCy component for adding text quality metrics to the `Doc` and `Span`
objects.
"""spaCy component for adding text quality metrics to the `Doc` and `Span` objects.
Extracts metrics and returns them as a dictionary as the ._.quality
attribute.
Extracts metrics and returns them as a dictionary as the ._.quality attribute.
"""

def __init__( # pylint: disable=dangerous-default-value
Expand Down
13 changes: 6 additions & 7 deletions src/textdescriptives/components/quality_data_classes.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Data classes used for the quality component."""
""" Data classes used for the quality component."""
from typing import Any, Dict, Optional, Tuple, Union

from pydantic import BaseModel, Extra, Field
Expand All @@ -7,10 +7,9 @@


class ThresholdsOutput(BaseModel):
"""An output which contains three items. 1) a threshold which is either
an interval or an accepted boolean value. 2) a value which is the value of
the metric. 3) a boolean which is True if the value is within the
thresholds.
"""An output which contains three items. 1) a threshold which is either an
interval or an accepted boolean value. 2) a value which is the value of the metric.
3) a boolean which is True if the value is within the thresholds.
Example:
>>> t_out = ThresholdsOutput(threshold=(0, 2), value=2)
Expand Down Expand Up @@ -257,8 +256,8 @@ def __repr_str__(self, join_str: str) -> str:
)

def to_flat_value_dict(self) -> Dict[str, Any]:
"""Creates a flat dictionary representation of the object to allow for
easy conversion to a pandas DataFrame."""
"""Creates a flat dictionary representation of the object to allow for easy
conversion to a pandas DataFrame."""
flat_dict = {"passed_quality_check": self.passed}

for k, v in self.__dict__.items():
Expand Down
Loading

0 comments on commit 2e4d069

Please sign in to comment.