Skip to content

Commit

Permalink
Merge pull request #88 from HLasse/HLasse/Separate-component-loaders
Browse files Browse the repository at this point in the history
feat: Separate component loaders
  • Loading branch information
HLasse authored Dec 14, 2022
2 parents 2b87c85 + 2fdee32 commit faa7bb8
Show file tree
Hide file tree
Showing 17 changed files with 65 additions and 59 deletions.
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# News

## v2.0.0 - X 2022
- All components have been renamed to have the `textdescriptives.` prefix. I.e. components should now be loaded with e.g. `nlp.add_pipe("textdescriptives.descriptive_stats")`.
`textdescriptives.all` can be used to load all components at once.
- `pos_stats` has been renamed to `pos_proportions` for consistency.

## v1.1.0 - 21st of September, 2022
- Added a new pipe: "quality". This pipe implements a series of metrics related to text quality, some of which were used by Rae et al. (2021) and Raffel et al. (2020) to filter large text corpora. See the documentation for examples.

Expand Down
2 changes: 1 addition & 1 deletion textdescriptives/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from .components import ( # noqa: F401
DependencyDistance,
DescriptiveStatistics,
POSStatistics,
POSProportions,
Quality,
Readability,
)
Expand Down
2 changes: 1 addition & 1 deletion textdescriptives/components/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .dependency_distance import DependencyDistance # noqa: F401
from .descriptive_stats import DescriptiveStatistics # noqa: F401
from .pos_stats import POSStatistics # noqa: F401
from .pos_proportions import POSProportions # noqa: F401
from .quality import Quality # noqa: F401
from .readability import Readability # noqa: F401
5 changes: 3 additions & 2 deletions textdescriptives/components/dependency_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@
from spacy.tokens import Doc, Span, Token


@Language.factory("dependency_distance")
@Language.factory("textdescriptives.dependency_distance")
def create_dependency_distance_component(nlp: Language, name: str):
"""Create spaCy language factory that allows DependencyDistance attributes to be added to a pipe using nlp.add_pipe("dependency_distance")"""
"""Create spaCy language factory that allows DependencyDistance attributes to be
added to a pipe using nlp.add_pipe("textdescriptives.dependency_distance")"""
return DependencyDistance(nlp)


Expand Down
4 changes: 2 additions & 2 deletions textdescriptives/components/descriptive_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
from .utils import filtered_tokens, n_sentences, n_syllables, n_tokens


@Language.factory("descriptive_stats")
@Language.factory("textdescriptives.descriptive_stats")
def create_descriptive_stats_component(nlp: Language, name: str):
"""Allows DescriptiveStatistics to be added to a spaCy pipe using nlp.add_pipe("descriptive_stats").
"""Allows DescriptiveStatistics to be added to a spaCy pipe using nlp.add_pipe("textdescriptives.descriptive_stats").
If the pipe does not contain a parser or sentencizer, the sentencizer component is silently added."""
sentencizers = set(["sentencizer", "parser"])
if not sentencizers.intersection(set(nlp.pipe_names)):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,19 @@
from spacy.tokens import Doc, Span


@Language.factory("pos_stats", default_config={"use_pos": True})
@Language.factory("textdescriptives.pos_proportions", default_config={"use_pos": True})
def create_pos_stats_component(nlp: Language, name: str, use_pos: bool) -> Language:
"""Allows PosStats to be added to a spaCy pipe using nlp.add_pipe("pos_stats")"""
"""Allows POSProportions to be added to a spaCy pipe using nlp.add_pipe("textdescriptives.pos_proportions")"""

tagger = {"tagger", "attribute_ruler"}
if not tagger.intersection(set(nlp.pipe_names)):
raise ValueError(
"The pipeline does not contain a component for POS tagging. Please load a spaCy model which includes a 'tagger' or an 'attribute ruler' component."
)
return POSStatistics(nlp, use_pos=use_pos)
return POSProportions(nlp, use_pos=use_pos)


class POSStatistics:
class POSProportions:
"""spaCy v.3.0 component that adds attributes for POS statistics to `Doc` and `Span` objects."""

def __init__(self, nlp: Language, use_pos: bool):
Expand Down
4 changes: 2 additions & 2 deletions textdescriptives/components/quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ def top_ngram_chr_fraction(
ngram_range: Tuple[int, int],
min_count: int = 0,
) -> float:
"""Calculated whether the character fraction of the top n-grams is below the
"""Calculates whether the character fraction of the top n-grams is below the
given thresholds
Args:
Expand Down Expand Up @@ -472,7 +472,7 @@ def set_extensions(self):


@Language.factory(
"quality",
"textdescriptives.quality",
default_config={
"symbols": ["#"],
"contains": ["lorem ipsum"],
Expand Down
8 changes: 4 additions & 4 deletions textdescriptives/components/readability.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,16 @@
from .descriptive_stats import create_descriptive_stats_component # noqa


@Language.factory("readability")
@Language.factory("textdescriptives.readability")
def create_readability_component(nlp: Language, name: str):
"""Allows Readability to be added to a spaCy pipe using nlp.add_pipe("readability").
"""Allows Readability to be added to a spaCy pipe using nlp.add_pipe("textdescriptives.readability").
Readability requires attributes from DescriptiveStatistics and adds it to the
pipe if it is not already loaded."""
if "descriptive_stats" not in nlp.pipe_names:
if "textdescriptives.descriptive_stats" not in nlp.pipe_names:
print(
"'descriptive_stats' component is required for 'readability'. Adding to pipe."
)
nlp = nlp.add_pipe("descriptive_stats")
nlp = nlp.add_pipe("textdescriptives.descriptive_stats")
return Readability(nlp)


Expand Down
6 changes: 2 additions & 4 deletions textdescriptives/dataframe_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def __init__(
"descriptive_stats",
"readability",
"dependency_distance",
"pos_stats",
"pos_proportions",
"quality",
"all",
}
Expand Down Expand Up @@ -78,7 +78,7 @@ def __get_descriptive_stats_dict(self, doc: Doc) -> pd.DataFrame:
}

def __unpack_extension(self, doc: Doc, extension: str) -> pd.DataFrame:
"""Unpacks the the values from the extension to a dict or dataframe
"""Unpacks the values from the extension to a dict or dataframe
Args:
doc (Doc): Document to extract from
Expand All @@ -91,8 +91,6 @@ def __unpack_extension(self, doc: Doc, extension: str) -> pd.DataFrame:
# we only need the getter
if extension == "descriptive_stats":
values = self.__get_descriptive_stats_dict(doc)
elif extension == "pos_stats":
values = doc.get_extension("pos_proportions")[2](doc)
else:
values = doc.get_extension(extension)[2](doc)

Expand Down
22 changes: 11 additions & 11 deletions textdescriptives/load_components.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
"""Adds all components to a spaCy pipeline"""
from spacy.language import Language
from spacy.tokens import Doc

from .components import (
Readability,
DependencyDistance,
DescriptiveStatistics,
POSStatistics,
POSProportions,
Quality,
Readability,
)

from spacy.language import Language
from spacy.tokens import Doc


@Language.factory("textdescriptives")
@Language.factory("textdescriptives.all")
def create_textdescriptives_component(nlp: Language, name: str):
for component in [
"descriptive_stats",
"readability",
"dependency_distance",
"pos_stats",
"quality",
"textdescriptives.descriptive_stats",
"textdescriptives.readability",
"textdescriptives.dependency_distance",
"textdescriptives.pos_proportions",
"textdescriptives.quality",
]:
nlp.add_pipe(component, last=True)
return TextDescriptives(nlp)
Expand Down
4 changes: 2 additions & 2 deletions textdescriptives/tests/test_dependency_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@
@pytest.fixture(scope="function")
def nlp():
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("dependency_distance")
nlp.add_pipe("textdescriptives.dependency_distance")
return nlp


def test_dependency_distance_integration(nlp):
assert "dependency_distance" == nlp.pipe_names[-1]
assert "textdescriptives.dependency_distance" == nlp.pipe_names[-1]


def test_dependency_distance(nlp):
Expand Down
4 changes: 2 additions & 2 deletions textdescriptives/tests/test_descriptive_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,12 @@
def nlp():
nlp = English()
nlp.add_pipe("sentencizer")
nlp.add_pipe("descriptive_stats")
nlp.add_pipe("textdescriptives.descriptive_stats")
return nlp


def test_descriptive_stats_integration(nlp):
assert "descriptive_stats" == nlp.pipe_names[-1]
assert "textdescriptives.descriptive_stats" == nlp.pipe_names[-1]


def test_descriptive_stats(nlp):
Expand Down
2 changes: 1 addition & 1 deletion textdescriptives/tests/test_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
@pytest.fixture(scope="function")
def nlp():
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textdescriptives")
nlp.add_pipe("textdescriptives.all")
return nlp


Expand Down
18 changes: 10 additions & 8 deletions textdescriptives/tests/test_load_components.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,25 @@
import spacy
import pytest
import spacy

from textdescriptives import TextDescriptives


@pytest.fixture(scope="function")
def nlp():
nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textdescriptives")
nlp.add_pipe("textdescriptives.all")
return nlp


def test_integration(nlp):
assert nlp.pipe_names[-1] == "textdescriptives"
assert nlp.pipe_names[-1] == "textdescriptives.all"
for component in [
"descriptive_stats",
"readability",
"dependency_distance",
"textdescriptives",
"quality",
"textdescriptives.descriptive_stats",
"textdescriptives.readability",
"textdescriptives.dependency_distance",
"textdescriptives.all",
"textdescriptives.quality",
"textdescriptives.pos_proportions",
]:
assert component in nlp.pipe_names

Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import pytest
import spacy
from spacy.tokens import Doc
from spacy.lang.en import English
import pytest
from textdescriptives.components import POSStatistics
from spacy.tokens import Doc

from textdescriptives.components import POSProportions


@pytest.fixture(scope="function")
def nlp():
nlp = spacy.load("en_core_web_sm", disable=("ner", "textcat"))
nlp.add_pipe("pos_stats")
nlp.add_pipe("textdescriptives.pos_proportions")

return nlp

Expand Down Expand Up @@ -76,7 +77,7 @@ def doc(nlp):


def test_pos_integrations(nlp):
assert "pos_stats" == nlp.pipe_names[-1]
assert "textdescriptives.pos_proportions" == nlp.pipe_names[-1]


def test_pos_proportions_doc(doc):
Expand Down
6 changes: 3 additions & 3 deletions textdescriptives/tests/test_quality.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ def test_top_ngram_chr_fraction(

def test_quality_component(nlp: spacy.Language):
"""Test the quality component."""
nlp.add_pipe("quality", config={"force": True})
nlp.add_pipe("textdescriptives.quality", config={"force": True})
doc = nlp("This is a test. This is a test. This is a test.")
assert doc._.quality["n_stop_words"] == 9
assert doc._.quality["mean_word_length"] == 2.4
Expand Down Expand Up @@ -211,7 +211,7 @@ def test_quality_component_with_config(nlp: spacy.Language):
"contains_lorem ipsum": False,
}
d = nlp.add_pipe(
"quality",
"textdescriptives.quality",
config={
"symbols": ["."],
"quality_thresholds": quality_thresholds,
Expand Down Expand Up @@ -251,6 +251,6 @@ def test_quality_component_with_config(nlp: spacy.Language):
)
def test_passed_quality_check(text: str, passed: bool, nlp: spacy.Language):
"""Test the passed_quality_check attribute."""
nlp.add_pipe("quality", config={"force": True})
nlp.add_pipe("textdescriptives.quality", config={"force": True})
doc = nlp(text)
assert doc._.passed_quality_check == passed
13 changes: 6 additions & 7 deletions textdescriptives/tests/test_readability.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,22 @@
import ftfy
import numpy as np
import pytest
from spacy.lang.en import English

from textdescriptives.components import Readability
from .books import *

from spacy.lang.en import English

import numpy as np
import ftfy
from .books import *


@pytest.fixture(scope="function")
def nlp():
nlp = English()
nlp.add_pipe("readability")
nlp.add_pipe("textdescriptives.readability")
return nlp


def test_readability_integration(nlp):
assert "readability" == nlp.pipe_names[-1]
assert "textdescriptives.readability" == nlp.pipe_names[-1]


def test_readability(nlp):
Expand Down

0 comments on commit faa7bb8

Please sign in to comment.