Merge pull request #88 from HLasse/HLasse/Separate-component-loaders

feat: Separate component loaders
HLasse · Dec 14, 2022 · faa7bb8 · faa7bb8
2 parents 2b87c85 + 2fdee32
commit faa7bb8
Show file tree

Hide file tree

Showing 17 changed files with 65 additions and 59 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # News
 
+## v2.0.0 - X 2022
+- All components have been renamed to have the `textdescriptives.` prefix. I.e. components should now be loaded with e.g. `nlp.add_pipe("textdescriptives.descriptive_stats")`.
+`textdescriptives.all` can be used to load all components at once. 
+- `pos_stats` has been renamed to `pos_proportions` for consistency.
+
 ## v1.1.0 - 21st of September, 2022
 - Added the new pipe; "quality". This pipe implements a series of metrics related to text quality, some of which were used by Rae et al. (2021) and Raffel et al. (2020) to filter large text corpora. See the documentation for examples.
 

diff --git a/textdescriptives/__init__.py b/textdescriptives/__init__.py
@@ -2,7 +2,7 @@
 from .components import (  # noqa: F401
     DependencyDistance,
     DescriptiveStatistics,
-    POSStatistics,
+    POSProportions,
     Quality,
     Readability,
 )

diff --git a/textdescriptives/components/__init__.py b/textdescriptives/components/__init__.py
@@ -1,5 +1,5 @@
 from .dependency_distance import DependencyDistance  # noqa: F401
 from .descriptive_stats import DescriptiveStatistics  # noqa: F401
-from .pos_stats import POSStatistics  # noqa: F401
+from .pos_proportions import POSProportions  # noqa: F401
 from .quality import Quality  # noqa: F401
 from .readability import Readability  # noqa: F401
diff --git a/textdescriptives/components/dependency_distance.py b/textdescriptives/components/dependency_distance.py
@@ -4,9 +4,10 @@
 from spacy.tokens import Doc, Span, Token
 
 
-@Language.factory("dependency_distance")
+@Language.factory("textdescriptives.dependency_distance")
 def create_dependency_distance_component(nlp: Language, name: str):
-    """Create spaCy language factory that allows DependencyDistance attributes to be added to a pipe using nlp.add_pipe("dependency_distance")"""
+    """Create spaCy language factory that allows DependencyDistance attributes to be 
+    added to a pipe using nlp.add_pipe("textdescriptives.dependency_distance")"""
     return DependencyDistance(nlp)
 
 

diff --git a/textdescriptives/components/descriptive_stats.py b/textdescriptives/components/descriptive_stats.py
@@ -8,9 +8,9 @@
 from .utils import filtered_tokens, n_sentences, n_syllables, n_tokens
 
 
-@Language.factory("descriptive_stats")
+@Language.factory("textdescriptives.descriptive_stats")
 def create_descriptive_stats_component(nlp: Language, name: str):
-    """Allows DescriptiveStatistics to be added to a spaCy pipe using nlp.add_pipe("descriptive_stats").
+    """Allows DescriptiveStatistics to be added to a spaCy pipe using nlp.add_pipe("textdescriptives.descriptive_stats").
     If the pipe does not contain a parser or sentencizer, the sentencizer component is silently added."""
     sentencizers = set(["sentencizer", "parser"])
     if not sentencizers.intersection(set(nlp.pipe_names)):

diff --git a/textdescriptives/components/pos_stats.py b/textdescriptives/components/pos_proportions.py
@@ -6,19 +6,19 @@
 from spacy.tokens import Doc, Span
 
 
-@Language.factory("pos_stats", default_config={"use_pos": True})
+@Language.factory("textdescriptives.pos_proportions", default_config={"use_pos": True})
 def create_pos_stats_component(nlp: Language, name: str, use_pos: bool) -> Language:
-    """Allows PosStats to be added to a spaCy pipe using nlp.add_pipe("pos_stats")"""
+    """Allows POSProportions to be added to a spaCy pipe using nlp.add_pipe("textdescriptives.pos_proportions")"""
 
     tagger = {"tagger", "attribute_ruler"}
     if not tagger.intersection(set(nlp.pipe_names)):
         raise ValueError(
             "The pipeline does not contain a component for POS tagging. Please load a spaCy model which includes a 'tagger' or an 'attribute ruler' component."
         )
-    return POSStatistics(nlp, use_pos=use_pos)
+    return POSProportions(nlp, use_pos=use_pos)
 
 
-class POSStatistics:
+class POSProportions:
     """spaCy v.3.0 component that adds attributes for POS statistics to `Doc` and `Span` objects."""
 
     def __init__(self, nlp: Language, use_pos: bool):

diff --git a/textdescriptives/components/quality.py b/textdescriptives/components/quality.py
@@ -297,7 +297,7 @@ def top_ngram_chr_fraction(
     ngram_range: Tuple[int, int],
     min_count: int = 0,
 ) -> float:
-    """Calculated whether the character fraction of the top n-grams is below the
+    """Calculates whether the character fraction of the top n-grams is below the
     given thresholds
 
     Args:
@@ -472,7 +472,7 @@ def set_extensions(self):
 
 
 @Language.factory(
-    "quality",
+    "textdescriptives.quality",
     default_config={
         "symbols": ["#"],
         "contains": ["lorem ipsum"],

diff --git a/textdescriptives/components/readability.py b/textdescriptives/components/readability.py
@@ -8,16 +8,16 @@
 from .descriptive_stats import create_descriptive_stats_component  # noqa
 
 
-@Language.factory("readability")
+@Language.factory("textdescriptives.readability")
 def create_readability_component(nlp: Language, name: str):
-    """Allows Readability to be added to a spaCy pipe using nlp.add_pipe("readability").
+    """Allows Readability to be added to a spaCy pipe using nlp.add_pipe("textdescriptives.readability").
     Readability requires attributes from DescriptiveStatistics and adds it to the
     pipe if it is not already loaded."""
-    if "descriptive_stats" not in nlp.pipe_names:
+    if "textdescriptives.descriptive_stats" not in nlp.pipe_names:
         print(
             "'descriptive_stats' component is required for 'readability'. Adding to pipe."
         )
-        nlp = nlp.add_pipe("descriptive_stats")
+        nlp = nlp.add_pipe("textdescriptives.descriptive_stats")
     return Readability(nlp)
 
 

diff --git a/textdescriptives/dataframe_extract.py b/textdescriptives/dataframe_extract.py
@@ -33,7 +33,7 @@ def __init__(
             "descriptive_stats",
             "readability",
             "dependency_distance",
-            "pos_stats",
+            "pos_proportions",
             "quality",
             "all",
         }
@@ -78,7 +78,7 @@ def __get_descriptive_stats_dict(self, doc: Doc) -> pd.DataFrame:
         }
 
     def __unpack_extension(self, doc: Doc, extension: str) -> pd.DataFrame:
-        """Unpacks the the values from the extension to a dict or dataframe
+        """Unpacks the values from the extension to a dict or dataframe
 
         Args:
             doc (Doc): Document to extract from
@@ -91,8 +91,6 @@ def __unpack_extension(self, doc: Doc, extension: str) -> pd.DataFrame:
         # we only need the getter
         if extension == "descriptive_stats":
             values = self.__get_descriptive_stats_dict(doc)
-        elif extension == "pos_stats":
-            values = doc.get_extension("pos_proportions")[2](doc)
         else:
             values = doc.get_extension(extension)[2](doc)
 

diff --git a/textdescriptives/load_components.py b/textdescriptives/load_components.py
@@ -1,24 +1,24 @@
 """Adds all components to a spaCy pipeline"""
+from spacy.language import Language
+from spacy.tokens import Doc
+
 from .components import (
-    Readability,
     DependencyDistance,
     DescriptiveStatistics,
-    POSStatistics,
+    POSProportions,
     Quality,
+    Readability,
 )
 
-from spacy.language import Language
-from spacy.tokens import Doc
-
 
-@Language.factory("textdescriptives")
+@Language.factory("textdescriptives.all")
 def create_textdescriptives_component(nlp: Language, name: str):
     for component in [
-        "descriptive_stats",
-        "readability",
-        "dependency_distance",
-        "pos_stats",
-        "quality",
+        "textdescriptives.descriptive_stats",
+        "textdescriptives.readability",
+        "textdescriptives.dependency_distance",
+        "textdescriptives.pos_proportions",
+        "textdescriptives.quality",
     ]:
         nlp.add_pipe(component, last=True)
     return TextDescriptives(nlp)

diff --git a/textdescriptives/tests/test_dependency_distance.py b/textdescriptives/tests/test_dependency_distance.py
@@ -9,12 +9,12 @@
 @pytest.fixture(scope="function")
 def nlp():
     nlp = spacy.load("en_core_web_sm")
-    nlp.add_pipe("dependency_distance")
+    nlp.add_pipe("textdescriptives.dependency_distance")
     return nlp
 
 
 def test_dependency_distance_integration(nlp):
-    assert "dependency_distance" == nlp.pipe_names[-1]
+    assert "textdescriptives.dependency_distance" == nlp.pipe_names[-1]
 
 
 def test_dependency_distance(nlp):

diff --git a/textdescriptives/tests/test_descriptive_stats.py b/textdescriptives/tests/test_descriptive_stats.py
@@ -7,12 +7,12 @@
 def nlp():
     nlp = English()
     nlp.add_pipe("sentencizer")
-    nlp.add_pipe("descriptive_stats")
+    nlp.add_pipe("textdescriptives.descriptive_stats")
     return nlp
 
 
 def test_descriptive_stats_integration(nlp):
-    assert "descriptive_stats" == nlp.pipe_names[-1]
+    assert "textdescriptives.descriptive_stats" == nlp.pipe_names[-1]
 
 
 def test_descriptive_stats(nlp):

diff --git a/textdescriptives/tests/test_extractor.py b/textdescriptives/tests/test_extractor.py
@@ -6,7 +6,7 @@
 @pytest.fixture(scope="function")
 def nlp():
     nlp = spacy.load("en_core_web_sm")
-    nlp.add_pipe("textdescriptives")
+    nlp.add_pipe("textdescriptives.all")
     return nlp
 
 

diff --git a/textdescriptives/tests/test_load_components.py b/textdescriptives/tests/test_load_components.py
@@ -1,23 +1,25 @@
-import spacy
 import pytest
+import spacy
+
 from textdescriptives import TextDescriptives
 
 
 @pytest.fixture(scope="function")
 def nlp():
     nlp = spacy.load("en_core_web_sm")
-    nlp.add_pipe("textdescriptives")
+    nlp.add_pipe("textdescriptives.all")
     return nlp
 
 
 def test_integration(nlp):
-    assert nlp.pipe_names[-1] == "textdescriptives"
+    assert nlp.pipe_names[-1] == "textdescriptives.all"
     for component in [
-        "descriptive_stats",
-        "readability",
-        "dependency_distance",
-        "textdescriptives",
-        "quality",
+        "textdescriptives.descriptive_stats",
+        "textdescriptives.readability",
+        "textdescriptives.dependency_distance",
+        "textdescriptives.all",
+        "textdescriptives.quality",
+        "textdescriptives.pos_proportions",
     ]:
         assert component in nlp.pipe_names
 

diff --git a/textdescriptives/tests/test_pos_stats.py b/textdescriptives/tests/test_pos_proportions.py
@@ -1,14 +1,15 @@
+import pytest
 import spacy
-from spacy.tokens import Doc
 from spacy.lang.en import English
-import pytest
-from textdescriptives.components import POSStatistics
+from spacy.tokens import Doc
+
+from textdescriptives.components import POSProportions
 
 
 @pytest.fixture(scope="function")
 def nlp():
     nlp = spacy.load("en_core_web_sm", disable=("ner", "textcat"))
-    nlp.add_pipe("pos_stats")
+    nlp.add_pipe("textdescriptives.pos_proportions")
 
     return nlp
 
@@ -76,7 +77,7 @@ def doc(nlp):
 
 
 def test_pos_integrations(nlp):
-    assert "pos_stats" == nlp.pipe_names[-1]
+    assert "textdescriptives.pos_proportions" == nlp.pipe_names[-1]
 
 
 def test_pos_proportions_doc(doc):

diff --git a/textdescriptives/tests/test_quality.py b/textdescriptives/tests/test_quality.py
@@ -180,7 +180,7 @@ def test_top_ngram_chr_fraction(
 
 def test_quality_component(nlp: spacy.Language):
     """Test the quality component."""
-    nlp.add_pipe("quality", config={"force": True})
+    nlp.add_pipe("textdescriptives.quality", config={"force": True})
     doc = nlp("This is a test. This is a test. This is a test.")
     assert doc._.quality["n_stop_words"] == 9
     assert doc._.quality["mean_word_length"] == 2.4
@@ -211,7 +211,7 @@ def test_quality_component_with_config(nlp: spacy.Language):
         "contains_lorem ipsum": False,
     }
     d = nlp.add_pipe(
-        "quality",
+        "textdescriptives.quality",
         config={
             "symbols": ["."],
             "quality_thresholds": quality_thresholds,
@@ -251,6 +251,6 @@ def test_quality_component_with_config(nlp: spacy.Language):
 )
 def test_passed_quality_check(text: str, passed: bool, nlp: spacy.Language):
     """Test the passed_quality_check attribute."""
-    nlp.add_pipe("quality", config={"force": True})
+    nlp.add_pipe("textdescriptives.quality", config={"force": True})
     doc = nlp(text)
     assert doc._.passed_quality_check == passed
diff --git a/textdescriptives/tests/test_readability.py b/textdescriptives/tests/test_readability.py
@@ -1,23 +1,22 @@
+import ftfy
+import numpy as np
 import pytest
+from spacy.lang.en import English
 
 from textdescriptives.components import Readability
-from .books import *
 
-from spacy.lang.en import English
-
-import numpy as np
-import ftfy
+from .books import *
 
 
 @pytest.fixture(scope="function")
 def nlp():
     nlp = English()
-    nlp.add_pipe("readability")
+    nlp.add_pipe("textdescriptives.readability")
     return nlp
 
 
 def test_readability_integration(nlp):
-    assert "readability" == nlp.pipe_names[-1]
+    assert "textdescriptives.readability" == nlp.pipe_names[-1]
 
 
 def test_readability(nlp):