-
Notifications
You must be signed in to change notification settings - Fork 27
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #7 from HLasse/master
master to posstatistics
- Loading branch information
Showing
8 changed files
with
92 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
__title__ = "textdescriptives" | ||
__version__ = "1.0.2" # the ONLY source of version ID | ||
__version__ = "1.0.3" # the ONLY source of version ID | ||
__download_url__ = "https://github.com/HLasse/textdescriptives" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
from .readability import Readability | ||
from .dependency_distance import DependencyDistance | ||
from .descriptive_stats import DescriptiveStatistics | ||
from .pos_stats import POSStatistics |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
"""Calculation of statistics that require a pos-tagger in the pipeline""" | ||
|
||
from spacy.tokens import Doc, Span | ||
from spacy.language import Language | ||
from typing import Counter | ||
|
||
from .utils import filtered_tokens | ||
|
||
@Language.factory("pos_stats")
def create_pos_stats_component(nlp: Language, name: str):
    """Allow POSStatistics to be added to a spaCy pipe via nlp.add_pipe("pos_stats").

    If the pipe does not contain a tagger, it is silently added.

    Args:
        nlp: The pipeline the component is being added to.
        name: Name of the component instance (not used in this function).

    Returns:
        A new `POSStatistics` instance.
    """
    # A plain membership test replaces the former set-intersection check.
    if "tagger" not in nlp.pipe_names:
        nlp.add_pipe("tagger")  # add a tagger if not one in pipe
    return POSStatistics(nlp)
|
||
class POSStatistics:
    """spaCy v.3.0 component that adds a POS statistics attribute to `Doc` objects.

    On construction it registers the `pos_proportions` getter extension on
    `Doc`, so the proportions are computed when `doc._.pos_proportions` is
    accessed.

    NOTE(review): only the `Doc` extension is registered here; confirm
    whether a matching `Span` extension is intended as well.
    """

    def __init__(self, nlp: Language):
        """Register the `pos_proportions` extension on `Doc` if it is missing.

        Args:
            nlp: The pipeline this component belongs to (not used here).
        """
        if not Doc.has_extension("pos_proportions"):
            Doc.set_extension("pos_proportions", getter=self.pos_proportions)

    def __call__(self, doc):
        """Run the pipeline component; the doc is returned unchanged."""
        return doc

    def pos_proportions(self, doc: Doc) -> dict:
        """Compute the share of each part-of-speech tag among the tokens.

        Args:
            doc: Iterable of tokens exposing a `tag_` attribute.

        Returns:
            Dict mapping each tag to its count divided by the total number
            of tokens. An empty input yields an empty dict.
        """
        pos_counts = Counter(token.tag_ for token in doc)
        # Compute the denominator once instead of once per tag.
        total = sum(pos_counts.values())
        return {tag: count / total for tag, count in pos_counts.items()}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
import spacy | ||
from spacy.lang.en import English | ||
import pytest | ||
from textdescriptives.components import POSStatistics | ||
|
||
@pytest.fixture(scope="function")
def nlp():
    """Build an `en_core_web_sm` pipeline with `pos_stats` added, per test.

    The `ner` and `textcat` components are disabled when loading the model.
    """
    pipeline = spacy.load("en_core_web_sm", disable=('ner', 'textcat'))
    pipeline.add_pipe("pos_stats")
    return pipeline
|
||
def test_pos_integrations(nlp):
    """The `pos_stats` component must be the final entry in the pipeline."""
    last_pipe = nlp.pipe_names[-1]
    assert last_pipe == "pos_stats"
|
||
def test_pos_proportions(nlp):
    """Check the tag proportions reported for a short three-sentence text."""
    text = "Here is the first sentence. It was pretty short. Let's make another one that's slightly longer and more complex."
    # Expected share of each fine-grained tag among the tokens of `text`.
    expected = {
        'RB': 0.125,
        'VBZ': 0.08333333333333333,
        'DT': 0.08333333333333333,
        'JJ': 0.125,
        'NN': 0.08333333333333333,
        '.': 0.125,
        'PRP': 0.08333333333333333,
        'VBD': 0.041666666666666664,
        'VB': 0.08333333333333333,
        'WDT': 0.041666666666666664,
        'JJR': 0.041666666666666664,
        'CC': 0.041666666666666664,
        'RBR': 0.041666666666666664,
    }
    observed = nlp(text)._.pos_proportions
    assert observed == expected