Skip to content

Commit

Permalink
Merge pull request #7 from HLasse/master
Browse files Browse the repository at this point in the history
master to posstatistics
  • Loading branch information
HLasse authored Aug 31, 2021
2 parents a72ab59 + 555f091 commit 64f9f17
Show file tree
Hide file tree
Showing 8 changed files with 92 additions and 10 deletions.
2 changes: 1 addition & 1 deletion textdescriptives/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .load_components import TextDescriptives
from .components import DescriptiveStatistics, Readability, DependencyDistance
from .components import DescriptiveStatistics, Readability, DependencyDistance, POSStatistics
from .dataframe_extract import (
extract_df,
extract_dict,
Expand Down
2 changes: 1 addition & 1 deletion textdescriptives/about.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
__title__ = "textdescriptives"
__version__ = "1.0.2" # the ONLY source of version ID
__version__ = "1.0.3" # the ONLY source of version ID
__download_url__ = "https://github.com/HLasse/textdescriptives"
1 change: 1 addition & 0 deletions textdescriptives/components/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .readability import Readability
from .dependency_distance import DependencyDistance
from .descriptive_stats import DescriptiveStatistics
from .pos_stats import POSStatistics
9 changes: 8 additions & 1 deletion textdescriptives/components/descriptive_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,13 @@ def syllables(self, doc: Doc):
}

def counts(self, doc: Union[Doc, Span], ignore_whitespace: bool = True):
"""Returns:
Dict with keys:
n_tokens,
n_unique_tokens,
proportion_unique_tokens,
n_characters
"""
n_tokens = doc._._n_tokens
n_types = len(set([tok.lower_ for tok in doc._._filtered_tokens]))
if ignore_whitespace:
Expand All @@ -116,4 +123,4 @@ def counts(self, doc: Union[Doc, Span], ignore_whitespace: bool = True):
}
if isinstance(doc, Doc):
out["n_sentences"] = doc._._n_sentences
return out
return out
44 changes: 44 additions & 0 deletions textdescriptives/components/pos_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""Calculation of statistics that require a pos-tagger in the pipeline"""

from spacy.tokens import Doc, Span
from spacy.language import Language
from typing import Counter

from .utils import filtered_tokens

@Language.factory("pos_stats")
def create_pos_stats_component(nlp: Language, name: str):
    """Factory that lets POSStatistics be added with nlp.add_pipe("pos_stats").

    If the pipeline does not already contain a component named "tagger",
    one is silently added before the POS statistics component is created.

    Args:
        nlp: The spaCy pipeline the component is being added to.
        name: The name given to the component instance (unused here).

    Returns:
        A POSStatistics component.
    """
    if "tagger" not in nlp.pipe_names:
        # NOTE(review): a tagger added this way is presumably untrained until
        # the pipeline is initialised -- confirm this fallback yields usable tags.
        nlp.add_pipe("tagger")
    return POSStatistics(nlp)

class POSStatistics:
    """spaCy v3 pipeline component adding part-of-speech statistics.

    On construction a ``pos_proportions`` getter extension is registered on
    ``Doc`` (only if no extension of that name exists yet). The component
    itself does not modify the documents passing through it.
    """

    def __init__(self, nlp: Language):
        """Register the ``pos_proportions`` extension on ``Doc`` if missing.

        Args:
            nlp: The pipeline the component belongs to (not used here).
        """
        if not Doc.has_extension("pos_proportions"):
            Doc.set_extension("pos_proportions", getter=self.pos_proportions)

    def __call__(self, doc):
        """Run the pipeline component; the doc is returned unchanged."""
        return doc

    def pos_proportions(self, doc: Doc) -> dict:
        """Compute the share of each part-of-speech tag in ``doc``.

        The statistics are based on ``token.tag_`` (the fine-grained tag).

        Returns:
            Dict mapping each tag found in ``doc`` to its proportion of all
            tokens. An empty doc yields an empty dict.
        """
        pos_counts = Counter(token.tag_ for token in doc)
        # Compute the denominator once instead of once per tag.
        n_tags = sum(pos_counts.values())
        return {tag: count / n_tags for tag, count in pos_counts.items()}
19 changes: 14 additions & 5 deletions textdescriptives/dataframe_extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,15 @@ def __init__(
Args:
doc (Doc): a spaCy doc
metrics (Union[list[str], str], optional): Which metrics to extract.
One or more of ["descriptive_stats", "readability", "dependency_distance", "all"].
One or more of ["descriptive_stats", "readability", "dependency_distance", "pos_stats", "all"].
Defaults to "all".
include_text (bool, optional): Whether to add a column containing the text. Defaults to True.
"""
if not isinstance(doc, (Doc)):
raise TypeError(f"doc should be a spaCy Doc object, not {type(doc)}.")

valid_metrics = set(
["descriptive_stats", "readability", "dependency_distance", "all"]
["descriptive_stats", "readability", "dependency_distance", "pos_stats", "all"]
)
if isinstance(metrics, str):
metrics = [metrics]
Expand All @@ -40,7 +40,7 @@ def __init__(
)
if not set(metrics).issubset(valid_metrics):
raise ValueError(
f"'metrics' contained invalid metric.\nValid metrics are: ['all', 'descriptive_stats', 'readability', 'dependency_distance']"
f"'metrics' contained invalid metric.\nValid metrics are: ['all', 'descriptive_stats', 'readability', 'dependency_distance', 'pos_stats']"
)

self.include_text = include_text
Expand All @@ -58,13 +58,17 @@ def __init__(
extraction.append(self.__readability(doc))
if doc.has_extension("dependency_distance"):
extraction.append(self.__dependency_distance(doc))
if doc.has_extension("pos_proportions"):
    # The extractor method is named __pos_stats; __pos_proportions does not exist.
    extraction.append(self.__pos_stats(doc))
else:
if "descriptive_stats" in metrics:
extraction.append(self.__descriptive_stats(doc))
if "readability" in metrics:
extraction.append(self.__readability(doc))
if "dependency_distance" in metrics:
extraction.append(self.__dependency_distance(doc))
if "pos_stats" in metrics:
    # Call the extractor method that is actually defined (__pos_stats);
    # the previous name was a misspelling and raised AttributeError.
    extraction.append(self.__pos_stats(doc))

if self.as_dict:
self.out = reduce(lambda a, b: {**a, **b}, extraction)
Expand Down Expand Up @@ -96,6 +100,11 @@ def __extract_text(self, doc: Doc) -> Union[pd.DataFrame, str]:
if self.as_dict:
return {"text" : doc.text}
return pd.DataFrame([doc.text], columns=["text"])

def __pos_stats(self, doc: Doc) -> Union[pd.DataFrame, dict]:
    """Extract the POS-tag proportions of ``doc``.

    Args:
        doc: A spaCy doc exposing the ``pos_proportions`` extension.

    Returns:
        The ``doc._.pos_proportions`` value itself when ``self.as_dict`` is
        set, otherwise a one-row DataFrame built from it.
    """
    proportions = doc._.pos_proportions  # read the getter once
    if self.as_dict:
        return proportions
    return pd.DataFrame.from_records([proportions])



Expand All @@ -108,7 +117,7 @@ def extract_df(
Args:
doc (Doc): a spaCy doc or a generator of spaCy Docs
metrics (Union[list[str], str], optional): Which metrics to extract.
One or more of ["descriptive_stats", "readability", "dependency_distance", "all"].
One or more of ["descriptive_stats", "readability", "dependency_distance", "pos_stats", "all"].
Defaults to "all".
include_text (bool, optional): Whether to add a column containing the text. Defaults to True.
Expand All @@ -133,7 +142,7 @@ def extract_dict(
Args:
doc (Doc): a spaCy doc or a generator of spaCy Docs
metrics (Union[list[str], str], optional): Which metrics to extract.
One or more of ["descriptive_stats", "readability", "dependency_distance", "all"].
One or more of ["descriptive_stats", "readability", "dependency_distance", "pos_stats", "all"].
Defaults to "all".
include_text (bool, optional): Whether to add an entry containing the text. Defaults to True.
Expand Down
3 changes: 1 addition & 2 deletions textdescriptives/tests/test_descriptive_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,10 @@ def test_counts(nlp):
assert doc[0:6]._.counts["proportion_unique_tokens"] == 1.0
assert doc[0:6]._.counts["n_characters"] == 23


@pytest.mark.parametrize("text", ["", "#"])
def test_descriptive_edge(text, nlp):
    """Descriptive-stats extensions return truthy values for edge-case texts."""
    doc = nlp(text)
    assert doc._.token_length
    assert doc._.sentence_length
    assert doc._.syllables
    assert doc._.counts
22 changes: 22 additions & 0 deletions textdescriptives/tests/test_pos_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import spacy
from spacy.lang.en import English
import pytest
from textdescriptives.components import POSStatistics

@pytest.fixture(scope="function")
def nlp():
    """Build a fresh ``en_core_web_sm`` pipeline ending in ``pos_stats``.

    The 'ner' and 'textcat' components are disabled when loading the model.
    """
    pipeline = spacy.load("en_core_web_sm", disable=('ner', 'textcat'))
    pipeline.add_pipe("pos_stats")
    return pipeline

def test_pos_integrations(nlp):
    """The ``pos_stats`` component is the last component in the pipeline."""
    last_component = nlp.pipe_names[-1]
    assert last_component == "pos_stats"

def test_pos_proportions(nlp):
    """``pos_proportions`` returns the expected share of each tag."""
    doc = nlp(
        "Here is the first sentence. It was pretty short. Let's make another one that's slightly longer and more complex."
    )

    # Expected proportions expressed as tag counts out of 24 tokens.
    expected_counts = {
        'RB': 3,
        'VBZ': 2,
        'DT': 2,
        'JJ': 3,
        'NN': 2,
        '.': 3,
        'PRP': 2,
        'VBD': 1,
        'VB': 2,
        'WDT': 1,
        'JJR': 1,
        'CC': 1,
        'RBR': 1,
    }
    n_tokens = sum(expected_counts.values())
    expected = {tag: count / n_tokens for tag, count in expected_counts.items()}

    # Compare approximately: the values are computed floats, so exact
    # equality would be brittle.
    assert doc._.pos_proportions == pytest.approx(expected)

0 comments on commit 64f9f17

Please sign in to comment.