
Commit: Sync with main repo

MartinBernstorff committed Sep 8, 2021
2 parents ea54a46 + 66d2360 commit c639bb1
Showing 10 changed files with 47 additions and 27 deletions.
25 changes: 13 additions & 12 deletions README.md
@@ -20,7 +20,7 @@ A Python library for calculating a large variety of statistics from text(s) using


# 👩‍💻 Usage

Import the library and add the component to your pipeline using the string name of the "textdescriptives" component factory:
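As a rough illustration of that setup (the repository's own snippet is collapsed in this view, so the model name and example text below are placeholders):

```py
# Illustrative sketch: any spaCy pipeline works; en_core_web_sm is just an example model
import spacy
import textdescriptives as td  # importing registers the "textdescriptives" factory

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textdescriptives")
doc = nlp("The world is changed. I feel it in the water.")

print(doc._.readability)  # readability metrics for the Doc
```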

@@ -41,11 +41,11 @@
TextDescriptives includes convenience functions for extracting metrics to a Pandas DataFrame or a dictionary:

```py
td.extract_df(doc)
# td.extract_dict(doc)
```

| | text | token_length_mean | token_length_median | token_length_std | sentence_length_mean | sentence_length_median | sentence_length_std | syllables_per_token_mean | syllables_per_token_median | syllables_per_token_std | n_tokens | n_unique_tokens | proportion_unique_tokens | n_characters | n_sentences | flesch_reading_ease | flesch_kincaid_grade | smog | gunning_fog | automated_readability_index | coleman_liau_index | lix | rix | dependency_distance_mean | dependency_distance_std | prop_adjacent_dependency_relation_mean | prop_adjacent_dependency_relation_std |
|---:|:------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------:|----------------------:|-------------------:|-----------------------:|-------------------------:|----------------------:|---------------------------:|-----------------------------:|--------------------------:|-----------:|------------------:|---------------------------:|---------------:|--------------:|----------------------:|-----------------------:|--------:|--------------:|------------------------------:|---------------------:|--------:|------:|---------------------------:|--------------------------:|-----------------------------------------:|----------------------------------------:|
| 0 | The world (...) | 3.28571 | 3 | 1.54127 | 7 | 6 | 3.09839 | 1.08571 | 1 | 0.368117 | 35 | 23 | 0.657143 | 121 | 5 | 107.879 | -0.0485714 | 5.68392 | 3.94286 | -2.45429 | -0.708571 | 12.7143 | 0.4 | 1.69524 | 0.422282 | 0.44381 | 0.0863679 |

| | text | token_length_mean | token_length_median | token_length_std | sentence_length_mean | sentence_length_median | sentence_length_std | syllables_per_token_mean | syllables_per_token_median | syllables_per_token_std | n_tokens | n_unique_tokens | proportion_unique_tokens | n_characters | n_sentences | flesch_reading_ease | flesch_kincaid_grade | smog | gunning_fog | automated_readability_index | coleman_liau_index | lix | rix | dependency_distance_mean | dependency_distance_std | prop_adjacent_dependency_relation_mean | prop_adjacent_dependency_relation_std | pos_prop_DT | pos_prop_NN | pos_prop_VBZ | pos_prop_VBN | pos_prop_. | pos_prop_PRP | pos_prop_VBP | pos_prop_IN | pos_prop_RB | pos_prop_VBD | pos_prop_, | pos_prop_WP |
|---:|:------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------:|----------------------:|-------------------:|-----------------------:|-------------------------:|----------------------:|---------------------------:|-----------------------------:|--------------------------:|-----------:|------------------:|---------------------------:|---------------:|--------------:|----------------------:|-----------------------:|--------:|--------------:|------------------------------:|---------------------:|--------:|------:|---------------------------:|--------------------------:|-----------------------------------------:|----------------------------------------:|--------------:|--------------:|---------------:|---------------:|-------------:|---------------:|---------------:|--------------:|--------------:|---------------:|-------------:|--------------:|
| 0 | The world (...) | 3.28571 | 3 | 1.54127 | 7 | 6 | 3.09839 | 1.08571 | 1 | 0.368117 | 35 | 23 | 0.657143 | 121 | 5 | 107.879 | -0.0485714 | 5.68392 | 3.94286 | -2.45429 | -0.708571 | 12.7143 | 0.4 | 1.69524 | 0.422282 | 0.44381 | 0.0863679 | 0.097561 | 0.121951 | 0.0487805 | 0.0487805 | 0.121951 | 0.170732 | 0.121951 | 0.121951 | 0.0731707 | 0.0243902 | 0.0243902 | 0.0243902 |

Set which group(s) of metrics you want to extract using the `metrics` parameter (one or more of `readability`, `dependency_distance`, `descriptive_stats`, defaults to `all`)
Set which group(s) of metrics you want to extract using the `metrics` parameter (one or more of `readability`, `dependency_distance`, `descriptive_stats`, `pos_stats`, defaults to `all`)

If `extract_df` is called on an object created using `nlp.pipe`, it will format the output with one row for each document and a column for each metric. Similarly, `extract_dict` will have a key for each metric, with the values being a list of metrics (one per doc).
@@ -62,7 +62,7 @@
```py
td.extract_df(docs, metrics="dependency_distance")
```
The `text` column can be excluded by setting `include_text` to `False`.
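For instance, a rough sketch of extracting a DataFrame from several documents at once (the texts, model, and metric choice are illustrative):

```py
# Sketch: one row per document, one column per metric; the raw text column is dropped
import spacy
import textdescriptives as td

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textdescriptives")

docs = nlp.pipe(["The first short document.", "The second, slightly longer document."])
df = td.extract_df(docs, metrics="readability", include_text=False)
print(df.shape)  # one row per document
```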

### Using specific components
The specific components (`descriptive_stats`, `readability`, and `dependency_distance`) can be loaded individually. This can be helpful if you're only interested in e.g. readability metrics or descriptive statistics and don't want to run the dependency parser.
The specific components (`descriptive_stats`, `readability`, `dependency_distance` and `pos_stats`) can be loaded individually. This can be helpful if you're only interested in e.g. readability metrics or descriptive statistics and don't want to run the dependency parser or part-of-speech tagger.

```py
nlp = spacy.blank("da")
```
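A rough sketch of using a single component on a blank pipeline (component and attribute names follow this README; the text is a placeholder):

```py
# Sketch: descriptive_stats on its own, so no parser or tagger has to run
import spacy
import textdescriptives as td

nlp = spacy.blank("en")
nlp.add_pipe("descriptive_stats")
doc = nlp("A short example text for descriptive statistics.")
print(doc._.counts)  # n_tokens, n_unique_tokens, proportion_unique_tokens, n_characters
```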
@@ -88,12 +88,13 @@
The table below shows the metrics included in TextDescriptives and their attributes.

| `Doc._.sentence_length` | `descriptive_stats` | Dict containing mean, median, and std of sentence length. |
| `Doc._.syllables` | `descriptive_stats` | Dict containing mean, median, and std of number of syllables per token. |
| `Doc._.counts` | `descriptive_stats` | Dict containing the number of tokens, number of unique tokens, proportion unique tokens, and number of characters in the Doc.|
| `Doc._.readability` | `readability` | Dict containing Flesch Reading Ease, Flesch-Kincaid Grade, SMOG, Gunning-Fog, Automated Readability Index, Coleman-Liau Index, LIX, and RIX readability metrics for the Doc. |
| `Doc._.dependency_distance` | `dependency_distance` | Dict containing the mean and standard deviation of the dependency distance and proportion adjacent dependency relations in the Doc.|
| `Span._.token_length` | `descriptive_stats` | Dict containing mean, median, and std of token length in the span. |
| `Span._.counts` | `descriptive_stats` | Dict containing the number of tokens, number of unique tokens, proportion unique tokens, and number of characters in the span. |
| `Span._.dependency_distance` | `dependency_distance` | Dict containing the mean dependency distance and proportion adjacent dependency relations in the Doc.|
| `Token._.dependency_distance` | `dependency_distance` | Dict containing the dependency distance and whether the head word is adjacent for a Token.|
| `Doc._.pos_proportions` | `pos_stats` | Dict of `{pos_prop_POSTAG: proportion of all tokens tagged with POSTAG}`. Does not create a key if no tokens in the document fit the POSTAG. |
| `Doc._.readability` | `readability` | Dict containing Flesch Reading Ease, Flesch-Kincaid Grade, SMOG, Gunning-Fog, Automated Readability Index, Coleman-Liau Index, LIX, and RIX readability metrics for the Doc. |
| `Doc._.dependency_distance` | `dependency_distance` | Dict containing the mean and standard deviation of the dependency distance and proportion adjacent dependency relations in the Doc. |
| `Span._.token_length` | `descriptive_stats` | Dict containing mean, median, and std of token length in the span. |
| `Span._.counts` | `descriptive_stats` | Dict containing the number of tokens, number of unique tokens, proportion unique tokens, and number of characters in the span. |
| `Span._.dependency_distance` | `dependency_distance` | Dict containing the mean dependency distance and proportion adjacent dependency relations in the Doc. |
| `Token._.dependency_distance` | `dependency_distance` | Dict containing the dependency distance and whether the head word is adjacent for a Token. |
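
The Span- and Token-level attributes from the table can be read the same way; a rough sketch (model and text are placeholders):

```py
# Sketch: Span- and Token-level extensions listed in the table above
import spacy
import textdescriptives as td

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textdescriptives")
doc = nlp("The world is changed. I feel it in the water.")

span = doc[0:4]
print(span._.counts)                 # token counts for the span
print(span._.dependency_distance)    # mean dependency distance and adjacency proportion
print(doc[1]._.dependency_distance)  # distance and head-adjacency for a single token
```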



1 change: 1 addition & 0 deletions docs/index.rst
@@ -37,6 +37,7 @@ The documentation is organized in two parts:
descriptivestats
readability
dependencydistance
posstats

.. add more references here
18 changes: 18 additions & 0 deletions docs/posstats.rst
@@ -0,0 +1,18 @@
POS Stats
----------------------

The *pos_stats* component adds one attribute (so far) to Doc:

* ._.pos_proportions (:code:`Doc`)
  * Dict of `{pos_prop_POSTAG: proportion of all tokens tagged with POSTAG}`. Does not create a key if no tokens in the document fit the POSTAG.

textdescriptives.components.pos_stats
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. automodule:: textdescriptives.components.pos_stats
   :members:
   :undoc-members:
   :show-inheritance:

.. :exclude-members: function
.. for functions you wish to exclude
4 changes: 2 additions & 2 deletions requirements.txt
@@ -1,7 +1,7 @@
spacy>=3.1.0
numpy>=1.20.0
numpy>=1.15.0
pandas>=1.0.0
pyphen>=0.11.0
ftfy>=6.0.3
https://huggingface.co/spacy/en_core_web_sm/resolve/main/en_core_web_sm-any-py3-none-any.whl
en_core_web_sm==3.1.0
en_core_web_sm==3.1.0
2 changes: 1 addition & 1 deletion textdescriptives/about.py
@@ -1,3 +1,3 @@
__title__ = "textdescriptives"
__version__ = "1.0.3" # the ONLY source of version ID
__version__ = "1.0.4" # the ONLY source of version ID
__download_url__ = "https://github.com/HLasse/textdescriptives"
10 changes: 4 additions & 6 deletions textdescriptives/components/pos_stats.py
@@ -9,11 +9,11 @@
@Language.factory("pos_stats")
def create_pos_stats_component(nlp: Language, name: str):
    """Allows PosStats to be added to a spaCy pipe using nlp.add_pipe("pos_stats").
    If the pipe does not contain a tagger, is is silently added."""
    If the pipe does not contain a tagger, it is silently added."""

    tagger = set(["tagger"])
    if not tagger.intersection(set(nlp.pipe_names)):
        nlp.add_pipe("tagger") # add a tagger if not one in pipe
        raise ValueError("The pipeline does not contain a tagger. Please load a spaCy model which includes a 'tagger' component.")
    return POSStatistics(nlp)

class POSStatistics:
@@ -36,12 +36,10 @@ def __call__(self, doc):
    def pos_proportions(self, input: Union[Doc, Span]) -> dict:
        """
        Returns:
            Dict with proportions of part-of-speech tag in doc.
            Dict containing {pos_prop_POSTAG: proportion of all tokens tagged with POSTAG}. Does not create a key if no tokens in the document fit the POSTAG.
        """
        pos_counts = Counter()

        pos_counts.update([token.tag_ for token in input])

        pos_proportions = {tag : pos_counts[tag] / sum(pos_counts.values()) for tag in pos_counts}
        pos_proportions = {"pos_prop_" + tag : pos_counts[tag] / sum(pos_counts.values()) for tag in pos_counts}

        return pos_proportions
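
In practice, the changed factory means the pipeline must already contain a tagger. A rough sketch of the behaviour after this commit (model name is a placeholder):

```py
# Sketch: adding "pos_stats" without a tagger now raises instead of silently adding one
import spacy
import textdescriptives  # registers the "pos_stats" factory

blank = spacy.blank("en")
try:
    blank.add_pipe("pos_stats")  # no tagger in the pipeline -> ValueError
except ValueError as err:
    print(err)

nlp = spacy.load("en_core_web_sm")  # ships with a tagger
nlp.add_pipe("pos_stats")
doc = nlp("Here is a short sentence.")
print(doc._.pos_proportions)  # keys are now prefixed with "pos_prop_"
```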
2 changes: 1 addition & 1 deletion textdescriptives/dataframe_extract.py
@@ -101,7 +101,7 @@ def __extract_text(self, doc: Doc) -> Union[pd.DataFrame, str]:
return {"text" : doc.text}
return pd.DataFrame([doc.text], columns=["text"])

def __pos_stats(self, doc: Doc) -> pd.DataFrame:
def __pos_proportions(self, doc: Doc) -> pd.DataFrame:
if self.as_dict:
return doc._.pos_proportions
return pd.DataFrame.from_records([doc._.pos_proportions])
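
For context, `pd.DataFrame.from_records` wraps the proportions dict in a one-row frame; a rough sketch with made-up values:

```py
import pandas as pd

pos = {"pos_prop_DT": 0.10, "pos_prop_NN": 0.12}  # illustrative values only
df = pd.DataFrame.from_records([pos])             # one row, one column per tag
print(df)
```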
2 changes: 2 additions & 0 deletions textdescriptives/load_components.py
@@ -3,6 +3,7 @@
    Readability,
    DependencyDistance,
    DescriptiveStatistics,
    POSStatistics
)

from spacy.language import Language
@@ -15,6 +16,7 @@ def create_textdescriptives_component(nlp: Language, name: str):
        "descriptive_stats",
        "readability",
        "dependency_distance",
        "pos_stats"
    ]:
        nlp.add_pipe(component, last=True)
    return TextDescriptives(nlp)
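
With this change, adding the `textdescriptives` component now pulls in all four sub-components. A rough sketch (model name is a placeholder):

```py
# Sketch: the factory above adds each sub-component to the pipeline
import spacy
import textdescriptives as td

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textdescriptives")
print(nlp.pipe_names)  # now includes descriptive_stats, readability, dependency_distance and pos_stats
```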
4 changes: 2 additions & 2 deletions textdescriptives/tests/test_load_components.py
@@ -1,11 +1,11 @@
from spacy.lang.en import English
import spacy
import pytest
from textdescriptives import TextDescriptives


@pytest.fixture(scope="function")
def nlp():
    nlp = English()
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("textdescriptives")
    return nlp

6 changes: 3 additions & 3 deletions textdescriptives/tests/test_pos_stats.py
@@ -4,7 +4,6 @@
from textdescriptives.components import POSStatistics

@pytest.fixture(scope="function")

def nlp():
nlp = spacy.load("en_core_web_sm", disable=('ner', 'textcat'))
nlp.add_pipe("pos_stats")
@@ -19,7 +18,7 @@ def test_pos_proportions_doc(nlp):
"Here is the first sentence. It was pretty short. Let's make another one that's slightly longer and more complex."
)

assert doc._.pos_proportions == {'RB': 0.125, 'VBZ': 0.08333333333333333, 'DT': 0.08333333333333333, 'JJ': 0.125, 'NN': 0.08333333333333333, '.': 0.125, 'PRP': 0.08333333333333333, 'VBD': 0.041666666666666664, 'VB': 0.08333333333333333, 'WDT': 0.041666666666666664, 'JJR': 0.041666666666666664, 'CC': 0.041666666666666664, 'RBR': 0.041666666666666664}
assert doc._.pos_proportions == {'pos_prop_RB': 0.125, 'pos_prop_VBZ': 0.08333333333333333, 'pos_prop_DT': 0.08333333333333333, 'pos_prop_JJ': 0.125, 'pos_prop_NN': 0.08333333333333333, 'pos_prop_.': 0.125, 'pos_prop_PRP': 0.08333333333333333, 'pos_prop_VBD': 0.041666666666666664, 'pos_prop_VB': 0.08333333333333333, 'pos_prop_WDT': 0.041666666666666664, 'pos_prop_JJR': 0.041666666666666664, 'pos_prop_CC': 0.041666666666666664, 'pos_prop_RBR': 0.041666666666666664}

def test_pos_proportions_span(nlp):
doc = nlp(
@@ -28,4 +27,5 @@

span = doc[0:]

assert span._.pos_proportions == {'RB': 0.125, 'VBZ': 0.08333333333333333, 'DT': 0.08333333333333333, 'JJ': 0.125, 'NN': 0.08333333333333333, '.': 0.125, 'PRP': 0.08333333333333333, 'VBD': 0.041666666666666664, 'VB': 0.08333333333333333, 'WDT': 0.041666666666666664, 'JJR': 0.041666666666666664, 'CC': 0.041666666666666664, 'RBR': 0.041666666666666664}
assert doc._.pos_proportions == {'pos_prop_RB': 0.125, 'pos_prop_VBZ': 0.08333333333333333, 'pos_prop_DT': 0.08333333333333333, 'pos_prop_JJ': 0.125, 'pos_prop_NN': 0.08333333333333333, 'pos_prop_.': 0.125, 'pos_prop_PRP': 0.08333333333333333, 'pos_prop_VBD': 0.041666666666666664, 'pos_prop_VB': 0.08333333333333333, 'pos_prop_WDT': 0.041666666666666664, 'pos_prop_JJR': 0.041666666666666664, 'pos_prop_CC': 0.041666666666666664, 'pos_prop_RBR': 0.041666666666666664}
