Merge branch 'dev_ludvig'
LudvigOlsen committed Mar 22, 2020
2 parents cd1880d + eb95c10 commit 872ac88
Showing 5 changed files with 53 additions and 45 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -11,5 +11,7 @@ stanfordnlp_resources/*
stanfordnlp_resources
snlp_resources/*
snlp_resources
+ stanza_resources/*
+ stanza_resources

test_textdescriptives.py
14 changes: 7 additions & 7 deletions README.md
@@ -15,9 +15,9 @@ import textdescriptives
en_test = ['The world is changed. I feel it in the water. I feel it in the earth. I smell it in the air. Much that once was is lost, for none now live who remember it.',
'He felt that his whole life was some kind of dream and he sometimes wondered whose it was and whether they were enjoying it.']

- snlp_path = path/to/snlp_resources
+ stanza_path = path/to/stanza_resources

- textdescriptives.all_metrics(en_test, lang = 'en', snlp_path = snlp_path)
+ textdescriptives.all_metrics(en_test, lang = 'en', stanza_path = stanza_path)
```
| | Text | avg_word_length | median_word_length | std_word_length | avg_sentence_length | median_sentence_length | std_sentence_length | avg_syl_per_word | median_syl_per_word | std_syl_per_word | type_token_ratio | lix | rix | n_types | n_sentences | n_tokens | n_chars | gunning_fog | smog | flesch_reading_ease | flesch_kincaid_grade | automated_readability_index | coleman_liau_index | Germanic | Latinate | Latinate/Germanic | mean_dependency_distance | std_dependency_distance | mean_prop_adjacent_dependency_relation | std_prop_adjacent_dependency_relation |
|---:|:------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|---------------------:|------------------:|----------------------:|-------------------------:|----------------------:|-------------------:|----------------------:|-------------------:|-------------------:|--------:|------:|----------:|--------------:|-----------:|----------:|--------------:|--------:|----------------------:|-----------------------:|------------------------------:|---------------------:|-----------:|-----------:|--------------------:|---------------------------:|--------------------------:|-----------------------------------------:|----------------------------------------:|
@@ -30,14 +30,14 @@ To calculate one category at a time:
textdescriptives.basic_stats(texts, lang = 'en', metrics = 'all')
textdescriptives.readability(texts, lang = 'en')
textdescriptives.etymology(texts, lang = 'en')
- textdescriptives.dependency_distance(texts, lang = 'en', snlp_path = None)
+ textdescriptives.dependency_distance(texts, lang = 'en', stanza_path = None)
```
Textdescriptives works for most languages - simply change the language code:
```py
da_test = pd.Series(['Da jeg var atten, tog jeg patent på ild. Det skulle senere vise sig at blive en meget indbringende forretning',
"Spis skovsneglen, Mulle. Du vil jo gerne være med i hulen, ikk'?"])

- textdescriptives.all_metrics(da_test, lang = 'da', snlp_path=snlp_path)
+ textdescriptives.all_metrics(da_test, lang = 'da', stanza_path=stanza_path)
```

If you only want a subset of the basic statistics
@@ -57,16 +57,16 @@ The readability measures are largely derived from the [textstat](https://github.
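To make these measures concrete, here is a minimal sketch (not the library's implementation) of one of them, the Flesch reading ease score, which needs only word, sentence, and syllable counts:

```py
# Illustrative sketch only; the library computes this internally.
# Higher Flesch reading ease scores indicate easier text.
def flesch_reading_ease(n_words, n_sentences, n_syllables):
    return 206.835 - 1.015 * (n_words / n_sentences) - 84.6 * (n_syllables / n_words)

print(flesch_reading_ease(100, 5, 140))  # 206.835 - 20.3 - 118.44 = 68.095
```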
The etymology measures are calculated using [macroetym](https://github.com/JonathanReeve/macro-etym), only slightly rewritten so it can be called from a script. They are included because, in English, a greater frequency of words with a Latinate origin tends to indicate a more formal language register.
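As a toy illustration of the idea, with hand-labeled (hypothetical) origins rather than macroetym's actual lookup:

```py
# Hypothetical toy lexicon for illustration; real origins come from macroetym.
origins = {"feel": "Germanic", "water": "Germanic", "earth": "Germanic",
           "remember": "Latinate", "air": "Latinate"}
tokens = ["feel", "water", "earth", "remember", "air"]
latinate = sum(origins[t] == "Latinate" for t in tokens) / len(tokens)  # 0.4
germanic = sum(origins[t] == "Germanic" for t in tokens) / len(tokens)  # 0.6
print(latinate / germanic)  # Latinate/Germanic ratio, here roughly 0.67
```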

### Dependency Distance
- Mean dependency distance can be used as a way of measuring the average syntactic complexity of a text. Requres the `snlp` library.
- The dependency distance function requires stanfordnlp, and their language models. If you have already downloaded these models, the path to the folder can be specified in the snlp_path paramter. Otherwise, the models will be downloaded to your working directory + /snlp_resources.
+ Mean dependency distance can be used as a way of measuring the average syntactic complexity of a text. Requires the `stanza` library.
+ The dependency distance function requires stanza, and their language models. If you have already downloaded these models, the path to the folder can be specified in the stanza_path parameter. Otherwise, the models will be downloaded to your working directory + /stanza_resources.
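For intuition, a minimal sketch of the computation on a single hand-annotated sentence (the parse below is a hypothetical example, not stanza output):

```py
# Hypothetical UD parse of "I feel it in the water": (token_id, head_id, relation).
parsed = [(1, 2, "nsubj"), (2, 0, "root"), (3, 2, "obj"),
          (4, 6, "case"), (5, 6, "det"), (6, 2, "obl")]

# Each non-root token's dependency distance is |head position - token position|.
distances = [abs(head - idx) for idx, head, rel in parsed if rel != "root"]
mdd = sum(distances) / len(distances)                            # (1+1+2+1+4)/5 = 1.8
prop_adjacent = sum(d == 1 for d in distances) / len(distances)  # 3/5 = 0.6
```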


## Dependencies
Depending on which measures you want to calculate, the dependencies differ.
* Basic and readability: numpy, pandas, pyphen, pycountry
* Etymology: nltk and the following models
`python3 -c "import nltk; nltk.download('punkt'); nltk.download('stopwords'); nltk.download('averaged_perceptron_tagger'); nltk.download('wordnet')"`
- * Dependency distance: snlp
+ * Dependency distance: stanza (Peng Qi, Yuhao Zhang, Yuhui Zhang, Jason Bolton and Christopher D. Manning. 2020. Stanza: A Python Natural Language Processing Toolkit for Many Human Languages. arXiv preprint arXiv:2003.07082.)


## Metrics
15 changes: 8 additions & 7 deletions tests.py
@@ -10,24 +10,25 @@
en_test = ['The world is changed. I feel it in the water. I feel it in the earth. I smell it in the air. Much that once was is lost, for none now live who remember it.',
'He felt that his whole life was some kind of dream and he sometimes wondered whose it was and whether they were enjoying it.']

+ # Dependency distance uses stanza, and thus requires stanza language resources
+ # If you already have them downloaded, you can specify the folder path
+ # Otherwise, they will be downloaded
+ stanza_path = '/path/to/stanza_resources'

# If you want to calculate all measures at once
- textdescriptives.all_metrics(en_test, lang = 'en', snlp_path = snlp_path)
+ textdescriptives.all_metrics(en_test, lang = 'en', stanza_path = stanza_path)

# Otherwise, the following calculates one category at a time
textdescriptives.basic_stats(en_test, lang = 'en')
textdescriptives.readability(en_test, lang = 'en')
textdescriptives.etymology(en_test, lang = 'en')
- # Dependency distance uses stanfordnlp, and thus requires snlp language resources
- # If you already have them downloaded, you can specify the folder path
- # Otherwise, they will be downloaded
- snlp_path = '/path/to/stanfordnlp_resources'
- textdescriptives.dependency_distance(en_test, lang = 'en', snlp_path = snlp_path)
+ textdescriptives.dependency_distance(en_test, lang = 'en', stanza_path = stanza_path)

# Textdescriptives works for most languages. Simply change the language code
da_test = pd.Series(['Da jeg var atten, tog jeg patent på ild. Det skulle senere vise sig at blive en meget indbringende forretning',
"Spis skovsneglen, Mulle. Du vil jo gerne være med i hulen, ikk'?"])

- textdescriptives.all_metrics(da_test, 'da', snlp_path=snlp_path)
+ textdescriptives.all_metrics(da_test, 'da', stanza_path=stanza_path)


# If you want to calculate a subset of the basic statistics, they can be specified in the measures parameter
49 changes: 27 additions & 22 deletions textdescriptives/dependency_distance.py
@@ -1,16 +1,16 @@
import os
- import stanfordnlp
+ import stanza
import numpy as np
import pandas as pd

class DepDistance():
- def __init__(self, text, lang, snlp_path):
+ def __init__(self, text, lang, stanza_path = None):
self.text = text
self.lang = lang
- if snlp_path is None:
- self.snlp_path = os.getcwd() + '/snlp_resources'
+ if stanza_path is None:
+ self.stanza_path = os.getcwd() + '/stanza_resources'
else:
- self.snlp_path = snlp_path
+ self.stanza_path = stanza_path

self.__dep_dist()
self.__describe_distances()
@@ -21,19 +21,24 @@ def get_sentence_distances(self):
def get_text_distances(self):
return self.__text_distances

- def __dl_missing_langs_snlp(self):
+ def __dl_missing_langs_stanza(self):
"""
- Downloads any missing languages from Stanford NLP resources
+ downloads any missing languages from stanza
+ Examples:
+ >>> dl_missing_langs(langs = "da", stanza_path = os.path.join(str(Path.home()), 'stanza_resources'))
"""
- import stanfordnlp

- if not os.path.exists(self.snlp_path):
- os.makedirs(self.snlp_path)
+ if not os.path.exists(self.stanza_path):
+ os.makedirs(self.stanza_path)

- dl_langs = [folder[:2] for folder in os.listdir(self.snlp_path)]
+ dl_langs = [folder[:2] for folder in os.listdir(self.stanza_path)]

if self.lang not in dl_langs:
- stanfordnlp.download(self.lang, resource_dir=self.snlp_path)
+ try:
+ stanza.download(self.lang, dir = self.stanza_path)
+ except ValueError:
+ raise ValueError(f"Language: '{self.lang}' does not exist in stanza. Try specifying another language")

def __dep_dist(self):
"""
@@ -42,28 +47,28 @@ def __dep_dist(self):
"""
# Check if stanza language resources are installed, otherwise download them
try:
- self.__dl_missing_langs_snlp()
+ self.__dl_missing_langs_stanza()
# If the specified language is not in stanza, raise an error and stop the function
except ValueError:
- ValueError(f"Language '{self.lang}' does not exist in stanford NLP. Try specifying another language")
+ raise ValueError(f"Language '{self.lang}' does not exist in stanza. Try specifying another language")

if 's_nlp' not in globals():
global s_nlp
- s_nlp = stanfordnlp.Pipeline(lang = self.lang, models_dir = self.snlp_path,
- processors="tokenize,pos,depparse")
+ s_nlp = stanza.Pipeline(lang = self.lang, dir = self.stanza_path,
+ processors = "tokenize,lemma,pos,depparse")

- def score_token(dep_relation, governor, idx):
+ def score_token(dep_relation, head, idx):
dep_dist = 0
adj_rel = 0
if dep_relation != 'root':
- dep_dist = abs(governor - int(idx))
+ dep_dist = abs(head - int(idx))
if dep_dist == 1:
adj_rel = 1
return pd.Series([dep_dist, adj_rel])

def score_sentence(df):
res = df.apply(
- lambda r: score_token(r["dep_rel"], r["governor"], r["token_id"]),
+ lambda r: score_token(r["dep_rel"], r["head"], r["token_id"]),
axis = 1)
token_dep_dists = res[0]
token_adj_rels = res[1]
@@ -73,9 +78,9 @@ def score_sentence(df):

def score_text(txt, txt_id):
doc = s_nlp(txt)
- parsed = [(sent_n, word.index, word.governor, word.dependency_relation) \
+ parsed = [(sent_n, word.id, word.head, word.deprel) \
for sent_n, sent in enumerate(doc.sentences) for word in sent.words]
- parsed = pd.DataFrame(parsed, columns = ["sent_id", "token_id", "governor", "dep_rel"])
+ parsed = pd.DataFrame(parsed, columns = ["sent_id", "token_id", "head", "dep_rel"])
res = parsed.groupby("sent_id").apply(score_sentence).reset_index()
res.columns = ["sent_id", "dep_dist", "prop_adjacent"]
res["text_id"] = txt_id
@@ -118,4 +123,4 @@ def summarizer(df):

#text = ["Bare en enkelt sætning for lige at teste"]

- #dep = DepDistance(texts, 'da', snlp_path)
+ #dep = DepDistance(texts, 'da', stanza_path)
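For orientation, a minimal usage sketch of the class as changed above; when stanza_path is None, the first run downloads stanza models into ./stanza_resources:

```py
from textdescriptives.dependency_distance import DepDistance

# Sketch based on the signature above, not a documented API example.
dep = DepDistance(["The world is changed."], "en", stanza_path=None)
print(dep.get_sentence_distances())  # per-sentence dependency distances
print(dep.get_text_distances())      # per-text summary statistics
```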
18 changes: 9 additions & 9 deletions textdescriptives/textdescriptives.py
@@ -5,7 +5,7 @@
import pandas as pd

class TextDescriptives():
- def __init__(self, texts, lang = 'da', category = 'all', measures = 'all', snlp_path = None):
+ def __init__(self, texts, lang = 'da', category = 'all', measures = 'all', stanza_path = None):
"""
texts: str/list/pd.Series containing text
lang: str with the language code
@@ -37,7 +37,7 @@ def __init__(self, texts, lang = 'da', category = 'all', measures = 'all', snlp_
self.df = pd.DataFrame(texts, columns = ['Text'])
# self.token_dfs = token_dfs
self.lang = lang
- self.snlp_path = snlp_path
+ self.stanza_path = stanza_path

# Category check
valid_categories = set(['all', 'basic', 'readability', 'entropy', 'sentiment', 'etymology', 'dep_distance'])
@@ -109,7 +109,7 @@ def dependency_distance(self):
MDD is calculated at the sentence level, i.e. MDD is the mean of the average dependency distance per sentence.
The mean and standard deviation of the proportion of adjacent dependency relations per sentence are further calculated.
"""
- dep = DepDistance(self.df['Text'], self.lang, self.snlp_path)
+ dep = DepDistance(self.df['Text'], self.lang, self.stanza_path)
self.df = pd.concat([self.df, dep.get_text_distances()], axis = 1)

def entropy(self):
@@ -122,14 +122,14 @@ def get_df(self):
return self.df


- def all_metrics(texts, lang = 'en', snlp_path = None):
+ def all_metrics(texts, lang = 'en', stanza_path = None):
"""
Calculates all implemented statistical metrics
text: str/list/pd.Series of strings
lang: two character language code, e.g. 'en', 'da'
- snlp_path: string, path to stanfordnlp_resources
+ stanza_path: string, path to stanza resources
"""
- return TextDescriptives(texts, lang = lang, category = 'all', snlp_path = snlp_path).df
+ return TextDescriptives(texts, lang = lang, category = 'all', stanza_path = stanza_path).df

def basic_stats(texts, lang = 'en', metrics = 'all'):
"""
@@ -157,11 +157,11 @@ def etymology(texts, lang = 'en'):
"""
return TextDescriptives(texts, lang = lang, category = 'etymology').df

- def dependency_distance(texts, lang = 'en', snlp_path = None):
+ def dependency_distance(texts, lang = 'en', stanza_path = None):
"""
Calculates measures related to dependency distance
texts: str/list/pd.Series of strings
lang: string, two character language code
- snlp_path: string, path to stanfordnlp_resources
+ stanza_path: string, path to stanza resources
"""
- return TextDescriptives(texts, lang = lang, category = 'dep_distance', snlp_path = snlp_path).df
+ return TextDescriptives(texts, lang = lang, category = 'dep_distance', stanza_path = stanza_path).df
