diff --git a/.gitignore b/.gitignore
index 576e3c96..e3aa36e6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,5 +11,7 @@
 stanfordnlp_resources/*
 stanfordnlp_resources
 snlp_resources/*
 snlp_resources
+stanza_resources/*
+stanza_resources
 test_textdescriptives.py
\ No newline at end of file
diff --git a/README.md b/README.md
index aaaf02a9..c25c2fa0 100644
--- a/README.md
+++ b/README.md
@@ -15,9 +15,9 @@ import textdescriptives
 en_test = ['The world is changed. I feel it in the water. I feel it in the earth. I smell it in the air. Much that once was is lost, for none now live who remember it.', 'He felt that his whole life was some kind of dream and he sometimes wondered whose it was and whether they were enjoying it.']
 
-snlp_path = path/to/snlp_resources
+stanza_path = 'path/to/stanza_resources'
 
-textdescriptives.all_metrics(en_test, lang = 'en', snlp_path = snlp_path)
+textdescriptives.all_metrics(en_test, lang = 'en', stanza_path = stanza_path)
 ```
 
 | | Text | avg_word_length | median_word_length | std_word_length | avg_sentence_length | median_sentence_length | std_sentence_length | avg_syl_per_word | median_syl_per_word | std_syl_per_word | type_token_ratio | lix | rix | n_types | n_sentences | n_tokens | n_chars | gunning_fog | smog | flesch_reading_ease | flesch_kincaid_grade | automated_readability_index | coleman_liau_index | Germanic | Latinate | Latinate/Germanic | mean_dependency_distance | std_dependency_distance | mean_prop_adjacent_dependency_relation | std_prop_adjacent_dependency_relation |
 |---:|:------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------:|---------------------:|------------------:|----------------------:|-------------------------:|----------------------:|-------------------:|----------------------:|-------------------:|-------------------:|--------:|------:|----------:|--------------:|-----------:|----------:|--------------:|--------:|----------------------:|-----------------------:|------------------------------:|---------------------:|-----------:|-----------:|--------------------:|---------------------------:|--------------------------:|-----------------------------------------:|----------------------------------------:|
@@ -30,14 +30,14 @@ To calculate one category at a time:
 textdescriptives.basic_stats(texts, lang = 'en', metrics = 'all')
 textdescriptives.readability(texts, lang = 'en')
 textdescriptives.etymology(texts, lang = 'en')
-textdescriptives.dependency_distance(texsts, lang = 'en', snlp_path = None)
+textdescriptives.dependency_distance(texts, lang = 'en', stanza_path = None)
 ```
 
-Textdescriptives works for most languages - simply change the country code:
+Textdescriptives works for most languages - simply change the language code:
 ```py
 da_test = pd.Series(['Da jeg var atten, tog jeg patent på ild. Det skulle senere vise sig at blive en meget indbringende forretning', "Spis skovsneglen, Mulle. Du vil jo gerne være med i hulen, ikk'?"])
 
-textdescriptives.all_metrics(da_test, lang = 'da', snlp_path=snlp_path)
+textdescriptives.all_metrics(da_test, lang = 'da', stanza_path = stanza_path)
 ```
 
 If you only want a subset of the basic statistics
@@ -57,8 +57,8 @@ The readability measures are largely derived from the [textstat](https://github.
 The etymology measures are calculated using [macroetym](https://github.com/JonathanReeve/macro-etym) only slightly rewritten to be called from a script.
 They are calculated since in English, a greater frequency of words with a Latinate origin tends to indicate a more formal language register.
 
 ### Dependency Distance
-Mean dependency distance can be used as a way of measuring the average syntactic complexity of a text. Requres the `snlp` library.
-The dependency distance function requires stanfordnlp, and their language models. If you have already downloaded these models, the path to the folder can be specified in the snlp_path paramter. Otherwise, the models will be downloaded to your working directory + /snlp_resources.
+Mean dependency distance can be used as a way of measuring the average syntactic complexity of a text. Requires the `stanza` library.
+The dependency distance function requires stanza and its language models. If you have already downloaded these models, the path to the folder can be specified in the stanza_path parameter. Otherwise, the models will be downloaded to a stanza_resources folder in your working directory.
 
 ## Dependencies
 
@@ -66,7 +66,7 @@ Depending on which measures you want to calculate, the dependencies differ.
 
   * Basic and readability: numpy, pandas, pyphen, pycountry
   * Etymology: nltk and the following models `python3 -c "import nltk; nltk.download('punkt'); nltk.download('stopwords'); nltk.download('averaged_perceptron_tagger'); nltk.download('wordnet')"`
-  * Dependency distance: snlp
+  * Dependency distance: stanza (Peng Qi, Yuhao Zhang, Yuhui Zhang, Jason Bolton and Christopher D. Manning. 2020. Stanza: A Python Natural Language Processing Toolkit for Many Human Languages. arXiv preprint arXiv:2003.07082.)
 
 ## Metrics
 
diff --git a/tests.py b/tests.py
index 25ecf2cd..671c3e14 100644
--- a/tests.py
+++ b/tests.py
@@ -10,24 +10,25 @@
 en_test = ['The world is changed. I feel it in the water. I feel it in the earth. I smell it in the air. Much that once was is lost, for none now live who remember it.',
            'He felt that his whole life was some kind of dream and he sometimes wondered whose it was and whether they were enjoying it.']
 
+# Dependency distance uses stanza, and thus requires stanza language resources
+# If you already have them downloaded, you can specify the folder path
+# Otherwise, they will be downloaded
+stanza_path = '/path/to/stanza_resources'
+
 # If you want to calculate all measures at once
-textdescriptives.all_metrics(en_test, lang = 'en', snlp_path = snlp_path)
+textdescriptives.all_metrics(en_test, lang = 'en', stanza_path = stanza_path)
 
 # Otherwise, the following calculates one category at a time
 textdescriptives.basic_stats(en_test, lang = 'en')
 textdescriptives.readability(en_test, lang = 'en')
 textdescriptives.etymology(en_test, lang = 'en')
 
-# Dependency distance uses stanfordnlp, and thus requires snlp language resources
-# If you already have them downloaded, you can specify the folder path
-# Otherwise, they will be downloaded
-snlp_path = '/path/to/stanfordnlp_resources'
-textdescriptives.dependency_distance(en_test, lang = 'en', snlp_path = snlp_path)
+textdescriptives.dependency_distance(en_test, lang = 'en', stanza_path = stanza_path)
 
-# Textdescriptives works for most languages. Simply change the country code
+# Textdescriptives works for most languages. Simply change the language code
 da_test = pd.Series(['Da jeg var atten, tog jeg patent på ild. Det skulle senere vise sig at blive en meget indbringende forretning',
                      "Spis skovsneglen, Mulle. Du vil jo gerne være med i hulen, ikk'?"])
-textdescriptives.all_metrics(da_test, 'da', snlp_path=snlp_path)
+textdescriptives.all_metrics(da_test, 'da', stanza_path = stanza_path)
 
-# If you want to calculate a subset of the basic statistics they can be specified in the measures paratmer
+# If you want to calculate a subset of the basic statistics, they can be specified in the measures parameter
 
diff --git a/textdescriptives/dependency_distance.py b/textdescriptives/dependency_distance.py
index a2b173d6..049cbd49 100644
--- a/textdescriptives/dependency_distance.py
+++ b/textdescriptives/dependency_distance.py
@@ -1,16 +1,16 @@
 import os
-import stanfordnlp
+import stanza
 import numpy as np
 import pandas as pd
 
 class DepDistance():
-    def __init__(self, text, lang, snlp_path):
+    def __init__(self, text, lang, stanza_path = None):
         self.text = text
         self.lang = lang
-        if snlp_path is None:
-            self.snlp_path = os.getcwd() + '/snlp_resources'
+        if stanza_path is None:
+            self.stanza_path = os.getcwd() + '/stanza_resources'
         else:
-            self.snlp_path = snlp_path
+            self.stanza_path = stanza_path
 
         self.__dep_dist()
         self.__describe_distances()
@@ -21,19 +21,24 @@ def get_sentence_distances(self):
         return self.__sentence_distances
 
     def get_text_distances(self):
         return self.__text_distances
 
-    def __dl_missing_langs_snlp(self):
+    def __dl_missing_langs_stanza(self):
         """
-        Downloads any missing languages from Stanford NLP resources
+        Downloads any missing stanza language models to self.stanza_path,
+        creating the folder if it does not exist.
+
+        Raises ValueError if the language is not available in stanza.
         """
-        import stanfordnlp
 
-        if not os.path.exists(self.snlp_path):
-            os.makedirs(self.snlp_path)
+        if not os.path.exists(self.stanza_path):
+            os.makedirs(self.stanza_path)
 
-        dl_langs = [folder[:2] for folder in os.listdir(self.snlp_path)]
+        dl_langs = [folder[:2] for folder in os.listdir(self.stanza_path)]
         if self.lang not in dl_langs:
-            stanfordnlp.download(self.lang, resource_dir=self.snlp_path)
+            try:
+                stanza.download(self.lang, dir = self.stanza_path)
+            except ValueError:
+                raise ValueError(f"Language: '{self.lang}' does not exist in stanza. Try specifying another language")
 
     def __dep_dist(self):
@@ -42,28 +47,28 @@ def __dep_dist(self):
         """
-        #Check if snlp language resources are installed, otherwise download them
+        #Check if stanza language resources are installed, otherwise download them
         try:
-            self.__dl_missing_langs_snlp()
-        # If the specified language is not in SNLP, throw error and stop the function
+            self.__dl_missing_langs_stanza()
+        # If the specified language is not in stanza, throw error and stop the function
         except ValueError:
-            ValueError(f"Language '{self.lang}' does not exist in stanford NLP. Try specifying another language")
+            raise ValueError(f"Language '{self.lang}' does not exist in stanza. Try specifying another language")
 
         if 's_nlp' not in globals():
             global s_nlp
-            s_nlp = stanfordnlp.Pipeline(lang = self.lang, models_dir = self.snlp_path,
-                                         processors="tokenize,pos,depparse")
+            s_nlp = stanza.Pipeline(lang = self.lang, dir = self.stanza_path,
+                                    processors = "tokenize,lemma,pos,depparse")
 
-        def score_token(dep_relation, governor, idx):
+        def score_token(dep_relation, head, idx):
            dep_dist = 0
            adj_rel = 0
            if dep_relation != 'root':
-               dep_dist = abs(governor - int(idx))
+               dep_dist = abs(head - int(idx))
               if dep_dist == 1:
                   adj_rel = 1
            return pd.Series([dep_dist, adj_rel])
 
         def score_sentence(df):
             res = df.apply(
-                lambda r: score_token(r["dep_rel"], r["governor"], r["token_id"]),
+                lambda r: score_token(r["dep_rel"], r["head"], r["token_id"]),
                 axis = 1)
             token_dep_dists = res[0]
             token_adj_rels = res[1]
@@ -73,9 +78,9 @@ def score_sentence(df):
 
         def score_text(txt, txt_id):
             doc = s_nlp(txt)
-            parsed = [(sent_n, word.index, word.governor, word.dependency_relation) \
+            parsed = [(sent_n, word.id, word.head, word.deprel) \
                       for sent_n, sent in enumerate(doc.sentences) for word in sent.words]
-            parsed = pd.DataFrame(parsed, columns = ["sent_id", "token_id", "governor", "dep_rel"])
+            parsed = pd.DataFrame(parsed, columns = ["sent_id", "token_id", "head", "dep_rel"])
             res = parsed.groupby("sent_id").apply(score_sentence).reset_index()
             res.columns = ["sent_id", "dep_dist", "prop_adjacent"]
             res["text_id"] = txt_id
@@ -118,4 +123,4 @@ def summarizer(df):
 
 
 #text = ["Bare en enkelt sætning for lige at teste"]
-#dep = DepDistance(texts, 'da', snlp_path)
+#dep = DepDistance(texts, 'da', stanza_path)
diff --git a/textdescriptives/textdescriptives.py b/textdescriptives/textdescriptives.py
index bba52101..31a5c376 100644
--- a/textdescriptives/textdescriptives.py
+++ b/textdescriptives/textdescriptives.py
@@ -5,7 +5,7 @@
 import pandas as pd
 
 class TextDescriptives():
-    def __init__(self, texts, lang = 'da', category = 'all', measures = 'all', snlp_path = None):
+    def __init__(self, texts, lang = 'da', category = 'all', measures = 'all', stanza_path = None):
         """
         texts: str/list/pd.Series containing text
         lang: str with the language code
@@ -37,7 +37,7 @@ def __init__(self, texts, lang = 'da', category = 'all', measures = 'all', snlp_
         self.df = pd.DataFrame(texts, columns = ['Text'])
         # self.token_dfs = token_dfs
         self.lang = lang
-        self.snlp_path = snlp_path
+        self.stanza_path = stanza_path
 
         # Category check
         valid_categories = set(['all', 'basic', 'readability', 'entropy', 'sentiment', 'etymology', 'dep_distance'])
@@ -109,7 +109,7 @@ def dependency_distance(self):
         MDD is calculated on sentence level, ie. MDD is the mean of the average dependency distance pr sentence.
         Mean and standard deviation of the proportion of adjacent dependency relations pr sentence is further calculated
         """
-        dep = DepDistance(self.df['Text'], self.lang, self.snlp_path)
+        dep = DepDistance(self.df['Text'], self.lang, self.stanza_path)
         self.df = pd.concat([self.df, dep.get_text_distances()], axis = 1)
 
     def entropy(self):
@@ -122,14 +122,14 @@ def get_df(self):
         return self.df
 
 
-def all_metrics(texts, lang = 'en', snlp_path = None):
+def all_metrics(texts, lang = 'en', stanza_path = None):
     """
     Calculates all implemented statistical metrics
 
     text: str/list/pd.Series of strings
     lang: two character language code, e.g. 'en', 'da'
-    snlp_path: string, path to stanfordnlp_resources
+    stanza_path: string, path to stanza resources
     """
-    return TextDescriptives(texts, lang = lang, category = 'all', snlp_path = snlp_path).df
+    return TextDescriptives(texts, lang = lang, category = 'all', stanza_path = stanza_path).df
 
 def basic_stats(texts, lang = 'en', metrics = 'all'):
@@ -157,11 +157,11 @@ def etymology(texts, lang = 'en'):
     """
     return TextDescriptives(texts, lang = lang, category = 'etymology').df
 
-def dependency_distance(texts, lang = 'en', snlp_path = None):
+def dependency_distance(texts, lang = 'en', stanza_path = None):
     """
-    Calculates measures related to etymology
+    Calculates measures related to dependency distance
     texts: str/list/pd.Series of strings
     lang: string, two character language code
-    snlp_path: string, path to stanfordnlp_resources
+    stanza_path: string, path to stanza resources
     """
-    return TextDescriptives(texts, lang = lang, category = 'dep_distance', snlp_path = snlp_path).df
\ No newline at end of file
+    return TextDescriptives(texts, lang = lang, category = 'dep_distance', stanza_path = stanza_path).df
\ No newline at end of file
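
As a quick sanity check of the migrated API, here is a minimal sketch — assuming this patch is installed and stanza can fetch the English models. The texts are shortened from the README example, and the column names come from the README table above:

```py
import textdescriptives

texts = ['The world is changed. I feel it in the water. I feel it in the earth.',
         'He felt that his whole life was some kind of dream.']

# stanza_path = None mirrors the new default in DepDistance.__init__:
# models are downloaded into ./stanza_resources on first use.
df = textdescriptives.dependency_distance(texts, lang = 'en', stanza_path = None)
print(df[['mean_dependency_distance', 'std_dependency_distance']])
```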
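
For reviewers unfamiliar with the metric itself, a self-contained sketch of what `score_token` and `score_sentence` compute. The parse below is hand-written in the `(token_id, head, dep_rel)` layout that `score_text` builds — a hypothetical parse of "The world is changed." with punctuation omitted, not actual stanza output:

```py
import pandas as pd

# head is the 1-based id of each token's governing token; the root is
# assigned distance 0, exactly as in score_token above.
parsed = pd.DataFrame({
    'token_id': [1, 2, 3, 4],
    'head':     [2, 4, 4, 0],
    'dep_rel':  ['det', 'nsubj', 'aux', 'root'],
})

dep_dists = parsed.apply(
    lambda r: 0 if r['dep_rel'] == 'root' else abs(r['head'] - r['token_id']),
    axis = 1)

print(dep_dists.mean())         # mean dependency distance: (1 + 2 + 1 + 0) / 4 = 1.0
print((dep_dists == 1).mean())  # proportion of adjacent relations: 2 / 4 = 0.5
```

This also suggests why `lemma` was added to the processor list in the Pipeline call above: stanza's `depparse` processor expects lemma and POS annotations from upstream processors, unlike the old stanfordnlp pipeline configuration.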