lix/rix to readability + option for mean only

HLasse · Feb 26, 2020 · 39a6e00 · 39a6e00
1 parent 4ce1348
commit 39a6e00
Show file tree

Hide file tree

Showing 2 changed files with 34 additions and 10 deletions.
diff --git a/setup.py b/setup.py
@@ -2,13 +2,23 @@
 
 setup(name='textdescriptives',
       version='0.1',
-      description='A package for calculating a wide variety of features from text',
-      url='https://github.com/HLasse/TextDescriptives',
       author='Lasse Hansen',
       author_email='[email protected]',
+      description='A package for calculating a wide variety of features from text',
+      long_description=open('README.md', encoding='utf-8').read(),  # README is reused as the long description
+      long_description_content_type='text/markdown',
+      url='https://github.com/HLasse/TextDescriptives',
       license='MIT',
       packages=['textdescriptives'],
-      install_requires=['pandas','numpy'],
+      classifiers=[
+            "Programming Language :: Python :: 3",
+            "License :: OSI Approved :: MIT License",
+            "Operating System :: OS Independent",
+            "Topic :: Text Processing",
+            "Topic :: Text Processing :: Linguistic"
+      ],
+      install_requires=['pandas','numpy', 'pyphen', 'pycountry'],
+      python_requires='>=3.6',
       zip_safe=False,
       include_package_data=True)
 
diff --git a/textdescriptives/textdescriptives.py b/textdescriptives/textdescriptives.py
@@ -57,19 +57,31 @@ def basic(self, measures = 'all'):
         """
         basic_calc = Calculators(lang = self.lang)
 
-        valid_measures = {'avg_word_length' : basic_calc.avg_word_length, 'median_word_length' : basic_calc.median_word_length,
-                          'std_word_length' : basic_calc.std_word_length, 'avg_sentence_length' : basic_calc.avg_sentence_length, 
+        valid_measures = {'mean_word_length' : basic_calc.avg_word_length, 'median_word_length' : basic_calc.median_word_length,
+                          'std_word_length' : basic_calc.std_word_length, 'mean_sentence_length' : basic_calc.avg_sentence_length, 
                           'median_sentence_length' : basic_calc.median_sentence_length, 'std_sentence_length' : basic_calc.std_sentence_length,
-                          'avg_syl_per_word' : basic_calc.avg_syl_per_word, 'median_syl_per_word' : basic_calc.median_syl_per_word, 
+                          'mean_syl_per_word' : basic_calc.avg_syl_per_word, 'median_syl_per_word' : basic_calc.median_syl_per_word, 
                           'std_syl_per_word' : basic_calc.std_syl_per_word, 'type_token_ratio' : basic_calc.type_token_ratio, 
-                          'lix' : basic_calc.lix, 'rix' : basic_calc.rix, 'n_types' : basic_calc.n_types,
-                          'n_sentences' : basic_calc.n_sentences, 'n_tokens' : basic_calc.n_tokens, 'n_chars' : basic_calc.n_chars
+                          'n_chars' : basic_calc.n_chars, 'n_sentences' : basic_calc.n_sentences, 
+                          'n_types' : basic_calc.n_types,  'n_tokens' : basic_calc.n_tokens
                           }
 
+
+        only_mean = {'mean_word_length' : basic_calc.avg_word_length, 'mean_sentence_length' : basic_calc.avg_sentence_length, 
+                          'mean_syl_per_word' : basic_calc.avg_syl_per_word, 'type_token_ratio' : basic_calc.type_token_ratio, 
+                          'n_chars' : basic_calc.n_chars, 'n_sentences' : basic_calc.n_sentences, 
+                          'n_types' : basic_calc.n_types,  'n_tokens' : basic_calc.n_tokens
+                          }
+
+
         if measures == 'all':
             for measure, func in valid_measures.items():
                 self.df[measure] = [func(text) for text in self.df['Text']]
 
+        elif measures == 'only_mean':
+            for measure, func in only_mean.items():
+                self.df[measure] = [func(text) for text in self.df['Text']]
+
         elif not (set(measures).issubset(set(valid_measures.keys()))):
             raise ValueError("Invalid measures provided to self.basic")
 
@@ -82,10 +94,12 @@ def readability(self, measures = 'all'):
         Calculates readability scores
         """
         read = Readability(lang = self.lang)
+        basic_calc = Calculators(lang = self.lang)
 
         valid_measures = {'gunning_fog' : read.gunning_fog, 'smog' : read.smog,
                           'flesch_reading_ease' : read.flesch_reading_ease, 'flesch_kincaid_grade' : read.flesch_kincaid_grade,
                           'automated_readability_index' : read.automated_readability_index, 'coleman_liau_index' : read.coleman_liau_index,
+                          'lix' : basic_calc.lix, 'rix' : basic_calc.rix
                           }
 
         if measures == 'all':
@@ -164,14 +178,14 @@ def basic_stats(texts, lang = 'en', metrics = 'all'):
     """
     return Textdescriptives(texts, lang, 'basic', measures = metrics).df
 
-def readability(texts, lang = 'en', measures = 'all'):
+def readability(texts, lang = 'en', metrics = 'all'):
     """
     Calculates readability metrics
     texts: str/list/pd.Series of strings
     lang: string, two character language code
-    measures: string/list of strings, which measures to calculate
+    metrics: string/list of strings, which measures to calculate
     """
-    return Textdescriptives(texts, lang, 'readability', measures = measures).df
+    return Textdescriptives(texts, lang, 'readability', measures = metrics).df
 
 def etymology(texts, lang = 'en'):
     """