-
Notifications
You must be signed in to change notification settings - Fork 0
/
web_readability.py
executable file
·206 lines (182 loc) · 6.78 KB
/
web_readability.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#!/usr/bin/python
# coding=UTF-8
import math
def count_characters(text):
    """Return the number of characters (items) in text.

    Every character counts, including spaces and newlines.  Sized inputs
    such as strings use the builtin len(); anything that can only be
    iterated is counted item by item, so existing callers keep working.
    """
    try:
        return len(text)
    except TypeError:
        # Unsized iterable (e.g. a generator): fall back to counting.
        return sum(1 for _ in text)
def to_cv(word):
    """Convert word to a cv-list.

    Each vowel of word (a, e, i, o, u in either case) is replaced by a
    'v' in the cv-list, and every other character is replaced by a 'c'.
    The letter 'y' is deliberately treated as a non-vowel.

    Returns a list with one 'c' or 'v' entry per character of word; an
    empty word gives an empty list.
    """
    vowels = set('aeiou')
    # Compare case-insensitively so capitalised words ("Apple") are
    # classified the same way as their lowercase form.
    return ['v' if ch.lower() in vowels else 'c' for ch in word]
def count_cv(cv):
    """Return how many times a 'c' is immediately followed by a 'v' in cv.

    cv is a cv-list as produced by to_cv().  Each adjacent pair of
    entries is inspected once; lists shorter than two entries give 0.
    """
    # Pair every entry with its successor; zip stops at the shorter
    # sequence, so the last entry is never paired with anything.
    adjacent_pairs = zip(cv, cv[1:])
    return sum(1 for first, second in adjacent_pairs
               if first == 'c' and second == 'v')
def count_vowel_groups(word):
    """Return the # of vowel-groups in word.

    A vowel group is counted for every consonant-to-vowel transition in
    the word's cv-list, plus one more when the word starts with a vowel.
    An empty word has no vowel groups and returns 0.
    """
    if not word:
        # Nothing to classify; also avoids indexing an empty cv-list.
        return 0
    cv = to_cv(word)
    groups = count_cv(cv)
    if cv[0] == 'v':
        # A leading vowel has no preceding consonant, so count_cv
        # cannot see it; account for it here.
        groups += 1
    return groups
def count_syllables_in_word(word):
    """Return the estimated syllable count of word.

    The estimate is the number of vowel groups reported by
    count_vowel_groups(), with a floor of 1 so that a word without any
    recognised vowel still counts as one syllable.
    """
    return max(count_vowel_groups(word), 1)
def count_syllables(text):
    """Return the total estimated syllable count over all words of text.

    Words are the whitespace-separated tokens of text; each one is
    scored with count_syllables_in_word() and the scores are summed.
    Text without any words gives 0.
    """
    return sum(count_syllables_in_word(token) for token in text.split())
def count_hard_words(text):
    """Return how many words of text have three or more syllables.

    Words are the whitespace-separated tokens of text and syllables are
    estimated with count_syllables_in_word().
    """
    polysyllabic = [token for token in text.split()
                    if count_syllables_in_word(token) >= 3]
    return len(polysyllabic)
def count_sentences(s):
    """Return the number of sentence terminators ('.', '!' and '?') in s.

    Every occurrence counts on its own, so "..." contributes three.
    """
    terminators = ('.', '!', '?')
    return sum(s.count(mark) for mark in terminators)
def count_words(s):
    """Return the number of whitespace-separated words in s."""
    tokens = s.split()
    return len(tokens)
def familiar_words(text, word_list_path='dale_chall_word_list.txt'):
    """Return how many words of text appear in the familiar-word list.

    text is split on whitespace and each token is counted when it is
    exactly equal to an entry of the word list.  The comparison is
    case-sensitive and punctuation attached to a token is not stripped.

    word_list_path is the file holding the list; it defaults to
    'dale_chall_word_list.txt' in the current working directory.  The
    file is assumed to contain whitespace-separated entries -- TODO
    confirm against the actual list file.

    Raises IOError/OSError if the word list cannot be opened.
    """
    # Load the list once and close the handle.  Matching against a set
    # of whole entries (instead of searching the raw file contents)
    # prevents fragments such as "t" or "ab" from counting as familiar.
    with open(word_list_path) as word_file:
        known_words = set(word_file.read().split())
    return sum(1 for token in text.split() if token in known_words)
def modcharcount(text):
    """Return the number of non-whitespace characters in text.

    This is the figure reported as "Character count without spaces".
    Whitespace is removed outright rather than estimated from the word
    count, so runs of spaces, blank lines and a missing trailing
    newline no longer skew the result.
    """
    # split() drops every run of whitespace; joining the pieces leaves
    # only the characters that belong to words.
    return len(''.join(text.split()))
def _dale_chall_grade(dca):
    """Map an adjusted Dale-Chall score to the grade level of its band.

    Bands are half-open (score < upper bound), so a score such as 4.95
    lands in the lowest band instead of falling between two bands.
    """
    bands = (
        (5.0, 4.0),
        (6.0, 6.0),
        (7.0, 8.0),
        (8.0, 10.0),
        (9.0, 12.0),
        (10.0, 15.0),
    )
    for upper_bound, grade_level in bands:
        if dca < upper_bound:
            return grade_level
    return 16.0


def websummarize(s):
    """Print an HTML readability report for the text file at path s.

    The file is read and lower-cased, basic counts are collected with
    the module's count_* helpers, several readability scores are
    derived from them, and the report is printed to standard output
    preceded by a 'Content-type: text/html' header.

    Returns None.  Raises IOError/OSError if s (or the familiar-word
    list read by familiar_words) cannot be opened.
    """
    with open(s, 'r') as source:
        text = source.read().lower()

    # Raw text statistics.
    sy_count = count_syllables(text)
    w_count = count_words(text)
    sent_count = count_sentences(text)
    character_count = count_characters(text)
    hardwords = count_hard_words(text)
    f_words = float(familiar_words(text))
    mod_char_count = modcharcount(text)

    # Float denominators keep every ratio a true division under
    # Python 2.  Falling back to 1 avoids ZeroDivisionError for empty
    # text or text without any '.', '!' or '?'; the raw counts are
    # still reported unchanged.
    words = float(w_count or 1)
    sentences = float(sent_count or 1)

    asl = w_count / sentences            # average sentence length
    afw = f_words / words                # familiar words per word
    pdw = (hardwords / words) * 100      # percent 3+ syllable words
    pfw = (f_words / words) * 100        # percent familiar words
    awl = mod_char_count / words         # average word length
    spw = sy_count / words               # average syllables per word

    # FRES: Flesch Reading Ease score
    fres = 206.835 - (84.6 * spw) - (1.015 * asl)
    # Flesch-Kincaid Grade Level
    fkgl = 0.39 * asl + (11.8 * spw) - 15.59
    # Coleman-Liau Index, simplified form (still testing)
    cli = 5.888 * awl - (29.5 * (sent_count / words)) - 15.800
    # Gunning Fog
    fog = 0.4 * (asl + 100 * (hardwords / words))
    # Bormuth Grade Level Score (unreliable; reported but not averaged)
    bgl = (.886593 - (awl * 0.03640) + (pfw * 0.161911)
           - (asl * 0.21401) - (asl * 0.000577) - (asl * 0.000005))
    # SMOG
    smog = 1.0430 * math.sqrt(hardwords * (30.0 / sentences)) + 3.1291
    # Automated Readability Index; uses the total character count.
    ari = 4.71 * (character_count / words) + 0.5 * asl - 21.43
    # Dale-Chall: raw score, adjusted score, then grade band
    dcr = 0.1579 * pdw + (0.0496 * asl)
    dca = dcr + 3.6365
    dcgl = _dale_chall_grade(dca)

    # Grade Level Average
    grade = (cli + ari + smog + fkgl + fog + dcgl) / 6.0

    # Print the readability report.  Each print has exactly one
    # parenthesised argument, which behaves the same as a Python 2
    # print statement and as the Python 3 print function.
    print('Content-type: text/html\n')
    print('')
    print('<html>')
    print('<body>')
    print('<h2>Readability Summary</h2>')
    print('<pre>')
    print('Total character count: ' + str(character_count))
    print('Character count without spaces: ' + str(mod_char_count))
    print('Total syllable count: ' + str(sy_count))
    print('Average Syllables Per Word: ' + str(spw))
    print('Total word count: ' + str(w_count))
    print('Total sentence count: ' + str(sent_count))
    print('Polysyllable words (3+ syllables): ' + str(hardwords))
    print('Total Familiar Words: ' + str(f_words))
    print('Average Sentence Length: ' + str(asl))
    print('Average Word Length: ' + str(awl))
    print('Average Familiar Words: ' + str(afw))
    print('Percent Difficult Words: ' + str(pdw))
    print('Percent Familiar Words: ' + str(pfw))
    print('----------------')
    print('Flesch reading ease score (FRES): ' + str(fres))
    print('----------------')
    print('Flesch-Kincaid grade level: ' + str(fkgl))
    print('Simplified Coleman Liau Index score: ' + str(cli))
    print('(Gunning) Fog: ' + str(fog))
    print('SMOG index: ' + str(smog))
    print('Automated Readability Index: ' + str(ari))
    print('*Bormuth Grade Level Score: ' + str(bgl))
    print('New Dale Chall Adjusted Grade Level (max of range): ' + str(dcgl))
    print('----------------')
    print('Average Grade Score: ' + str(grade))
    print('* under construction (not reliable or averaged)')
    print('</pre>')
    # Heading typo fixed and the list closed so the markup is well formed.
    print('''<h2>Readability Resources</h2>
<ul><li><a href="http://www.ideosity.com/ourblog/post/ideosphere-blog/2010/01/14/readability-tests-and-formulas">Ideosity</a></li></ul>''')
    print('</body></html>')