import nltk, re, pprint
# #######################
# # 4.2 Tuples and Lists
# #######################
# def sequence_operations(list_or_tuple):
#     print('Sequence:', list_or_tuple)
#     print('Sorted sequence:', sorted(list_or_tuple))
#     print('Reversed order:', [x for x in reversed(list_or_tuple)])
#     print('Length of sequence:', len(list_or_tuple))
#     print('Concatenating two sequences:', list_or_tuple + list_or_tuple)
#     print('Multiplying a sequence:', list_or_tuple * 3)
#     print('4 appears', list_or_tuple.count(4), 'times')
#
#
# seq_list = [4, 8, 2]
# seq_tuple = (4, 8, 2)
# sequence_operations(seq_list)
# sequence_operations(seq_tuple)
#
# def list_operations(seq):
#     seq.append(6)
#     print('Element was added:', seq)
#     seq.pop()
#     print('Last element was removed:', seq)
#     seq.remove(8)
#     print('Specific element was removed:', seq)
#
# seq_list = [4, 8, 2]
# list_operations(seq_list)
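#
# # A small companion sketch (not part of the original exercise): tuples are
# # immutable, which is why only the list version supports append/pop/remove.
# seq_tuple = (4, 8, 2)
# try:
#     seq_tuple[0] = 6  # item assignment on a tuple raises TypeError
# except TypeError as e:
#     print('Tuples are immutable:', e)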
# #######################
###########################
# 4.10 Sort Words by Length
###########################
#
# def sort_words_by_length(words):
#     words.sort(key=len)
#     return words
#
# words = ['Das', 'ist', 'das', 'Haus', 'vom', 'Nikolaus']
# print(sort_words_by_length(words))
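#
# # Non-mutating alternative sketch: sorted() returns a new list instead of
# # modifying the input; Python's sort is stable, so words of equal length
# # keep their original relative order.
# print(sorted(words, key=len))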
###########################
# #######################################
# # 4.13 Word Length and Number of Vowels
# #######################################
# Write code to initialize a two-dimensional array of sets called word_vowels and process a list of words,
# adding each word to word_vowels[l][v] where l is the length of the word and v is the number of vowels it contains.
# def word_length_vowels(words):
#     length_vowel_pairs = [(len(word), len(re.sub(r'[^aeiou]', '', word.lower())), word) for word in words]
#     max_length = max(l for (l, _, _) in length_vowel_pairs)
#     max_vowels = max(v for (_, v, _) in length_vowel_pairs)
#     word_vowels = [[set() for _ in range(max_vowels + 1)] for _ in range(max_length + 1)]
#
#     for length, vowels, word in length_vowel_pairs:
#         word_vowels[length][vowels].add(word)
#     return word_vowels
#
# words = nltk.corpus.brown.words(categories=['hobbies'])
# word_vowels = word_length_vowels(words[100:115])
#
# # print words of length 5 that contain 2 vowels
# print(word_vowels[5][2])
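#
# # An alternative sketch keyed by (length, vowels) tuples; defaultdict from
# # the standard library avoids the pre-sizing pass over the data.
# from collections import defaultdict
# word_vowels_dict = defaultdict(set)
# for word in words[100:115]:
#     vowels = len(re.sub(r'[^aeiou]', '', word.lower()))
#     word_vowels_dict[(len(word), vowels)].add(word)
# print(word_vowels_dict[(5, 2)])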
# #######################################
# #####################################
# # 4.14 New Words in last 10% of Text
# #####################################
# from nltk.book import *
#
# def novel10(text):
#     split = int(0.9 * len(text))
#     early_words = set(text[:split])
#     late_words = set(text[split:])
#     return [w for w in late_words if w not in early_words]
#
# moby_dick = text1
# genesis = text3
# text = nltk.word_tokenize('The new word in the last ten percent is cat')
#
# print('\n New words in last 10% of Moby Dick ({} words): \n'.format(len(moby_dick)), novel10(moby_dick))
# print('\n New words in last 10% of Genesis ({} words): \n'.format(len(genesis)), novel10(genesis))
# print('\n New words in last 10% of test text ({} words): \n'.format(len(text)), novel10(text))
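#
# # The same computation as a one-expression sketch using the set difference
# # operator instead of a list comprehension.
# def novel10_diff(text):
#     split = int(0.9 * len(text))
#     return set(text[split:]) - set(text[:split])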
# #####################################
# ################
# # 4.16 Gematria
# ################
#
# import random
#
# letter_vals = {
#     'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 80, 'g': 3,
#     'h': 8, 'i': 10, 'j': 10, 'k': 20, 'l': 30, 'm': 40, 'n': 50,
#     'o': 70, 'p': 80, 'q': 100, 'r': 200, 's': 300, 't': 400, 'u': 6,
#     'v': 6, 'w': 800, 'x': 60, 'y': 10, 'z': 7
# }
#
# def gematria(word):
#     gematria_sum = sum(letter_vals[c.lower()] for c in word)
#     return gematria_sum
#
# def find_666(text):
#     gematria_vals = [(w, gematria(w)) for w in text if re.search(r'^[a-zA-Z]+$', w) and gematria(w) == 666]
#     return len(gematria_vals), gematria_vals
#
# def process_documents(files, corpus):
#     for file in files:
#         count_666 = find_666(corpus.words(file))
#         if count_666[0] != 0:
#             print('\n The document {} contains {} words with the gematria number 666: {}'.format(file, count_666[0], [w for (w, _) in count_666[1]]))
#         else:
#             print('\n The document {} contains NO words with the gematria number 666.'.format(file))
#
# def build_gematria_lexicon(gematria_words):
#     gematria_lexicon = {}
#     for w, val in gematria_words:
#         if re.search(r'^[a-zA-Z]+$', w):
#             if val in gematria_lexicon:
#                 gematria_lexicon[val].append(w)
#             else:
#                 gematria_lexicon[val] = [w]
#     return gematria_lexicon
#
#
# def decode(text, gematria_lexicon, percentage=0.3):
#     replace_count = int(percentage * len(text))
#     replace_positions = [random.choice(range(len(text))) for _ in range(replace_count)]
#     gematria_vals = [(pos, gematria(text[pos])) for pos in replace_positions]
#     for pos, gem in gematria_vals:
#         if gem in gematria_lexicon:
#             gematria_equivalent = random.choice(gematria_lexicon[gem])
#             text[pos] = gematria_equivalent
#     hidden_meaning = ' '.join(text)
#     return hidden_meaning
#
# # find occurrences of words with the gematria number 666
# state_union_corpus = nltk.corpus.state_union
# state_union_files = nltk.corpus.state_union.fileids()
# #process_documents(state_union_files, state_union_corpus)
#
# # randomly replace words with a gematria equivalent
# brown_words = set(nltk.corpus.brown.words())
# brown_words_gematria = [(w, gematria(w)) for w in brown_words if re.search(r'^[a-zA-Z]+$', w)]
# brown_gematria_lexicon = build_gematria_lexicon(brown_words_gematria)
#
# text = nltk.word_tokenize(input('\n Please enter some text: '))
# text = [t for t in text if re.search(r'^[a-zA-Z]+$', t)]
# print('\n The hidden meaning of your text is: ', decode(text, brown_gematria_lexicon))
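#
# # A quick sanity check against the letter_vals table above:
# # 'cat' should score 3 (c) + 1 (a) + 400 (t) = 404.
# assert gematria('cat') == 404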
# ################
#######################################
# 4.17 Remove frequently occurring words
#######################################
# def shorten(text, n):
#     tokens = nltk.word_tokenize(text)
#     most_common_words = nltk.FreqDist([t.lower() for t in tokens]).most_common(n)
#     common = set(w for (w, _) in most_common_words)
#     shortened_tokens = [t for t in tokens if t.lower() not in common]
#     shortened_text = ' '.join(shortened_tokens)
#     return shortened_text
#
# text = input('\n Please enter some text: ')
# print(shorten(text, 3))
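#
# # Non-interactive usage sketch: removing the single most frequent word
# # ('the') from a short sentence.
# print(shorten('the cat sat on the mat with the hat', 1))
# # -> cat sat on mat with hat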
#######################################
########################
# 4.18 Indexing Lexicon
########################
# from nltk.corpus import wordnet as wn
#
# def insert(trie, key, value):
#     if key:
#         first, rest = key[0], key[1:]
#         if first not in trie:
#             trie[first] = {}
#         insert(trie[first], rest, value)
#     else:
#         if 'value' in trie:
#             trie['value'].append(value)
#         else:
#             trie['value'] = [value]
#
# def build_trie(contents, trie):
#     for word, synonyms in contents:
#         for synonym in synonyms:
#             insert(trie, word, synonym)
#     return trie
#
# def lookup_word_in_lexicon(trie, word):
#     if word:
#         current, rest = word[0], word[1:]
#         if current in trie:
#             return lookup_word_in_lexicon(trie[current], rest)
#         return "Word wasn't found in the lexicon."
#     elif 'value' in trie:
#         return ', '.join(trie['value'])
#     return "Word wasn't found in the lexicon."
#
#
# synsets = wn.all_synsets('n')
# synsets_synonyms = [(syn.lemma_names()[0], syn.lemma_names()[1:]) for syn in synsets if syn.lemma_names()[1:]]
# trie = {}
# synonym_lexicon = build_trie(synsets_synonyms, trie)
#
# search = input('Search synonyms for: ')
# print('Synonyms for {} are: {}'.format(search, lookup_word_in_lexicon(synonym_lexicon, search)))
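#
# # Structure sketch: after inserting the key 'cat' with value 'feline', the
# # nested-dict trie stores one level per character, with values collected in
# # a list under the reserved key 'value'.
# demo_trie = {}
# insert(demo_trie, 'cat', 'feline')
# pprint.pprint(demo_trie)
# # -> {'c': {'a': {'t': {'value': ['feline']}}}}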
########################
###################################
# 4.19 Sorting Synsets by Proximity
###################################
# from nltk.corpus import wordnet as wn
#
# def sort_synsets(target, synsets):
#     sorted_synsets = sorted([(target.shortest_path_distance(syn), syn) for syn in synsets], key=lambda pair: pair[0])
#     return ', '.join([syn.name() for (_, syn) in sorted_synsets])
#
# target = wn.synset('right_whale.n.01')
# synsets = [wn.synset('minke_whale.n.01'), wn.synset('orca.n.01'), wn.synset('novel.n.01'), wn.synset('tortoise.n.01')]
# print('\n From nearest to most distant synset of {}: '.format('right_whale.n.01'), sort_synsets(target, synsets))
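#
# # Caveat sketch: shortest_path_distance() returns None when no path exists
# # between two synsets, and None cannot be ordered against ints in Python 3;
# # treating a missing path as infinitely distant keeps the sort safe.
# def sort_synsets_safe(target, synsets):
#     def distance(syn):
#         d = target.shortest_path_distance(syn)
#         return d if d is not None else float('inf')
#     return ', '.join(syn.name() for syn in sorted(synsets, key=distance))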
###################################
#############################################
# 4.21 Difference between Text and Vocabulary
#############################################
# from nltk.book import *
# print(list(set(text3[:50]).difference(set(text3[50:100]))))
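#
# # Related sketch: symmetric_difference() collects the words that occur in
# # exactly one of the two slices, in either direction.
# print(list(set(text3[:50]).symmetric_difference(set(text3[50:100]))))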
#############################################
#############################################
# 4.23 Lookup in a Trie
#############################################
# def insert(trie, key, value):
#     if key:
#         first, rest = key[0], key[1:]
#         if first not in trie:
#             trie[first] = {}
#         insert(trie[first], rest, value)
#     else:
#         trie['value'] = value
#
# def lookup(trie, key):
#     current = key[0]
#     if len(key) > 1:
#         rest = key[1:]
#         if current not in trie:
#             return ['no results', '']
#         return lookup(trie[current], rest)
#     else:
#         if current in trie and 'value' in trie[current]:
#             return [trie[current]['value'], '']
#         return check_prefix(trie, current)
#
# def check_prefix(trie, key, prefix=''):
#     if key == 'value':
#         return [trie['value'], '({})'.format(prefix[1:])]
#     elif key in trie and len(trie[key].keys()) == 1:
#         prefix += key
#         next_key = list(trie[key].keys())[0]
#         return check_prefix(trie[key], next_key, prefix)
#     return ['no results', '']
#
#
# trie = {}
# insert(trie, 'cat', 'sweetest animal ever')
# insert(trie, 'catherine', 'sweetest girl ever')
# insert(trie, 'catering', 'sweetest service ever')
# insert(trie, 'chocolate', 'sweetest sweets ever')
#
# search = input('\nsearch dictionary for: ')
# result = lookup(trie, search)
# print('{}{}: {}'.format(search, result[1], result[0]))
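#
# # Non-interactive usage sketch with the entries above: 'ch' is a unique
# # prefix of 'chocolate', so check_prefix completes the rest of the word.
# result = lookup(trie, 'ch')
# print('ch{}: {}'.format(result[1], result[0]))
# # -> ch(ocolate): sweetest sweets ever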
#############################################
#######################
# 4.26 Catalan Numbers
#######################
# C_0 = 1, and C_{n+1} = sum_{i=0}^{n} C_i * C_{n-i}
# 1, 1, 2, 5, 14, 42, 132, 429, 1430
# def catalan(n):
#     if n < 0:
#         return 'negative input'
#     elif n <= 1:
#         return 1
#
#     total = 0
#     for i in range(n):
#         total += catalan(i) * catalan(n-1-i)
#     return total
#
# # dynamic-programming version: results are memoized in the shared lookup
# # table, so each Catalan number is computed only once; the append must
# # happen after the loop, once all smaller values are already in place
# def catalan_dynamic(n, lookup=[1, 1]):
#     if n < 0:
#         return 'negative input'
#     if n < len(lookup):
#         return lookup[n]
#     total = 0
#     for i in range(n):
#         total += catalan_dynamic(i) * catalan_dynamic(n-1-i)
#     lookup.append(total)
#     return lookup[n]
#
# print(catalan(7))
# print(catalan_dynamic(4))
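#
# # Cross-check sketch via the closed form C_n = (2n choose n) / (n + 1),
# # computed with exact integer factorials from the standard library.
# from math import factorial
# def catalan_closed_form(n):
#     return factorial(2 * n) // (factorial(n) * factorial(n + 1))
# assert catalan_closed_form(7) == 429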
#######################
####################
# 4.31 Justify Text
####################
# import textwrap
#
# def justify_text(text, width):
#     lines = textwrap.wrap(text, width=width)
#     justified_text = [add_blanks(line, width) for line in lines]
#     return justified_text
#
# def add_blanks(s, width):
#     if ' ' not in s:
#         return s  # a single long word cannot be padded between words
#     s = s.replace(' ', '  ', width - len(s))
#     if len(s) < width:
#         return add_blanks(s, width)
#     return s
#
# text = """Yo listen up here's a story
# About a little guy that lives in a blue world
# And all day and all night and everything he sees
# Is just blue like him inside and outside
# Blue his house with a blue little window
# And a blue corvette
# And everything is blue for him and himself
# And everybody around
# 'Cause he ain't got nobody to listen to
# I'm blue da ba dee da ba die ..."""
#
# justified_text = justify_text(text, 50)
# pprint.pprint(justified_text)
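#
# # add_blanks pads from the left, so extra spaces bunch up at the start of a
# # line; a sketch that spreads the padding evenly splits the line and rejoins
# # it with computed gap widths, giving the leftmost gaps the remainder.
# def add_blanks_even(s, width):
#     parts = s.split(' ')
#     gaps = len(parts) - 1
#     if gaps == 0 or len(s) >= width:
#         return s
#     pad, extra = divmod(width - len(s), gaps)
#     return ''.join(part + ' ' * (1 + pad + (i < extra)) for i, part in enumerate(parts[:-1])) + parts[-1]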
####################
############################################
# 4.32 Sentences with highest Word Frequency
############################################
# from nltk.corpus import brown
#
# def word_freq_in_sents(words, sents, n):
#     total_freqdist = nltk.FreqDist([w.lower() for w in words])
#     summed_word_freq = [(sum([total_freqdist[w.lower()] for w in s]), i) for (i, s) in enumerate(sents)]
#     highest_ranked_sentences = sorted(sorted(summed_word_freq)[-n:], key=lambda x: x[1])
#     return highest_ranked_sentences
#
# words_news = brown.words(categories='news')
# sents_news = brown.sents(categories='news')
#
# highest_ranked_sentences = word_freq_in_sents(words_news, sents_news, 3)
# for sent in highest_ranked_sentences:
#     print('\nTotal Word Frequency: {}\n{}'.format(sent[0], ' '.join(sents_news[sent[1]])))
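#
# # Variant sketch: the summed score favours long sentences, so dividing by
# # sentence length ranks by average word frequency instead.
# def mean_word_freq_in_sents(words, sents, n):
#     total_freqdist = nltk.FreqDist([w.lower() for w in words])
#     mean_word_freq = [(sum(total_freqdist[w.lower()] for w in s) / len(s), i) for (i, s) in enumerate(sents)]
#     return sorted(sorted(mean_word_freq)[-n:], key=lambda x: x[1])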
############################################
#######################
# 4.35 n x n Crosswords
#######################
# Write a program to implement a brute-force algorithm for discovering word squares,
# a kind of n × n crossword in which the entry in the nth row is the same as the entry
# in the nth column.
# from nltk.corpus import words, brown
# import random
#
#
# def build_crossword(n, len_dict):
#     word = random.choice(len_dict[n])
#     crossword = [word]
#     trie = build_trie(len_dict[len(word)])
#     crossword = find_words(n, crossword, trie, 0)
#     if not crossword:
#         return build_crossword(n, len_dict)
#     return crossword
#
# def find_words(n, crossword, trie, c):
#     if c < 10:
#         c += 1
#     else:
#         return False
#     for i in range(1, n):
#         # the prefix of row i is column i of the rows filled in so far
#         char = crossword[0][i]
#         for j in range(1, i):
#             char += crossword[j][i]
#         next_word = lookup(trie, char)
#         if not next_word:
#             return find_words(n, [crossword[0]], trie, c)
#         crossword.append(next_word)
#     return crossword
#
# def build_len_dict(corpus):
#     len_dict = {}
#     len_words = [(len(w), w) for w in corpus]
#     for length, word in len_words:
#         if length in len_dict:
#             len_dict[length].append(word)
#         else:
#             len_dict[length] = [word]
#     return len_dict
#
# def build_trie(words):
#     trie = {}
#     for w in words:
#         insert(trie, w, w)
#     return trie
#
# def insert(trie, key, value):
#     if key:
#         first, rest = key[0], key[1:]
#         if first not in trie:
#             trie[first] = {}
#         insert(trie[first], rest, value)
#     else:
#         trie['word'] = value
#
# def lookup(trie, word):
#     if word:
#         current, rest = word[0], word[1:]
#         if current in trie:
#             return lookup(trie[current], rest)
#         return False
#     else:
#         if 'word' not in trie:
#             # complete the prefix with a random walk through the trie
#             next_char = random.choice(list(trie.keys()))
#             return lookup(trie, next_char)
#         return trie['word']
#
#
# brown_words = [w.lower() for w in set(brown.words()) if len(w) <= 10 and re.search(r'^[a-zA-Z]+$', w)]
# words_words = [w.lower() for w in set(words.words()) if len(w) <= 10 and re.search(r'^[a-zA-Z]+$', w)]
# corpus = list(set(brown_words) | set(words_words))
# len_dict = build_len_dict(corpus)
#
# pprint.pprint(build_crossword(5, len_dict), width=5)
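#
# # Verification sketch: a valid word square must be symmetric, i.e. the
# # character at row i, column j equals the one at row j, column i.
# def is_word_square(crossword):
#     n = len(crossword)
#     return all(crossword[i][j] == crossword[j][i] for i in range(n) for j in range(n))
#
# assert is_word_square(['care', 'area', 'rear', 'earn'])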
#######################