-
Notifications
You must be signed in to change notification settings - Fork 0
/
ex4.py
80 lines (66 loc) · 2.72 KB
/
ex4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import nltk
import os
import math
def f():
# nltk.download("punkt")
bigrams_dict = {}
tokens_dict = {}
for filename in os.listdir(os.getcwd() + '/ustawy'):
with open('./ustawy/' + filename, 'r', encoding='utf-8') as file:
text = "".join(file.readlines()).lower()
text = text.replace('\\n', '')
nltk_tokens = nltk.word_tokenize(text, language="Polish")
for token in list(filter(lambda x: x.isalpha(), nltk_tokens)):
if token in tokens_dict:
tokens_dict[token] += 1
else:
tokens_dict[token] = 1
partial = list(filter(bigrams_filter, list(nltk.bigrams(nltk_tokens))))
for p in partial:
if p in bigrams_dict:
bigrams_dict[p] += 1
else:
bigrams_dict[p] = 1
with open('ex4_bigrams.txt', 'w', encoding='utf-8') as res:
res.write("".join([str(k) + ', ' + str(v) for k, v in bigrams_dict.items()]))
print(pmi(bigrams_dict, tokens_dict)[0:30])
print(llr(bigrams_dict, tokens_dict)[0:30])
def bigrams_filter(bigram):
return bigram[0].isalpha() and bigram[1].isalpha() and (len(bigram[0]) > 1 or len(bigram[1]) > 1)
def get_total_count(provided_dict):
res = 0
for k, v in provided_dict.items():
res += v
return res
def pmi(bigrams, tokens):
total_bigrams_count = get_total_count(bigrams)
total_tokens_count = get_total_count(tokens)
res = []
for k, v in bigrams.items():
pair_prob = v / total_bigrams_count
first_prob = tokens[k[0]] / total_tokens_count
second_prob = tokens[k[1]] / total_tokens_count
pmi = math.log2(pair_prob / (first_prob * second_prob))
res.append((k, pmi))
res.sort(key=lambda x: x[1], reverse=True)
return res
def llr(bigrams, tokens):
total_tokens_count = get_total_count(tokens)
res = []
for k, v in bigrams.items():
pair_count = v
first_not_second = tokens[k[1]] - v
second_not_first = tokens[k[0]] - v
nor_second_nor_first = total_tokens_count - tokens[k[1]] - tokens[k[0]]
llr = 2 * (pair_count + first_not_second + second_not_first + nor_second_nor_first) * (
h([pair_count, first_not_second, second_not_first, nor_second_nor_first]) - h(
[pair_count + second_not_first, first_not_second + nor_second_nor_first]) - h(
[pair_count + first_not_second, second_not_first + nor_second_nor_first]))
res.append((k, llr))
res.sort(key=lambda x: x[1], reverse=True)
return res
def h(arr):
N = sum(arr)
return sum([elem / N * math.log2((elem / N) + (1 if elem == 0 else 0)) for elem in arr])
if __name__ == "__main__":
f()