-
Notifications
You must be signed in to change notification settings - Fork 1
/
data_handler.py
73 lines (65 loc) · 2.02 KB
/
data_handler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import numpy as np
import re
# with open('SMSSpamCollection', 'r') as f:
# raw_data = f.read()
# f.close()
# data = raw_data.split('\n')
# counter = 0
# for i in range(len(data)):
# D = data[i*counter: i*counter+50]
# for row in D:
# d = re.findall(r"[\w']+", row)
# print(d)
# t, s = d[0], d[1:]
# print(t, s)
# break
# break
# counter += 50
def distributer(data):
    """Tally, per class, how many messages contain each word.

    Each row is tokenised into runs of word characters and apostrophes.
    The first token is taken as the label ('ham' or 'spam'); the remaining
    tokens are the message words, de-duplicated per message before counting.

    Every word seen in either class ends up as a key in all four
    dictionaries, with a count of 0 in the class where it never appeared.

    Args:
        data: Iterable of row strings. A single string is also accepted and
            is split on newlines first.

    Returns:
        Tuple ``(ham_ber, ham_multi, spam_ber, spam_multi, n_ham, n_spam)``:
        the four word -> count dictionaries followed by the number of ham
        and spam rows processed.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(data, str):
        data = data.split('\n')

    ham_multi, spam_multi = {}, {}
    ham_ber, spam_ber = {}, {}
    n_ham, n_spam = 0, 0

    for row in data:
        tokens = re.findall(r"[\w']+", row)
        # Blank rows (or rows with no word characters) have no label.
        if not tokens:
            continue

        label = tokens[0]
        if label == 'ham':
            n_ham += 1
            own_multi, own_ber = ham_multi, ham_ber
            other_multi, other_ber = spam_multi, spam_ber
        elif label == 'spam':
            n_spam += 1
            own_multi, own_ber = spam_multi, spam_ber
            other_multi, other_ber = ham_multi, ham_ber
        else:
            # Unknown label: the row contributes to neither class.
            continue

        # NOTE(review): words are de-duplicated per message, so the *_multi
        # and *_ber tallies are incremented identically. If *_multi is meant
        # to hold raw term frequencies, iterate over tokens[1:] for it
        # instead -- confirm against the consumer of these counts.
        for word in set(tokens[1:]):
            # First sighting counts as 1 (previously *_multi started at 0
            # when the word had not yet been seen in the other class).
            own_multi[word] = own_multi.get(word, 0) + 1
            own_ber[word] = own_ber.get(word, 0) + 1
            # Keep the vocabularies of both classes aligned.
            other_multi.setdefault(word, 0)
            other_ber.setdefault(word, 0)

    return ham_ber, ham_multi, spam_ber, spam_multi, n_ham, n_spam
# else:
# print('Incorrect ifelse for row: {}'.format(row))
# print(ham_ber['until'])
if __name__ == '__main__':
    # The with-statement closes the file on exit; no explicit close needed.
    with open('SMSSpamCollection', 'r') as f:
        raw_data = f.read()

    # distributer iterates over rows, so pass it a list of lines rather than
    # the raw string (which would be walked one character at a time).
    rows = raw_data.strip().split('\n')
    distributer(rows)