-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcrawler.py
40 lines (29 loc) · 930 Bytes
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import re
import string
import numpy as np
# Symbols tracked by the bigram tables: the 26 uppercase ASCII letters followed
# by a single space (the separator sanitize() puts between words).
alphabet = ''.join([string.ascii_uppercase, ' '])
def cbpd_from_book(book):
    """Estimate character-bigram transition probabilities from a text file.

    The file at ``book`` is read, normalized with ``sanitize`` and scanned
    for adjacent character pairs.

    Args:
        book: Path of the text file to read.

    Returns:
        A nested dict ``g`` where ``g[c1][c2]`` is the fraction of the pairs
        starting with ``c1`` whose second character is ``c2``. Both key
        levels cover every symbol in ``alphabet``. A symbol that never
        starts a pair gets a row of ``0.0`` values.
    """
    # Context manager so the file handle is closed as soon as it is read.
    with open(book, 'r') as handle:
        text = sanitize(handle.read())

    # Start every (c1, c2) pair at a count of zero.
    g = {c1: {c2: 0 for c2 in alphabet} for c1 in alphabet}

    # Count each adjacent pair (text[i], text[i + 1]).
    for first, second in zip(text, text[1:]):
        g[first][second] += 1

    # Turn each row of counts into relative frequencies.
    for row in g.values():
        total = sum(row.values())
        for c2 in row:
            # total == 0 means this symbol never started a pair; keep the
            # row at zero instead of raising ZeroDivisionError.
            row[c2] = row[c2] / total if total else 0.0
    return g
def cbfd_from_book(book):
    """Count character-bigram frequencies in a text file.

    The file at ``book`` is read, normalized with ``sanitize`` and scanned
    for adjacent character pairs.

    Args:
        book: Path of the text file to read.

    Returns:
        A ``len(alphabet)`` x ``len(alphabet)`` NumPy array ``h`` where
        ``h[i][j]`` is the number of times ``alphabet[i]`` was immediately
        followed by ``alphabet[j]`` in the sanitized text.
    """
    # Context manager so the file handle is closed as soon as it is read.
    with open(book, 'r') as handle:
        text = sanitize(handle.read())

    # np.zeros already yields an all-zero matrix; no explicit reset is needed.
    h = np.zeros((len(alphabet), len(alphabet)))

    # Resolve each symbol's row/column once rather than scanning the alphabet
    # with .index() twice per character of the text.
    position = {symbol: idx for idx, symbol in enumerate(alphabet)}

    # Count each adjacent pair (text[i], text[i + 1]).
    for first, second in zip(text, text[1:]):
        h[position[first], position[second]] += 1
    return h
def sanitize(text):
    """Reduce ``text`` to uppercase ASCII words separated by single spaces.

    Every maximal run of ASCII letters is kept as a word. Anything else
    (digits, punctuation, whitespace, non-ASCII characters) only acts as a
    word boundary and is dropped.
    """
    letter_runs = re.findall("[a-zA-Z]+", text)
    joined = ' '.join(letter_runs)
    return joined.upper()