main.py
import urllib.request
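
# add_page_to_index builds an inverted index: it maps each whitespace-separated
# word in page_content to the set of URLs whose pages contain that word.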
def add_page_to_index(index, url, page_content):
    list_content = page_content.split()
    for word in list_content:
        # If the word is not yet in the index, create an empty set for it.
        if word not in index:
            index[word] = set()
        # Add this page's URL to the set of pages containing the word.
        index[word].add(url)
def get_page(url):
    # Fetch the page at url; return an empty string if the request fails.
    try:
        response = urllib.request.urlopen(url)
    except Exception:
        return ""
    return response.read().decode("utf-8", errors="ignore")
def get_all_links(page, domain):
    # Scan the raw HTML for <a href="..."> links and keep those within domain.
    url_set = set()
    end_pos = -1
    while end_pos < len(page):
        start_link = page.find('<a href=', end_pos + 1)
        if start_link < 0:
            break
        start_pos = page.find('"', start_link)
        end_pos = page.find('"', start_pos + 1)
        url = page[start_pos + 1:end_pos]
        if url.find(domain) >= 0:
            url_set.add(url)
    return url_set
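
# crawl_web starts from the seed URL and keeps following in-domain links until
# max_page pages have been fetched. It returns the inverted index and the link
# graph (crawled URL -> set of URLs it links to) used for ranking below.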
def crawl_web(seed, max_page, domain):
    to_crawl = {seed}
    crawled = set()
    graph = {}  # url -> set of pages it links to
    index = {}
    while to_crawl:
        url = to_crawl.pop()
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(index, url, content)
            outlinks = get_all_links(content, domain)
            graph[url] = outlinks
            to_crawl = to_crawl.union(outlinks)
            crawled.add(url)
            if len(crawled) >= max_page:
                break
    return index, graph
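
# compute_ranks runs an iterative PageRank-style computation over the link graph:
# every page starts at 1/N, and on each pass
#   rank(p) = (1 - d)/N + d * sum(rank(q) / outdegree(q)) over pages q linking to p
# with damping factor d = 0.8, repeated for 100 passes.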
def compute_ranks(graph):
    d = 0.8  # damping factor
    numloops = 100
    ranks = {}
    npages = len(graph)
    for page in graph:
        ranks[page] = 1.0 / npages
    for i in range(0, numloops):
        newranks = {}
        for page in graph:
            newrank = (1 - d) / npages
            for node in graph:
                if page in graph[node]:
                    newrank = newrank + d * (ranks[node] / len(graph[node]))
            newranks[page] = newrank
        ranks = newranks
    return ranks
def search(keyword, index):
    # Return the set of URLs whose pages contain keyword (empty if none).
    if keyword in index:
        return index[keyword]
    else:
        return set()
def getKey(item):
    # Sort key: the rank component of a (url, rank) tuple.
    return item[1]
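
# main crawls from the seed, prints the index, computes ranks, then loops
# forever reading a keyword and a result count and printing the matching URLs
# sorted by descending rank.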
def main(seed, max_page, domain):
    index, graph = crawl_web(seed, max_page, domain)
    print(index)
    ranks = compute_ranks(graph)
    while True:
        l_tuple = []
        keyword = input("Please input your keyword:")
        topN = int(input("Please input # of the pages you want:"))
        urls = search(keyword, index)
        for url in urls:
            l_tuple.append((url, ranks[url]))
        print(sorted(l_tuple, key=getKey, reverse=True)[0:topN])
"""
print_links_in_relevance("http://nus.edu.sg", 10, "nus.edu")
ranks = compute_ranks(graph)
"""
if __name__ == "__main__":
    main("http://www.yoursingapore.com/en.html", 100, "yoursingapore.com")