-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetlink.py
57 lines (42 loc) · 1.06 KB
/
getlink.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#coding=utf-8
import re
import os
import urllib.request
web_url = 'http://www.naturalearthdata.com/downloads/10m-cultural-vectors/'
html_file_local_temp = 'html.txt'
html = ''
#因为代码需要调试,下载网页存储为本地文件速度快
if os.path.isfile(html_file) :
file_object = open(html_file_local_temp)
try:
html = file_object.read( )
finally:
file_object.close( )
else:
response = urllib.request.urlopen(web_url)
html = response.read()
print ('html 下载完成')
html = html.decode('utf-8')
with open(html_file_local_temp, "w") as code:
code.write(html)
#print (html)
#pattern = re.compile(r'href="(http://.*\.zip)"')
pattern = re.compile(r'http://(.+?\.zip)')
url_list = []
it = re.finditer(pattern, html)
for match in it:
url_list.append( match.group(0) )
#print ( match.group(0) )
#去除重复url
myset = set(url_list)
#alist = list(myset)
#输出到屏幕上
for item in myset:
print ( item )
#m = re.search(pattern, html)
#if m:
# for item in m.group:
# print( item )
# print('\n')
#else:
# print ('not match' )