-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathUrlProcessor.py
142 lines (119 loc) · 4.71 KB
/
UrlProcessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# -*- coding:utf-8 -*-
import string
from HTMLParser import HTMLParser
class UrlProcessor(HTMLParser):
    """Collect link targets, image sources and tag names from an HTML page.

    After one or more ``feed()`` calls the instance exposes:

    sublinks -- unique ``href`` values of ``<a>`` tags, in first-seen order,
                excluding the placeholder targets rejected by ``IsValid``.
    imgurls  -- unique ``src`` values of ``<img>`` tags, in first-seen order.
    taglogs  -- unique start-tag names, in first-seen order.
    """

    def __init__(self):
        # Explicit base-class call, as in the original code.
        HTMLParser.__init__(self)
        self.sublinks = []
        self.imgurls = []
        self.taglogs = []

    def LogTag(self, tag, atts):
        """Remember *tag* the first time it is seen; *atts* is unused."""
        if tag not in self.taglogs:
            self.taglogs.append(tag)

    def IsValid(self, url):
        """Return False for the placeholder targets "/" and "javascript:;"."""
        InvalidDic = ["/", "javascript:;"]
        return url not in InvalidDic

    def handle_starttag(self, tag, atts):
        """Record the tag name and harvest ``href``/``src`` attribute values."""
        self.LogTag(tag, atts)

        # Pick the attribute to look for and the list that receives it.
        if tag == "a":
            wanted, bucket = "href", self.sublinks
        elif tag == "img":
            wanted, bucket = "src", self.imgurls
        else:
            return

        for name, value in atts:
            # The parser reports a valueless attribute (e.g. ``<a href>``)
            # with value None; that is not a URL, so it is skipped instead
            # of being stored in the result lists.
            if name != wanted or value is None:
                continue
            # Only link targets are filtered through IsValid; image sources
            # are accepted as-is.
            if tag == "a" and not self.IsValid(value):
                continue
            if value not in bucket:
                bucket.append(value)
class SingleDJS_Processor(HTMLParser):
    """Extract the page count and the per-page URLs from one HTML document.

    The parser looks for two things:

    * a text node equal to "Pages"; the text node that follows it is parsed
      as the page count (the part before the first space is used);
    * a ``<div class="block spacetop preview-thumbnails">``; the next ``<a>``
      start tag after it supplies the ``href`` from which ``filename`` and
      ``PageUrls`` are derived.

    ``PageUrls`` is built from ``dwPageCount`` at the moment the ``<a>`` tag
    is seen, so the page count has to appear earlier in the document.
    """

    def __init__(self):
        # Explicit base-class call, as in the original code.
        HTMLParser.__init__(self)
        self.dwPageCount = 0
        self.PageUrls = []
        self.bNextAIsUrl = False
        self.bNextDataIsPageCount = False
        self.filename = ""
        # Written by handle_endtag; initialised here so the attribute always
        # exists on the instance.
        self.bInli = False

    def clear(self):
        """Reset the per-document results and pending-state flags.

        ``filename`` is intentionally left untouched, as before.
        """
        self.dwPageCount = 0
        self.PageUrls = []
        # Also drop a pending "next <a> is the URL" marker so that state left
        # over from a previous document cannot leak into the next one.
        self.bNextAIsUrl = False
        self.bNextDataIsPageCount = False

    def handle_starttag(self, tag, atts):
        """Arm on the thumbnail ``<div>``; consume the following ``<a>``."""
        if tag == "div":
            for name, value in atts:
                if name == "class" and value == "block spacetop preview-thumbnails":
                    self.bNextAIsUrl = True
        elif tag == "a" and self.bNextAIsUrl:
            self.bNextAIsUrl = False
            for name, value in atts:
                # A valueless ``href`` is reported as None and carries no
                # path to split, so it is ignored.
                if name == "href" and value is not None:
                    self._BuildPageUrls(value)

    def _BuildPageUrls(self, value):
        """Derive ``filename`` and append one URL per page from href *value*."""
        # Last path segment, including its leading "/".
        suffixstr = value[value.rfind("/"):]
        # In the part from the last "_" onwards, replace the digit "1" with
        # a placeholder that is later substituted by the page number.
        cut = suffixstr.rfind("_")
        tail = suffixstr[cut:].replace("1", "@@@###")
        suffixstr = suffixstr[:cut] + tail
        self.filename = suffixstr[1:suffixstr.rfind("_")]

        # Everything before the last two path segments.
        profixstr = value[:value.rfind("/")]
        profixstr = profixstr[:profixstr.rfind("/")]
        hoststr = "http://pururin.com"

        for i in range(self.dwPageCount):
            # NOTE(review): the original comment spoke of an index "with same
            # length" and computed an unused width, but the code has always
            # just prefixed a single "0"; that behaviour is kept - confirm
            # against the URL scheme actually served.
            indexstr = "0" + str(i)
            strFileName = suffixstr.replace("@@@###", str(i + 1))
            self.PageUrls.append(hoststr + profixstr + "/" + indexstr + strFileName)

    def handle_data(self, data):
        """Parse the page count from the text node that follows "Pages".

        Raises ValueError if that text node does not start with an integer.
        """
        if data == "Pages":
            self.bNextDataIsPageCount = True
        elif self.bNextDataIsPageCount and self.dwPageCount == 0:
            # partition() keeps the whole text when it contains no space;
            # slicing with find() == -1 would silently drop the last digit.
            count_text = data.partition(" ")[0]
            print(data)
            print(count_text + "1")
            self.dwPageCount = int(count_text)
            self.bNextDataIsPageCount = False
            print("find pagecount %u" % self.dwPageCount)

    def handle_endtag(self, tag):
        """Clear the ``bInli`` marker when an ``</li>`` is seen."""
        if tag == "li":
            self.bInli = False
class SinglePageDJS_Processor(HTMLParser):
    """Find the image URL on a single page.

    Only self-closing ``<img .../>`` tags are inspected. When such a tag has
    ``class="b"``, ``img`` is set to "http://pururin.com" followed by the
    tag's ``src`` value; a later matching tag overwrites an earlier one.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.Reset()

    def Reset(self):
        """Forget the image URL found so far."""
        self.img = ""

    def handle_startendtag(self, tag, attrs):
        """Capture ``src`` of a self-closing ``<img>`` whose class is "b"."""
        if tag != "img":
            return

        # Last occurrence of each attribute wins; a missing one stays "".
        found = {"class": "", "src": ""}
        for key, val in attrs:
            if key in found:
                found[key] = val

        if found["class"] == "b":
            self.img = "http://pururin.com" + found["src"]
class MainPageDJS_Processor(HTMLParser):
    """Collect gallery links and the next-page link from a listing page.

    After ``feed()``:

    SinglePageUrls -- every ``<a href>`` value containing "gallery", in
                      document order (duplicates are kept).
    NextPageUrl    -- the first non-gallery ``href`` seen while inside a
                      ``<div class="pager jumper">``; "" if none was found.
    """

    def __init__(self):
        # Explicit base-class call, as in the original code.
        HTMLParser.__init__(self)
        self.SinglePageUrls = []
        self.NextPageUrl = ""
        self.bInPageJumper = False

    def handle_endtag(self, tag):
        """Leave the pager section on any ``</div>``."""
        if tag == "div":
            self.bInPageJumper = False

    def handle_starttag(self, tag, attrs):
        """Track the pager ``<div>`` and classify ``<a href>`` targets."""
        if tag == "div":
            for name, value in attrs:
                if name == "class" and value == "pager jumper":
                    self.bInPageJumper = True
        if tag == "a":
            for name, value in attrs:
                # A valueless ``href`` is reported as None; calling string
                # methods on it would abort the whole feed(), so skip it.
                if name != "href" or value is None:
                    continue
                if "gallery" in value:
                    self.SinglePageUrls.append(value)
                elif self.bInPageJumper and self.NextPageUrl == "":
                    # Only the first pager link is kept.
                    self.NextPageUrl = value