#!/usr/bin/python
import re
import sys
import urlparse
from bs4 import BeautifulSoup as soup
import sre_constants
import netlib
import URLStripper
import Result
"""Some changes,
*I'm coding while watching reruns of the Office Season 6 lols
*Operator Works in Result objects now
"""
class Operator:
"""
Operator: the object that does all of dorks dirty work for it
*applies URLStripper calls
*applies all the neccesary netlib calls
*performs the dorks
>inurl,intext....
and returns them to Dork, where all the boolean logic will be performed
"""
	def __init__(self):
		self.netlib = netlib.netlib("Internet Explorer 6.0")
		self.stripper = URLStripper.URLStripper()
		self.ResList = dict() #a dictionary of Result objects, keyed by URL, to keep the final list
"""Get the HTML of the page from the supplied url
url --- the Resource locator of the page
returns a single str containing the HTML
"""
def setUserAgent(self,UA): #set the user-agent header for all page requests
self.netlib = netlib.netlib(UA)
def setTimeOut(self,timeout): #set the time_out for net requests
self.netlib.set_timeOut(timeout)
def setGoogleResLimit(self,limit): #set the maximum result limit
self.netlib.set_googleResLimit(limit)
def getHTML(self,url):
page = self.netlib.getPage(url)
# print page[1]
if page[0] == False:
sys.stderr.write("Problem fetching page...[%s]" % (url))
return False
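	# A minimal configuration sketch (the values are illustrative, not defaults;
	# assumes the netlib module shipped alongside this file):
	#	op = Operator()
	#	op.setUserAgent("Mozilla/5.0")
	#	op.setTimeOut(10)
	#	op.setGoogleResLimit(50)
	#	html = op.getHTML("http://example.com/") #returns the page HTML, or False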
"""Google Search the given query
returns the links that google replied with corresponding to the query
"""
def goosearch(self,start,query):
page = self.netlib.googleSearch(start,query)
if page[0] == False:
sys.stderr.write("Problem fetching page...[%s]" % (query))
return False
links = self.stripper.strip(page[1])
return links
	def buildResults(self,links): #builds a list of results, prevents redundant calls to servers
		#simply gets the HTML for each page and adds it to the results dictionary
		#I could have bs4 implemented in the Result obj, but I feel it's not necessary now
		for link in links:
			if link in self.ResList: #already fetched, skip the redundant request
				continue
			HTML = self.getHTML(link) #I must remember to add the response headers to the Result object
			res = Result.Result(link,[],"",[],HTML) #I know it's a little redundant to have the link in the Result object as well,
			#but I intend to use this object as an easy way to dump info to a database/XML file later
			self.ResList[link] = res #add this to the results dictionary
		#I decided on a dictionary because it's easier for GooDork to work with
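	# A hedged sketch of the intended pipeline (the dork string and URL are
	# illustrative; goosearch/buildResults behave as defined above):
	#	op = Operator()
	#	links = op.goosearch(0,"inurl:.php?id=")
	#	if links:
	#		op.buildResults(links) #fetch each page once, cached as Result objects
	#		hits = [link for link in links if op.intext("admin",link)]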
"""Search the displayable text of a page for a given regex pattern
pattern
url --- the Resource locator of the page
pattern --- the regex pattern to apply
returns True if the regex form appears in the page
returns False if it does not
"""
def _intext(self,pattern,url):
hasPat = False
try:
res = self.Res_list[url]
except KeyError:
return False #the url does not exist in the dictionary
for string in soup(res.HTML):
if re.search(pattern,HTML) != None:
res.Summary += "*\n".join(re.match(pattern,res.HTML))
hasPat = True
return hasPat
	def intext(self,pattern,url):
		print "Searching in text of %s for %s" % (url,pattern)
		#html = self.getHTML(url) #this is the kind of thing buildResults prevents
		html = self.ResList[url].HTML
		if html == False or html == "":
			return False
		#now we search the text of the page
		try:
			for string in soup(html).strings:
				#print string
				if re.search(pattern,string) != None:
					return True
		except Exception: #this happens when the file is not HTML! I'm going to fix this later so you can search SQL/XML files as well
			return re.search(pattern,html) != None #there, fixed!
		return False
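	# Example (the URL is hypothetical and must already be cached by buildResults):
	#	op.intext(r"index of /","http://example.com/files/") #True if the visible text matches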
"""Search the url supplied for the given regex pattern
url --- the Resource locator to search
pattern --- the regex pattern to apply
returns True if the pattern does appear in the url supplied
returns False if not
"""
def _inurl(self,pattern,url): #the new methods, implimenting the Result object
try:
res = self.Res_list[url]
except KeyError:
return False
hasPat=(re.search(pattern,res.URL) != None)
if hasPat:
res.Summary += '*\n'.join(re.match(pattern,res.URL))
return hasPat
	def inurl(self,pattern,url):
		#I should let them just use regex! Need to read up on Python regex
		#print "re.search(%s,%s)" % (pattern,url)
		try:
			res = re.search(pattern,url)
		except sre_constants.error,e:
			print "Problem with your regex pattern < %s >" % (pattern)
			sys.exit()
		#print res
		return res != None
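	# Example (illustrative URL; inurl works on the raw string, no caching needed):
	#	op.inurl(r"\.php\?id=\d+","http://example.com/view.php?id=7") #True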
"""Search the title tag of a page for the given regex pattern
url --- the Resource locator (URL) to the page
pattern --- the regex pattern to apply
returns True if the regex does appear in the title of the page
returns False if it does not
"""
def _intitle(self,pattern,url):
try:
res = self.Res_list[url]
except KeyError:
return False
hasPat=(re.search(pattern,res.title) != None)
if hasPat:
res.Summary += '*\n'.join(re.split(pattern,res.title))
return hasPat
	def intitle(self,pattern,url):
		html = self.ResList[url].HTML
		try:
			title = soup(html).title
		except Exception:
			return False
		if title == None or title.string == None: #page has no usable <title>
			return False
		return re.search(pattern,title.string) != None
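	# Example (hypothetical cached URL):
	#	op.intitle(r"[Ll]ogin","http://example.com/") #True if the <title> text matches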
	def _inanchor(self,pattern,url):
		try:
			res = self.ResList[url]
		except KeyError:
			return False
		hasPat=False
		for anchor in soup(res.HTML).findAll("a"):
			href = anchor.get("href")
			if href == None: #anchor without an href attribute
				continue
			match = re.search(pattern,href)
			if match != None:
				res.Summary += match.group(0) + "*\n" #record the match in the Result summary
				hasPat=True
		return hasPat
	def inanchor(self,pattern,url):
		isFound = False
		html = self.ResList[url].HTML
		if html == False:
			return False
		try:
			anchors = soup(html).findAll("a")
			for anchor in anchors:
				href = anchor.get("href")
				if href != None and self.inurl(pattern,href):
					isFound=True
		except Exception:
			return isFound
		return isFound
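	# Example (hypothetical cached URL):
	#	op.inanchor(r"logout","http://example.com/") #True if any <a href=...> matches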
	def inscript(self,pattern,url):
		html = self.ResList[url].HTML
		if html == False:
			return False
		try:
			script_tags = soup(html).findAll("script")
			for tag in script_tags:
				if re.search(pattern,str(tag.contents)):
					return True
		except Exception, e:
			print e
		return False #no <script> tag matched
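	# Example (hypothetical cached URL):
	#	op.inscript(r"document\.cookie","http://example.com/") #True if a <script> body matches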
"""This has yet to be implemented
I hope to be able to have users supply a dork and return have Operator return all the URLs 'related' to the urls from the dork query
e.g
dork .php?*wp-content*=* -related
will return all the URLS that are related to the results returned from the dork
"""
def related(self): #this is gonna take a lil thought to apply properly, will be quite powerful!
return
"""This has yet to be implemented
This will apply the regex pattern to the domain of the given url string
"""
def site(self,url,
pattern):
return
#def cache(self):
# return
"""Search the <input> tags of a page for the supplied regex pattern
in the vales of the supplied attribute
e.g
" ininput(example.com,type,hidden) " will return True if the are input tags
where the attribute type is set to hidden i.e <input type=hidden>
" -ininput [\w]=[\d] " will return True if there are any input tags
where any attributes set to integer data i.e <input abcdefgh123456709=1>
url --- the Resource locator to the page
attr --- the name of the attribute
pattern --- the regex pattern to be applied
returns True if the pattern does appear in the value of the attribute of the input tags
returns False if it does not
"""
def ininput(self,url
,attr,pattern):
return
"""the same as above but applied to the <form> tag
"""
def inform(self,url,
attr,pattern):
return
"""the same as above but applied to the <img> tag
*PROTIP: this will help you find sites that have been XSSed
"""
def inimg(self,url,
attr,pattern):
return
"""Search the contents of the script tags on a page for the supplied pattern
url --- the Resouce locator of the page
pattern -- the regex to be applied to the tag
Returns True if the pattern was found the contents of the script tag
Returns False if not
*PROTIP: this will help you find sites that have been XSSed
"""
def inscript_tag(self,url,
pattern):
return
#I used these methods to test the implementations ;)
	def inanchor_(self,html,pattern):
		f = open(html,"r")
		html = f.read()
		f.close()
		anchors = soup(html).findAll("a")
		for anchor in anchors:
			if re.search(pattern,str(anchor.get("href"))) != None:
				print "Found :",anchor
				return True
		return False
	def intext_(self,html,pattern):
		f = open(html,"r")
		html = f.read()
		f.close()
		for string in soup(html).strings:
			if re.search(pattern,string) != None:
				print "Found :",string
				return True
		return False
	def _inscript(self,pattern):
		f = open('Sample2.html',"r")
		html = f.read()
		f.close()
		if html == "": #nothing to search
			return False
		try:
			script_tags = soup(html).findAll("script")
			for tag in script_tags:
				if re.search(pattern,str(tag.contents)):
					print "Found :",tag
					return True
		except Exception, e:
			print e
		return False
if __name__ == "__main__":
	op = Operator()
	if len(sys.argv) > 1:
		op._inscript(sys.argv[1])
	else:
		print "usage: %s <pattern>" % (sys.argv[0])
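# A hedged offline usage sketch (assumes local HTML files such as Sample2.html
# sit alongside this script; the filenames and patterns are illustrative):
#	$ python Operator.py "document\.cookie"	#runs _inscript against Sample2.html
#	op = Operator()
#	op.intext_("Sample.html",r"login")	#prints each matching text node
#	op.inanchor_("Sample.html",r"\.php\?")	#prints each matching anchor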