-
Notifications
You must be signed in to change notification settings - Fork 41
/
AnalyzePDF.py
executable file
·390 lines (347 loc) · 15.4 KB
/
AnalyzePDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
#!/usr/bin/env python
"""
Analyzes PDF files by looking at their characteristics in order to add some intelligence into the determination of them being malicious or benign.
Usage:
$ AnalyzePDF.py [-h] [-m MOVE] [-y YARARULES] Path
Produces a high level overview of a PDF to quickly determine if further
analysis is needed based on it's characteristics
positional arguments:
Path Path to directory/file(s) to be scanned
optional arguments:
-h, --help show this help message and exit
-m MOVE, --move MOVE Directory to move files triggering YARA hits to
-y YARARULES, --yararules YARARULES
Path to YARA rules. Rules should contain a weighted
score in the metadata section. (i.e. weight = 3)
example: python AnalyzePDF.py -m tmp/badness -y foo/pdf.yara bar/getsome.pdf
"""
# AnalyzePDF.py was created by Glenn P. Edwards Jr.
# http://hiddenillusion.blogspot.com
# @hiddenillusion
# Version 0.2
# Date: 10-11-2012
# Requirements:
# - Python 2.x
# - YARA (http://plusvic.github.io/yara/)
# - pdfid (http://blog.didierstevens.com/programs/pdf-tools/)
# Optional:
# * This script will work without these but may miss some conditions to evaluate based on the missing data they would provide (i.e. - # of Pages) *
# - pdfinfo (www.foolabs.com/xpdf/download.html)
# - a "weight" field within the YARA's rule meta should be added to help in the final evaluation
# i.e. - rule pdf_example {meta: weight = 3 strings: $s = "evil" condition: $s}
# To-Do:
# - suppress pdfid's output log
# - be able to print out which conditions it met in the rules
import os
import subprocess
import shutil
import sys
import datetime
import time
import argparse
import binascii
import re
import zipfile
import shutil
import hashlib
from decimal import Decimal
"""
Chose to _import_ PDFiD instead of just using subprocess to spawn it so it can be statically compiled for use on Windows.
If you don't have it installed on your system, you can just download it and have it in the same directory as this script.
"""
try:
import pdfid
except ImportError:
print "[!] PDFiD not installed"
sys.exit()
try:
import yara
except ImportError:
print "[!] Yara not installed"
sys.exit()
# Initialize the list(s) where PDF attribs will be added to
counter = []
page_counter = []
# Initialize the YARA scoring count
yscore = []
ydir = False
# Misc. formatting
trailer = ("=" * 35)
filler = ("-" * 35)
parser = argparse.ArgumentParser(description='Produces a high level overview of a PDF to quickly determine if further analysis is needed based on it\'s characteristics')
parser.add_argument('-m','--move', help='Directory to move files triggering YARA hits to', required=False)
parser.add_argument('-y','--yararules', help='Path to YARA rules. Rules should contain a weighted score in the metadata section. (i.e. weight = 3)', required=False)
parser.add_argument('Path', help='Path to directory/file(s) to be scanned')
args = vars(parser.parse_args())
# Verify supplied path exists or die
if not os.path.exists(args['Path']):
print "[!] The supplied path does not exist"
sys.exit()
# Configure YARA rules
if args['yararules']:
rules = args['yararules']
else:
rules = '/usr/local/etc/capabilities.yara' # REMnux location
if not os.path.exists(rules):
print "[!] Correct path to YARA rules?"
sys.exit()
else:
try:
r = yara.compile(rules)
if args['move']:
ydir = args['move']
except Exception, msg:
print "[!] YARA compile error: %s" % msg
sys.exit()
def main():
# Set the path to file(s)
ploc = args['Path']
if os.path.isfile(ploc):
fileID(ploc)
elif os.path.isdir(ploc):
pwalk(ploc)
# Quote idea credited to: https://github.com/marpaia/jadPY ... useful for Windows, what can I say...
def q(s):
quote = "\""
s = quote + s + quote
return s
def sha256(pdf):
try:
f = open(pdf, "rb")
data = f.read()
sha256 = hashlib.sha256(data).hexdigest()
f.close()
except Exception, msg:
print msg
return sha256
def fileID(pdf):
"""
Generally the PDF header will be within the first (4) bytes but since the PDF specs say it
can be within the first (1024) bytes I'd rather check for atleast (1) instance
of it within that large range. This limits the chance of the PDF using a header
evasion trick and then won't end up getting analyzed. This evasion behavior could later
be detected with a YARA rule.
"""
f = open(pdf,'rb')
s = f.read(1024)
if '\x25\x50\x44\x46' in s:
print "\n" + trailer
print "[+] Analyzing: %s" % pdf
print filler
print "[-] Sha256: %s" % sha256(pdf)
info(pdf)
elif os.path.isdir(pdf): pwalk(pdf)
f.close()
def pwalk(ploc):
# Recursivly walk the supplied path and process files accordingly
for root, dirs, files in os.walk(ploc):
for name in files:
f = os.path.join(root, name)
fileID(f)
def info(pdf):
command = "pdfinfo " + q(pdf)
try:
p = subprocess.Popen(command,shell=True,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
#for line in p.stdout:
# if re.match('Pages:\s+(0|1)$', line):
# counter.append("pages")
# print "[-] (1) page PDF"
for line in p.stderr:
if re.search('Unterminated hex string|Loop in Pages tree|Illegal digit in hex char in name', line):
counter.append("sketchy")
print "[-] Sketchyness detected"
elif re.search('Unexpected end of file in flate stream|End of file inside array', line):
counter.append("eof")
print "[-] EoF problem"
elif re.search('Couldn\'t find trailer dictionary', line):
counter.append("trailer")
elif re.search('Invalid XRef entry|No valid XRef size in trailer|Invalid XRef entry|Couldn\'t read xref table', line):
counter.append("xref")
print "[-] Invalid XREF"
break
except Exception, msg:
print "[!] pdfinfo error: %s" % msg
pass
id(pdf)
def id(pdf):
try:
# (dir, allNames, extraData, disarm, force), force)
command = pdfid.PDFiD2String(pdfid.PDFiD(pdf, True, True, False, True), True)
extra = True
except Exception:
# I've observed some files raising errors with the 'extraData' switch
command = pdfid.PDFiD2String(pdfid.PDFiD(pdf, True, False, False, True), True)
print "[!] PDFiD couldn\'t parse extra data"
extra = False
for line in command.split('\n'):
count = re.split(r'[\s]+', line)
if "PDF Header" in line and not re.match('%PDF-1\.\d', count[3]):
counter.append("header")
print "[-] Invalid version number : \"%s\"" % count[3]
elif "/Page " in line:
page_counter.append(count[2])
elif "/Pages " in line:
page_counter.append(count[2])
elif "/JS " in line and not re.match('0', count[2]):
counter.append("js")
print "[-] JavaScript count.......: %s" % count[2]
if count[2] > "1":
counter.append("mucho_javascript")
print "\t[*] That\'s a lot of js ..."
elif "/AcroForm " in line and not re.match('0', count[2]):
counter.append("acroform")
print "[-] AcroForm...............: %s" % count[2]
elif "/AA " in line and not re.match('0', count[2]):
counter.append("aa")
print "[-] Additional Action......: %s" % count[2]
elif "/OpenAction " in line and not re.match('0', count[2]):
counter.append("oa")
print "[-] Open Action............: %s" % count[2]
elif "/Launch " in line and not re.match('0', count[2]):
counter.append("launch")
print "[-] Launch Action..........: %s" % count[2]
elif "/EmbeddedFiles " in line and not re.match('0', count[2]):
counter.append("embed")
print "[-] Embedded File..........: %s" % count[2]
#elif "trailer" in line and not re.match('0|1', count[2]):
# print "[-] Trailer count..........: %s" % count[2]
# print "\t[*] Multiple versions detected"
elif "Total entropy:" in line:
tentropy = count[3]
print "[-] Total Entropy..........: %7s" % count[3]
elif "Entropy inside streams:" in line:
ientropy = count[4]
print "[-] Entropy inside streams : %7s" % count[4]
elif "Entropy outside streams:" in line:
oentropy = count[4]
print "[-] Entropy outside streams: %7s" % count[4]
"""
Entropy levels:
0 = orderly, 8 = random
ASCII text file = ~2/4
ZIP archive = ~ 7/8
PDF Malicious
- total : 6.3
- inside : 6.6
- outside : 4.9
PDF Benign
- total : 6.7
- inside : 7.2
- outside : 5.1
Determine if Total Entropy & Entropy Inside Stream are significantly different than Entropy Outside Streams -> i.e. might indicate a payload w/ long, uncompressed NOP-sled
ref = http://blog.didierstevens.com/2009/05/14/malformed-pdf-documents
"""
if not extra == False:
te_long = Decimal(tentropy)
te_short = Decimal(tentropy[0:3])
ie_long = Decimal(ientropy)
ie_short = Decimal(ientropy[0:3])
oe_long = Decimal(oentropy)
oe_short = Decimal(oentropy[0:3])
ent = (te_short + ie_short) / 2
# I know 'entropy' might get added twice to the counter (doesn't matter) but I wanted to separate these to be alerted on them individually
togo = (8 - oe_long) # Don't want to apply this if it goes over the max of 8
if togo > 2:
if oe_long + 2 > te_long:
counter.append("entropy")
print "\t[*] Entropy of outside stream is questionable:"
print "\t[-] Outside (%s) +2 (%s) > Total (%s)" % (oe_long,oe_long +2,te_long)
elif oe_long > te_long:
counter.append("entropy")
print "\t[*] Entropy of outside stream is questionable:"
print "\t[-] Outside (%s) > Total (%s)" % (oe_long,te_long)
if str(te_short) <= "2.0" or str(ie_short) <= "2.0":
counter.append("entropy")
print "\t[*] LOW entropy detected:"
print "\t[-] Total (%s) or Inside (%s) <= 2.0" % (te_short,ie_short)
# Process the /Page(s) results here just to make sure they were both read
if re.match('0', page_counter[0]) and re.match('0', page_counter[1]):
counter.append("page")
print "[-] Page count suspicious:"
print "\t[*] Both /Page (%s) and /Pages (%s) = 0" % (page_counter[0],page_counter[1])
elif re.match('0', page_counter[0]) and not re.match('0', page_counter[1]):
counter.append("page")
print "[-] Page count suspicious, no individual pages defined:"
print "\t[*] /Page = (%s) , /Pages = (%s)" % (page_counter[0],page_counter[1])
elif re.match('1$', page_counter[0]):
counter.append("page")
print "[-] (1) page PDF"
yarascan(pdf)
def yarascan(pdf):
try:
ymatch = r.match(pdf)
if len(ymatch):
print "[-] YARA hit(s): %s" % ymatch
for rule in ymatch:
meta = rule.meta
for key, value in meta.iteritems():
# If the YARA rule has a weight in it's metadata then parse that for later calculation
if "weight" in key:
yscore.append(value)
if not ydir == False:
print "[-] Moving malicious file to:",ydir
# This will move the file if _any_ YARA rule triggers...which might trick you if the
# rule that triggers on it doesn't have a weight or is displayed in the output
if not os.path.exists(ydir):
os.makedirs(ydir)
try:
shutil.move(pdf, ydir)
except Exception, msg:
continue
except Exception, msg:
print msg
eval(counter)
def eval(counter):
"""
Evaluate the discovered contents of the PDF and assign a severity rating
based on the conditions configured below.
Rating system: 0 (benign), >=2 (sketchy), >=3 (medium), >=5 (high)
"""
print filler
ytotal = sum(yscore)
print "[-] Total YARA score.......: %s" % ytotal
sev = 0
# Below are various combinations used to add some intelligence and help evaluate if a file is malicious or benign.
# This is where you can add your own thoughts or modify existing checks.
# HIGH
if "page" in counter and "launch" in counter and "js" in counter: sev = 5
elif "page" in counter and "xref" in counter: sev += 5
elif "page" in counter and "aa" in counter and "js" in counter: sev += 5
elif "page" in counter and "oa" in counter and "js" in counter: sev += 5
# MEDIUM
if "header" in counter and "xref" in counter: sev += 3
elif "header" in counter and "js" in counter and "page" in counter: sev += 3
elif "header" in counter and "launch" in counter and "page" in counter: sev += 3
elif "header" in counter and "aa" in counter and "page" in counter: sev += 3
if "page" in counter and "mucho_javascript" in counter: sev += 3
elif "page" in counter and "acroform" in counter and "embed" in counter: sev += 3
elif "page" in counter and "acroform" in counter and "js" in counter: sev += 3
if "entropy" in counter and "page" in counter: sev += 3
elif "entropy" in counter and "aa" in counter: sev += 3
elif "entropy" in counter and "oa" in counter: sev += 3
elif "entropy" in counter and "js" in counter: sev += 3
if "oa" in counter and "js" in counter: sev += 3
if "aa" in counter and "mucho_javascript" in counter: sev += 3
# Heuristically sketchy
if "page" in counter and "js" in counter: sev += 2
if "sketchy" in counter and "page" in counter: sev += 2
elif "sketchy" in counter and "aa" in counter: sev += 2
elif "sketchy" in counter and "oa" in counter: sev += 2
elif "sketchy" in counter and "launch" in counter: sev += 2
elif "sketchy" in counter and "eof" in counter: sev += 1
if "page" in counter and "aa" in counter: sev += 1
if "page" in counter and "header" in counter: sev += 1
if "header" in counter and "embed" in counter: sev += 1
print "[-] Total severity score...: %s" % sev
sev = (ytotal + sev)
print "[-] Overall score..........: %s" % sev
if sev >= 5: print trailer + "\n[!] HIGH probability of being malicious"
elif sev >= 3: print trailer + "\n[!] MEDIUM probability of being malicious"
elif sev >= 2: print trailer + "\n[!] Heuristically sketchy"
elif sev >= 0: print trailer + "\n[-] Scanning didn't determine anything warranting suspicion"
# Clear out the scores to start fresh for the next analysis
del counter[:]
del page_counter[:]
del yscore[:]
if __name__ == "__main__":
main()