-
Notifications
You must be signed in to change notification settings - Fork 18
/
basesite.py
executable file
·356 lines (328 loc) · 11.3 KB
/
basesite.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
#!/usr/bin/python
import os # fs: exists, mkdir, listdir, rmdir
import time # Sleep
import sys
from threading import Thread
from zipfile import ZipFile, ZIP_DEFLATED
from Web import Web
from shutil import rmtree, copy2
from commands import getstatusoutput
from time import strftime
# Try to import Python Image Library
try:
from PIL import Image
except ImportError:
# Python Image Library not installed, no thumbnail support
Image = None
# Name of the per-album log file kept inside the album's working directory.
LOG_NAME = 'log.txt'
# Directory (relative to the current working directory) that holds all rips.
RIP_DIRECTORY = 'rips'
# Thread-count threshold used to throttle download_image.
MAX_THREADS = 3
# Image count at which hit_image_limit starts reporting True.
MAX_IMAGES = 500
# Images whose width or height exceeds this value get no thumbnail.
MAX_THUMB_DIM = 6000
# Files larger than this many bytes (5 MiB) get no thumbnail.
MAX_THUMB_SIZE = 5 * (1024 * 1024)
"""
Abstract Python 'interface' for a site ripper.
Each inheriting/implementing class *must* override:
* sanitize_url
-- Must raise Exception if given URL is not valid
* get_dir
-- Must return directory name to download album to
-- Usually this is based on the URL/gallery/album name
-- Should be unique for every album on site
* download
-- Retrieves content from URL, downloads albums
-- Does not complete until entire album is downloaded
"""
class basesite(object):
    """
    Abstract base class for a site ripper.

    Subclasses *must* override:
     * sanitize_url -- validate/normalize the album URL (raise if invalid)
     * get_dir      -- return the directory name to download the album to
     * download     -- fetch the album; should not return until finished

    The base class provides logging, (threaded) image downloading,
    thumbnail creation and zipping of the working directory.
    """

    def __init__(self, url, debugging=False):
        """
        Set up ripper state for the given album URL.

        Creates the base rip directory if it is missing, sanitizes the URL
        through the subclass and derives the working-directory and log-file
        paths. The working directory itself is not created here (see
        init_dir).

        Raises whatever sanitize_url / get_dir raise (e.g. the URL is not
        appropriate for the site class) and any error from os.mkdir.
        """
        self.debugging = debugging
        # Web object for downloading/parsing
        self.web = Web(debugging=self.debugging)
        self.base_dir = RIP_DIRECTORY
        if not os.path.exists(self.base_dir):
            os.mkdir(self.base_dir)
        self.original_url = url
        self.url = self.sanitize_url(url)
        # Directory to store images in
        self.working_dir = '%s%s%s' % (self.base_dir, os.sep, self.get_dir(self.url))
        self.max_threads = MAX_THREADS
        self.thread_count = 0
        self.image_count = 0
        self.max_images = MAX_IMAGES
        self.logfile = '%s%s%s' % (self.working_dir, os.sep, LOG_NAME)
        # True until the first log() call, which writes a header line first
        self.first_log = True

    def sanitize_url(self, url):
        """ To be overridden: return the cleaned URL, raise if it is invalid """
        # NotImplementedError is still an Exception for existing callers
        raise NotImplementedError("Method 'sanitize_url' was not overridden!")

    def get_dir(self, url):
        """ To be overridden: return directory name to store photos in """
        raise NotImplementedError("Method 'get_dir' was not overridden!")

    def init_dir(self):
        """ Creates working dir unless it exists or a finished zip exists """
        if not os.path.exists(self.working_dir) and \
                self.existing_zip_path() is None:
            os.mkdir(self.working_dir)

    def hit_image_limit(self):
        """ Returns True (and logs) if we hit the image limit, False otherwise """
        if self.image_count >= self.max_images:
            self.log('hit image limit: %d >= %d' % (self.image_count, self.max_images))
            return True
        return False

    def download(self):
        """ To be overridden: download the whole album """
        raise NotImplementedError("Method 'download' was not overridden!")

    def is_downloading(self):
        """ Checks if album is already being downloaded (log file exists) """
        return os.path.exists(self.logfile)

    def log(self, text, overwrite=False):
        """
        Writes a line to the log file.

        text      -- line to write; double quotes are backslash-escaped
        overwrite -- truncate the log file instead of appending

        The first call on an instance first appends a header line with the
        original URL and a timestamp (strftime output with a literal 'PDT'
        suffix). When debugging is on, text is echoed to stderr as well.
        """
        if self.first_log:
            # Clear the flag before recursing so the header is written once
            self.first_log = False
            header = 'http://rip.rarchives.com - file log for URL %s @ %s' % (
                self.original_url, strftime('%Y-%m-%dT%H:%M:%S PDT'))
            self.log(header, overwrite=False)
        if self.debugging:
            sys.stderr.write('%s\n' % text)
        text = text.replace('"', '\\"')
        mode = 'w' if overwrite else 'a'
        # 'with' closes (and flushes) the file even if the write fails
        with open(self.logfile, mode) as f:
            f.write("%s\n" % text)

    def get_log(self, tail_lines=1):
        """
        Gets last line(s) from log.

        Returns a list with at most tail_lines lines, or an empty string
        if the log file does not exist.
        """
        if not os.path.exists(self.logfile):
            return ''
        with open(self.logfile, 'r') as f:
            lines = f.read().strip().split('\n')
        # Clamp the start index: a negative start would silently drop lines
        # when more lines are requested than the log contains.
        return lines[max(0, len(lines) - tail_lines):]

    def download_image(self, url, index, total='?', subdir='', saveas=None):
        """
        Starts separate thread to download image from URL.

        url    -- address of the file to download
        index  -- position of the file in the album (used in file name/log)
        total  -- number of files in the album, '?' if unknown
        subdir -- optional sub directory of the working dir to save into
        saveas -- file name to use as-is; when None the name is taken from
                  the URL and prefixed with the zero-padded index

        If the target file already exists it is only counted and logged.
        Blocks while the number of running download threads is at the limit.
        """
        unique_saveas = saveas is not None
        if saveas is None:
            saveas = url[url.rfind('/')+1:]
        # Strip extraneous / non FS safe characters
        if '?' in saveas:
            saveas = saveas[:saveas.find('?')]
        if ':' in saveas:
            saveas = saveas[:saveas.find(':')]
        # Add a file extension if necessary
        if '.' not in saveas:
            m = self.web.get_meta(url)
            ct = 'image/jpeg'  # Default to jpg
            if 'Content-Type' in m:
                ct = m['Content-Type']
            ext = ct[ct.rfind('/')+1:]
            if ext == 'jpeg':
                ext = 'jpg'
            saveas = '%s.%s' % (saveas, ext)
        # Setup subdirectory saves
        if subdir != '':
            subdir = '/%s' % subdir
        savedir = '%s%s' % (self.working_dir, subdir)
        if not os.path.exists(savedir):
            os.mkdir(savedir)
        if unique_saveas:
            saveas = '%s/%s' % (savedir, saveas)
        else:
            saveas = '%s/%03d_%s' % (savedir, index, saveas)
        if os.path.exists(saveas):
            self.log('file exists: %s' % saveas)
            self.image_count += 1
            return
        # Wait for a free slot; '>=' keeps at most max_threads workers alive
        while self.thread_count >= self.max_threads:
            time.sleep(0.1)
        self.thread_count += 1
        args = (url, saveas, index, total)
        t = Thread(target=self.download_image_thread, args=args)
        t.start()

    def download_image_thread(self, url, saveas, index, total):
        """
        Thread body: downloads one file, creates its thumbnail, logs result.

        Only content types containing image, video, audio or octet-stream
        are downloaded. thread_count is always decremented on exit, even
        when the download raises, so wait_for_threads cannot hang.
        """
        indextotal = self.get_index_total(index, total)
        try:
            m = self.web.get_meta(url)
            if 'Content-Type' not in m:
                text = 'no Content-Type found at URL %s' % (url)
            elif not any(kind in m['Content-Type']
                         for kind in ('image', 'video', 'audio', 'octet-stream')):
                text = 'no image/video/octet-stream in Content-Type (found "%s") for URL %s' % (m['Content-Type'], url)
            elif self.web.download(url, saveas):
                self.image_count += 1
                # Create thumbnail
                thumbnail = self.create_thumb(saveas)
                text = 'downloaded %s (%s) - source: (%s) thumbnail: (%s)' % (indextotal, self.get_size(saveas), url, thumbnail)
            else:
                text = 'download failed %s - %s' % (indextotal, url)
        except Exception as e:
            # Report an unexpected error as a failed download instead of
            # letting the thread die with the counter still incremented.
            self.debug('exception while downloading %s: %s' % (url, str(e)))
            text = 'download failed %s - %s' % (indextotal, url)
        try:
            self.log(text)
        finally:
            self.thread_count -= 1

    def save_image(self, url, saveas, index, total='?'):
        """ Same-thread download/save (does not launch new thread) """
        indextotal = self.get_index_total(index, total)
        if os.path.exists(saveas):
            self.image_count += 1
            self.log('file exists: %s' % saveas)
        elif self.web.download(url, saveas):
            self.image_count += 1
            thumbnail = self.create_thumb(saveas)
            self.log('downloaded %s (%s) - source: (%s) thumbnail: (%s)' % (indextotal, self.get_size(saveas), url, thumbnail))
        else:
            self.log('download failed %s - %s' % (indextotal, url))

    def wait_for_threads(self):
        """
        Wait for threads to finish downloading.
        Deletes the working dir if it holds at most one entry afterwards
        (i.e. nothing besides the log was saved).
        """
        while self.thread_count > 0:
            time.sleep(0.1)
        if os.path.exists(self.working_dir):
            if len(os.listdir(self.working_dir)) <= 1:
                rmtree(self.working_dir)  # Delete everything in working dir

    def get_size(self, filename):
        """
        Returns human-readable filesize for file.

        Examples: '2.00kb', '1.50mb', '500.00b'; '0b' for an empty file
        and '?b' if the size cannot be determined.
        """
        try:
            size = os.path.getsize(filename)
        except Exception:
            return '?b'
        unit = 1024 * 1024 * 1024
        for prefix in ('g', 'm', 'k', ''):
            if size >= unit:
                return '%.2f%sb' % (float(size) / float(unit), prefix)
            unit //= 1024  # integer division keeps the unit exact
        return '0b'

    def existing_zip_path(self):
        """
        Returns path to zip file if it exists, otherwise None.
        Does not return path if zipping is in progress, i.e. the working
        directory still exists and contains the 'zipping.txt' flag file.
        """
        zip_path = '%s.zip' % (self.working_dir)
        if not os.path.exists(zip_path):
            return None
        if not os.path.exists(self.working_dir):
            # No directory; only zip exists
            return zip_path
        if not os.path.exists('%s%szipping.txt' % (self.working_dir, os.sep)):
            # 'zipping' file/flag does not exist
            return zip_path
        return None

    def zip(self):
        """
        Zips site's working directory into '<working_dir>.zip'.
        Thumbnail directories and service files are left out of the archive.
        Nothing is deleted here.
        Returns path to zip file.
        """
        self.log('zipping album...')
        zip_filename = '%s.zip' % self.working_dir
        # Files used by the service (matched by suffix, as before):
        #   zipping.txt  - album is currently zipping
        #   complete.txt - album download completed
        #   ip.txt       - IP address of ripper
        #   reports.txt  - number of reports, report messages
        service_files = ('zipping.txt', 'complete.txt', 'ip.txt', 'reports.txt')
        z = ZipFile(zip_filename, "w", ZIP_DEFLATED)
        try:
            for root, dirs, files in os.walk(self.working_dir):
                if os.path.basename(root) == 'thumbs':
                    continue  # Do not zip thumbnails
                for fn in files:
                    if fn.endswith(service_files):
                        continue
                    absfn = os.path.join(root, fn)
                    # Path relative to the working directory
                    zfn = absfn[len(self.working_dir)+len(os.sep):]
                    z.write(absfn, zfn)
        finally:
            # Always close so a failure does not leak the archive handle
            z.close()
        return zip_filename

    def _thumb_path(self, inp):
        """
        Returns the path of inp inside a 'thumbs' sub directory next to it,
        creating that sub directory if needed (best effort).
        """
        fields = inp.split(os.sep)
        fields.insert(-1, 'thumbs')
        thumb_dir = os.sep.join(fields[:-1])
        if not os.path.exists(thumb_dir):
            try:
                os.mkdir(thumb_dir)
            except OSError:
                # Another thread may have created it in the meantime; any
                # real problem surfaces when the thumbnail is written.
                pass
        return os.sep.join(fields)

    def create_thumb(self, inp):
        """
        Creates thumbnail based on file path.
        Creates /thumbs/ sub dir & stores a JPEG thumbnail (max 200x200).
        '.mp4' files are handed to create_video_thumb.
        Returns thumbnail path on success, an empty string if the thumbnail
        file already exists, and 'rips/nothumb.png' if PIL is missing, the
        image is too large or thumbnail creation fails.
        """
        if inp.lower().endswith('.mp4'):
            return self.create_video_thumb(inp)
        if Image is None:
            sys.stderr.write('Python Image Library (PIL) not installed; unable to create thumbnail for %s\n' % inp)
            sys.stderr.write('Go to http://www.pythonware.com/products/pil/ to install PIL\n')
            sys.stderr.flush()
            return 'rips/nothumb.png'
        saveas = self._thumb_path(inp)
        if os.path.exists(saveas):
            return ''
        try:
            im = Image.open(inp)
            (width, height) = im.size
            if width > MAX_THUMB_DIM or height > MAX_THUMB_DIM:
                # Image too large to create thumbnail
                self.log('unable to create thumbnail, %dx%d > %d' % (width, height, MAX_THUMB_DIM))
                return 'rips/nothumb.png'
            if os.path.getsize(inp) > MAX_THUMB_SIZE:
                self.log('unable to create thumbnail, %db > %db' % (os.path.getsize(inp), MAX_THUMB_SIZE))
                return 'rips/nothumb.png'
            if im.mode != 'RGB':
                im = im.convert('RGB')
            # Newer Pillow releases dropped the ANTIALIAS name; fall back to
            # LANCZOS when it is missing.
            resample = getattr(Image, 'ANTIALIAS', None)
            if resample is None:
                resample = Image.LANCZOS
            im.thumbnail((200, 200), resample)
            im.save(saveas, 'JPEG')
            return saveas
        except Exception as e:
            self.log('failed to create thumb: %s' % str(e))
        return 'rips/nothumb.png'

    def create_video_thumb(self, inp):
        """
        Create thumbnail for video file, uses ffmpeg.
        The frame is scaled to 200x200 and 'play_overlay.png' is centered
        on top of it; the result is stored as PNG in the thumbs sub dir.
        Returns path to thumbnail or empty string on failure (ffmpeg not
        found, ffmpeg exits non-zero or no thumbnail file was produced).
        """
        import subprocess  # local import: only needed for video thumbnails

        saveas = self._thumb_path(inp)
        saveas = saveas[:saveas.rfind('.')] + '.png'
        overlay = 'play_overlay.png'
        ffmpeg = '/usr/bin/ffmpeg'
        if not os.path.exists(ffmpeg):
            ffmpeg = '/opt/local/bin/ffmpeg'
            if not os.path.exists(ffmpeg):
                return ''  # Can't get images if we can't find ffmpeg
        video_filter = 'movie=' + overlay + ' [watermark]; ' \
            '[in]scale=200:200 [scale]; ' \
            '[scale][watermark] overlay=(main_w-overlay_w)/2:(main_h-overlay_h)/2 [out]'
        # Argument list instead of a shell string: paths containing spaces
        # or shell metacharacters are passed through untouched.
        cmd = [ffmpeg, '-i', inp, '-vf', video_filter, saveas]
        try:
            with open(os.devnull, 'w') as devnull:
                status = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
        except OSError:
            return ''
        if status == 0 and os.path.exists(saveas):
            return saveas
        return ''

    def debug(self, text):
        """ Print text to stderr, only if debugging is enabled """
        if not self.debugging:
            return
        sys.stderr.write('%s\n' % text)

    @staticmethod
    def strip_url(url):
        """
        Remove excess / unnecessary characters from URL:
        everything from the first '?', '#' or '&' onwards is dropped.
        """
        # staticmethod: the function takes no 'self', so it is callable
        # both as basesite.strip_url(url) and self.strip_url(url)
        for c in ['?', '#', '&']:
            if c in url:
                url = url[:url.find(c)]
        return url

    def get_index_total(self, index, total):
        """
        Return current index / total (in parenthesis), formatted properly:
        '(index)' if total is '?', otherwise '(index/total)'.
        """
        if total == '?':
            return '(%s)' % str(index)
        return '(%s/%s)' % (str(index), str(total))

    def exception(self, e):
        """ (Correctly) waits for threads to finish before throwing exception """
        self.wait_for_threads()
        raise Exception(e)