-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathimage_handlers.py
156 lines (133 loc) · 5.62 KB
/
image_handlers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#-------------------------------------------------------------------------------
# Name: image_handlers
# Purpose: Code for saving image URL, should be called elsewhere
#
# Author: User
#
# Created: 30/03/2015
# Copyright: (c) User 2015
# Licence: <your licence>
#-------------------------------------------------------------------------------
# Libraries
import sqlalchemy
import re
import logging
# This project
from utils import *
from sql_functions import Media
import sql_functions
import config # User settings
# Media handler modules
def download_image_link(session,media_url):
    """Download one image URL and record it in the media table.

    Steps, in order:
      1. If the URL already has a row in the DB, return its media_id
         without touching the network.
      2. Fetch the URL; on failure, append the URL to
         debug/image_get_failed.txt and give up.
      3. Reject responses whose main content type is "text".
      4. Hash the data; if the hash is not already in the DB, save the
         file to disk.
      5. Add a new Media row for this URL, commit, and read back its id.

    Args:
        session: DB session, passed to the sql_functions helpers and used
            for add()/commit().
        media_url: URL of the image to fetch.

    Returns:
        A one-element list holding the media_id for this URL, or an empty
        list if the URL could not be loaded or was not an image.

    Raises:
        AssertionError: if no file extention could be determined for
            the URL.
    """
    logging.debug("download_image_link() Processing image: "+repr(media_url))

    # Check if URL is in the DB already, if so return its media_id.
    url_check_row_dict = sql_functions.check_if_media_url_in_DB(session,media_url)
    if url_check_row_dict:
        return [url_check_row_dict["media_id"]]

    # Load URL
    request_tuple = getwithinfo(media_url)
    if request_tuple is None:
        logging.error("Could not load image URL!")
        # NOTE(review): the header text says "completed items" although this
        # file collects failures - looks copy-pasted; confirm before changing.
        appendlist(
            media_url,
            list_file_path=os.path.join("debug","image_get_failed.txt"),
            initial_text="# List of completed items.\n"
        )
        return []
    file_data, info, r = request_tuple

    # Reject HTML responses
    if r.headers.getmaintype() == "text":
        logging.error("download_image_link() Link was not an image: "+repr(media_url))
        logging.debug("download_image_link() r.headers.dict: "+repr(r.headers.dict))
        return []
    time_of_retreival = get_current_unix_time()

    # Generate hashes
    sha512base16_hash = hash_file_data(file_data)# Used for filenames and dedupe
    md5base64_hash = generate_md5b64_for_memory(file_data)# For comparison only

    # Generate filename for output file (With extention)
    cropped_full_image_url = media_url.split("?")[0]# Remove after ?
    remote_filename = os.path.basename(cropped_full_image_url)
    file_extention = get_file_extention(media_url)
    if not file_extention:
        logging.error("download_image_link() No file extention!")
        logging.error(repr(locals()))
        # Raise explicitly instead of assert(False): a bare assert is removed
        # under "python -O", which would let a row with no extention through.
        raise AssertionError("download_image_link() No file extention!")
    local_filename = generate_filename(ext=file_extention,sha512base16_hash=sha512base16_hash)
    logging.debug("download_image_link() ""local_filename: "+repr(local_filename))
    file_path = generate_path(root_path=config.root_path,filename=local_filename)
    logging.debug("download_image_link() ""file_path: "+repr(file_path))

    # Compare hash with database; only write to disk if the hash is new
    hash_check_row_dict = sql_functions.check_if_hash_in_db(session,sha512base16_hash)
    if hash_check_row_dict:
        # Same data is already stored, so point this URL at the existing file
        local_filename = hash_check_row_dict["local_filename"]
        logging.debug("Hash already in DB, no need to save file to disk")
    else:
        # Save file to disk, using the hash as a filename
        logging.debug("Hash was not in DB, saving file: "+repr(file_path))
        save_file(
            file_path=file_path,
            data=file_data,
            force_save=False,
            allow_fail=False
        )

    # Get size of file
    file_size_in_bytes = len(file_data)# Using len() because actually checking the size on disk kept crashing

    # Add new row for this URL (added even when the file itself was a duplicate)
    new_media_row = Media(
        media_url=media_url,
        sha512base16_hash=sha512base16_hash,
        local_filename=local_filename,
        remote_filename = remote_filename,
        file_extention=file_extention,
        date_added=time_of_retreival,
        extractor_used="image_handlers.download_image_link()",
        md5base64_hash=md5base64_hash,
        file_size_in_bytes=file_size_in_bytes,
    )
    session.add(new_media_row)
    session.commit()

    # Get the id back by looking the URL up again now that it is committed
    get_id_row = sql_functions.check_if_media_url_in_DB(session,media_url)
    media_id = get_id_row["media_id"]
    return [media_id]
def download_image_links(session,media_urls):
    """Save every image link in media_urls and collect the resulting ids.

    The URLs are passed through uniquify() first, then each one is handed
    to download_image_link(). URLs that yield no ids are skipped.

    Returns:
        A flat list of the media ids returned for the processed URLs.
    """
    collected_ids = []
    for single_url in uniquify(media_urls):
        ids_for_url = download_image_link(session, single_url)
        if not ids_for_url:
            # Nothing was saved/found for this URL, move on to the next
            continue
        collected_ids.extend(ids_for_url)
    logging.debug("download_image_links() media_id_list:"+repr(collected_ids))
    return collected_ids
def debug():
    """Scratch entry point: run a few hard-coded URLs through the downloader.

    Connects to the DB, feeds the sample links to download_image_links()
    and logs whatever it returns.
    """
    # Sample links used for manual testing
    sample_urls = [
        "http://blog.crooz.jp/svc/userFrontArticle/ShowFiles/?no=1538&blog_id=53800&file_str=5380015381862e669e0073d13d4175ecae9d5a34b8ff05fe3.jpg&guid=on&vga_flg=0&y=2014&m=02&d=19&wid=480&hei=640",
        "http://static.tumblr.com/f6539f27dff5045834f7722e61c02e21/w5cnjnh/fVJnpuk9d/tumblr_static_digrsyxj0eg400so4g4kgsc4k.jpg",
        "https://dl.dropboxusercontent.com/u/27379736/NSFWSFM/SWF/TwilightSoloLightParticles.swf",
    ]
    db_session = sql_functions.connect_to_db()
    result = download_image_links(db_session, media_urls=sample_urls)
    logging.debug(result)
def main():
    """Script entry point: set up logging, run debug(), log any failure.

    Any Exception raised while setting up logging or running debug() is
    logged (critical message plus traceback) and swallowed, so the
    function always finishes by logging "Finished, exiting." and
    returning None.
    """
    try:
        setup_logging(log_file_path=os.path.join("debug","image_handlers_log.txt"))
        debug()
    # "as" form is valid on Python 2.6+ and 3; the old "Exception, e" form
    # is a SyntaxError on Python 3.
    except Exception as e:# Log fatal exceptions
        logging.critical("Unhandled exception!")
        logging.exception(e)
    logging.info("Finished, exiting.")
    return
# Only run main() when executed as a script, not when imported
if __name__ == "__main__":
    main()