#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This script is released under a Creative Commons Attribution-NonCommercial 4.0 International Public License.
To view a copy of this license, visit <http://creativecommons.org/licenses/by-nc/4.0/legalcode>
File: python2_download_facescrub.py
Author: Hong-Wei Ng
Email: [email protected]
Github: https://github.com/lightalchemist
Description: Script to download FaceScrub dataset
Tested on Ubuntu 14.04, Python 2.7.
# Requirements:
pip install requests
# Interchangeable with PIL. Can be ignored if you already have PIL installed
pip install Pillow
# Optional, but good to have, for detecting file type. May not work on Windows
pip install python-magic
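(python-magic is a thin wrapper around the system libmagic library, which is why it may be unavailable on Windows)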
# Steps to download FaceScrub dataset
1. First, obtain the FaceScrub files containing links to the images from http://vintage.winklerbros.net/facescrub.html
2. Next, set MY_USER_AGENT_STRING below. You can obtain it by visiting a site such as https://www.whatismybrowser.com/detect/what-is-my-user-agent
3. Finally, run python2_download_facescrub.py to download the dataset.
# Example: downloading the actors images.
Note: actors_users_normal_bbox.txt is obtained from the above link.
>>> # To download and save full size images only
>>> python python2_download_facescrub.py actors_users_normal_bbox.txt actors/
>>> # To download and save full size images along with cropped faces
>>> python python2_download_facescrub.py actors_users_normal_bbox.txt actors/ --crop_face
>>> # Additional (optional) arguments to set the log file name, timeout (10 seconds),
>>> # max retries (3), start download at line 10 (note: line 1 is header) and
>>> # end at line 20.
>>> python python2_download_facescrub.py actors_users_normal_bbox.txt actors/ \
--crop_face --logfile=download.log --timeout=10 --max_retries=3 --start_at_line=10 --end_at_line=20
The above commands save full-size images to the directory actors/images and cropped faces (if requested) to actors/faces
"""
import os
import shutil
import mimetypes
import logging
import urlparse
import hashlib
import argparse
from itertools import islice
import imghdr
try:
    import magic
    has_magic_lib = True
except ImportError:
    has_magic_lib = False
from PIL import Image
import requests
from requests import RequestException
# Paste your own user agent string below as a single line. You can find it by visiting https://www.whatismybrowser.com/detect/what-is-my-user-agent
MY_USER_AGENT_STRING="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/47.0.2526.106 Chrome/47.0.2526.106 Safari/537.36"
session = None
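
# Note: `max_retries` on `HTTPAdapter` only retries failed DNS lookups, socket
# connections and connection timeouts; it does not retry requests whose data
# already reached the server (e.g. HTTP 5xx responses).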
def setup_session(max_retries=1):
    # Use a `Session` instance to customize how `requests` handles making HTTP requests.
    global session
    session = requests.Session()
    # `mount` a custom adapter that retries failed connections for HTTP and HTTPS requests.
    session.mount("http://", requests.adapters.HTTPAdapter(max_retries=max_retries))
    session.mount("https://", requests.adapters.HTTPAdapter(max_retries=max_retries))

def create_logger(logfilename):
    """Create logger for logging to screen and file."""
    logger = logging.getLogger("logger")
    logger.setLevel(logging.DEBUG)

    fh = logging.FileHandler(logfilename)
    fh.setLevel(logging.DEBUG)
    formatter = logging.Formatter("%(asctime)s\n%(message)s", "%Y-%m-%d %H:%M:%S")
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # Also print log messages to console
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    console.setFormatter(formatter)
    logger.addHandler(console)

def hashfile(afile, hasher=None, blocksize=65536):
    """Returns sha256 hash of file"""
    if not hasher:
        hasher = hashlib.sha256()

    buf = afile.read(blocksize)
    while len(buf) > 0:
        hasher.update(buf)
        buf = afile.read(blocksize)

    return hasher.hexdigest()

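# Note: the download path below hashes the in-memory response via `hashbinary`;
# `hashfile` is a convenience for verifying images already saved to disk.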
def hashbinary(raw_bytes, hasher=None):
    """Returns sha256 hash of raw bytes"""
    if not hasher:
        hasher = hashlib.sha256()

    hasher.update(raw_bytes)
    return hasher.hexdigest()

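# Some image hosts reject requests that arrive without a Referer header, so we
# fabricate one from the image URL's own scheme and domain.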
def get_referer(url):
    """Returns a made-up referer derived from the given url"""
    parsed_uri = urlparse.urlparse(url)
    netloc = parsed_uri.netloc
    scheme = parsed_uri.scheme
    if netloc.startswith("fansshare"):  # Hack for fansshare.
        netloc = "www." + netloc

    domain = '{}://{}'.format(scheme, netloc)
    return domain

def generate_headers(url):
    """Returns dict for header of requests"""
    user_agent = MY_USER_AGENT_STRING
    referer = get_referer(url)
    headers = {"Referer": referer, "User-agent": user_agent}
    return headers

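# Each entry in the FaceScrub file carries the SHA-256 of the original image
# bytes, so a download is kept only if it still hashes to the published value;
# moved, replaced or deleted images are logged and skipped.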
def download_image(counter, url, sha256, timeout=60):
    """Download image from url.
    Returns response object if successful else return None
    """
    logger = logging.getLogger("logger")
    try:
        headers = generate_headers(url)
        response = session.get(url, headers=headers, timeout=timeout)
        if response.status_code != requests.codes.OK:  # Status 200
            response.raise_for_status()

        # Check if returned image
        if has_magic_lib:
            content_type = magic.from_buffer(response.content, mime=True)
        else:
            content_type = response.headers["content-type"]  # Sometimes this is missing, raising KeyError

        if (content_type is None) or not content_type.startswith("image"):
            logger.error("Line {number}: Invalid content-type {content_type}: {url}".format(number=counter,
                                                                                            content_type=content_type,
                                                                                            url=url.encode("utf-8", "ignore")))
            return None

        if hashbinary(response.content) != sha256:
            logger.error("Line {number}: SHA 256 hash different: {url}".format(number=counter, url=url.encode("utf-8", "ignore")))
            return None

        return response

    # `ConnectionError`, `HTTPError`, `Timeout` and `TooManyRedirects` are all
    # subclasses of `RequestException`, so one handler covers them; `KeyError`
    # covers a missing content-type header.
    except (KeyError, RequestException) as e:
        logger.error("Line {number}: {error}: {url}".format(number=counter, error=e, url=url.encode("utf-8", "ignore")))
        return None
    except Exception as e:
        logger.error("Line {number}: {error}: {url}".format(number=counter, error=e, url=url.encode("utf-8", "ignore")))
        return None

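# Each non-header line of the FaceScrub data file is tab-separated:
#     name <TAB> image_id <TAB> face_id <TAB> url <TAB> bbox <TAB> sha256
# where bbox is "left,upper,right,lower" in pixels (the box format PIL's
# crop() expects) and sha256 is the hash of the image bytes.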
def parse_line(line):
    """Parse a line in FaceScrub data file"""
    parts = line.rstrip().split('\t')  # Split on tabs
    name = parts[0]
    image_id = int(parts[1])
    face_id = int(parts[2])
    url = parts[3]
    bbox = map(int, parts[4].split(','))  # This is a list of int
    sha256 = parts[5]
    return name, image_id, face_id, url, bbox, sha256

def ensure_dir_exists(dirpath):
    """Create directory specified by dirpath if it does not exist"""
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

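# Servers frequently mislabel or omit the image type, so the image is first
# written without an extension, sniffed with imghdr (falling back to
# python-magic if available), and then renamed with the detected extension.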
def save_image(counter, url, response, datasetpath, name, image_id, face_id, bbox, save_face=False):
    """Save image
    Full images saved to datasetpath/images/name_image_id.ext
    Face images saved to datasetpath/faces/name_image_id_face_id.ext
    Returns True if successful else False
    """
    logger = logging.getLogger("logger")
    # Output dir for images is datasetpath/images/name
    output_dir = os.path.join(datasetpath, "images", name)
    ensure_dir_exists(output_dir)

    # Filename without extension
    filename = "{name}_{image_id}".format(name=name, image_id=image_id)
    outpath = os.path.join(output_dir, filename)

    # Save file without file extension
    with open(outpath, 'wb') as outfile:
        outfile.write(response.content)

    filetype = imghdr.what(outpath)
    # Cannot determine filetype.
    if filetype is None and not has_magic_lib:
        os.remove(outpath)
        logger.error("Line {number}: Cannot determine file type: {url}".format(number=counter, url=url.encode("utf-8", "ignore")))
        return False
    # Get filetype using lib magic
    elif filetype is None and has_magic_lib:
        mimetype = magic.from_buffer(response.content, mime=True)
        if mimetype is None:
            os.remove(outpath)
            logger.error("Line {number}: Cannot determine file type: {url}".format(number=counter, url=url.encode("utf-8", "ignore")))
            return False

        ext = mimetypes.guess_extension(mimetype)
        if ext is None:
            os.remove(outpath)
            logger.error("Line {number}: Cannot determine file type: {url}".format(number=counter, url=url.encode("utf-8", "ignore")))
            return False

        # `guess_extension` returns values such as ".jpe"; normalize to "jpeg".
        filetype = ext.lstrip('.')
        if filetype == "jpe":
            filetype = "jpeg"

    # Rename file to have extension
    newpath = "{}.{}".format(outpath, filetype)
    shutil.move(outpath, newpath)

    # If user wants face images
    if save_face:
        try:
            I = Image.open(newpath)
            output_dir = os.path.join(datasetpath, "faces", name)
            ensure_dir_exists(output_dir)
            filename = "{name}_{image_id}_{face_id}.{ext}".format(name=name,
                                                                  image_id=image_id,
                                                                  face_id=face_id,
                                                                  ext=filetype)
            I.crop(bbox).save(os.path.join(output_dir, filename))
        except IOError as e:
            logger.error("Line {number}: {error}: {url}".format(number=counter, error=e, url=url.encode("utf-8", "ignore")))
            return False

    return True

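# Every log message includes the 1-based line number of the entry being
# processed, so an interrupted run can be resumed with --start_at_line.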
def main():
    parser = argparse.ArgumentParser(description="Script to download FaceScrub dataset")
    parser.add_argument("inputfile", help="FaceScrub data file. E.g., actors_users_normal_bbox.txt", type=str)
    parser.add_argument("datasetpath", help="Directory to save images", type=str)
    parser.add_argument("--crop_face", help="Whether to crop and save face images", dest="crop_face", action="store_true", default=False)
    parser.add_argument('-t', '--timeout', type=float, help="Number of seconds (float) to wait before requests timeout", action="store", required=False, dest="timeout", default=60)
    parser.add_argument('-r', '--max_retries', type=int, help="Maximum number of retries before giving up", action="store", required=False, dest="max_retries", default=1)
    parser.add_argument('-l', '--logfile', type=str, help="File to log operations", action="store", required=False, dest="logfile", default="download.log")
    parser.add_argument('-s', '--start_at_line', type=int, help="Line number in FaceScrub data file to start download. Note: Header counts as 1 line",
                        action="store", required=False, dest="start_at_line", default=2)
    parser.add_argument('-e', '--end_at_line', type=int, help="Last line number in FaceScrub data file to download. Note: Header counts as 1 line",
                        action="store", required=False, dest="end_at_line", default=0)

    args = parser.parse_args()
    assert args.timeout > 0, "timeout must be > 0"
    assert args.max_retries >= 1, "max_retries must be >= 1"
    assert args.start_at_line >= 1, "start_at_line must be >= 1"
    assert args.end_at_line >= 0, "end_at_line must be >= 0"

    # Convert the 1-based line numbers to the half-open, 0-based range used by
    # islice: line N has index N - 1, and because islice's stop is exclusive,
    # the 1-based number of the last line to download is itself the stop index.
    end_at_line = None  # Process until end of file
    if args.end_at_line > 0:
        end_at_line = args.end_at_line
    start_at_line = args.start_at_line - 1  # Index starts at 0

    create_logger(args.logfile)
    logger = logging.getLogger("logger")
    setup_session(args.max_retries)

    print("")
    print('=' * 30)
    print("Start processing from line: {}".format(args.start_at_line))
    if end_at_line is None:
        print("Processing till end of file")
    else:
        print("End processing at line: {}".format(args.end_at_line))
    print('=' * 30)
    print("")

    try:
        with open(args.inputfile) as infile:
            for counter, line in enumerate(islice(infile, start_at_line, end_at_line),
                                           start_at_line + 1):
                name, image_id, face_id, url, bbox, sha256 = parse_line(line)
                logger.info("Processing line {}: {}".format(counter, url))
                response = download_image(counter, url, sha256, args.timeout)
                if response is None:
                    continue

                save_image(counter, url, response, args.datasetpath, name.replace(' ', '_'), image_id, face_id, bbox, save_face=args.crop_face)
    except EnvironmentError as e:
        logger.error("{}".format(e))


if __name__ == "__main__":
    main()