scraper.py
import os
import urllib.request

import requests
from dotenv import load_dotenv
from pathvalidate import sanitize_filename
from serpapi import GoogleSearch

from util import (add_row_to_csv, get_image_format, get_image_link,
                  has_img_extension, image_extensions, is_already_saved,
                  last_row_from_csv, read_json, write_csv_header)

load_dotenv()
token = os.environ.get('SERP_API_KEY')

# Browser-like headers so image hosts don't reject the download requests.
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0",
           "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
           "Accept-Language": "en-US,en;q=0.5"}

# Names common enough that the query needs a disambiguating keyword.
ambiguous_names = ['Adam Grünewald', 'Anton Burger', 'Carl_Værnet',
                   'Karl Otto von Salisch', 'Yuri Frolov']
# Names distinctive enough to be searched as-is.
name_only = ['Adolf Ax', 'Adolf Maurer']
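
# The SerpApi key is read from the environment; a .env file next to this
# script would contain a line like (value hypothetical):
#
#   SERP_API_KEY=your_serpapi_key_here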


def scrape_images(name, number_of_images, start_number):
    name = str(name)
    if name in ambiguous_names:
        # clarify the search for names that are common
        query = name + ' nazi'
    elif name in name_only:
        query = name
    else:
        # fall back to the bare name so the query is never empty
        query = name
    params = {
        "api_key": token,
        "engine": "google",
        "ijn": "0",
        "start": start_number,
        "num": number_of_images,
        "q": query,
        "google_domain": "google.com",
        "tbm": "isch",  # Google Images search
        "tbs": "images"
    }
    search = GoogleSearch(params)
    results = search.get_dict()
    image_results = results.get('images_results', [])
    try:
        # save the raw JSON response in case a later step fails
        urllib.request.urlretrieve(
            results['search_metadata']['json_endpoint'], 'data.json')
        print(f'data.json for {query} saved!')
    except (KeyError, OSError) as e:
        # OSError covers both URL errors and file-system errors
        print(f'Could not save data.json: {e}')
    # keep only results that carry a link to the original image
    filtered_images = [
        image for image in image_results if image.get('original')]
    return filtered_images
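
# Each item returned by scrape_images is a SerpApi image-result dict; the
# code below relies on its 'original', 'title', and 'position' keys. A
# minimal sketch of the shape assumed here (real responses carry more
# fields):
#
#   {'position': 1,
#    'title': 'Example page title',
#    'original': 'https://example.com/photo.jpg'}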


def save_images(name, image_urls, last_image_index):
    for i, image in enumerate(image_urls):
        if i >= last_image_index:
            # get the image extension; skip formats we don't recognize
            img_ext = get_image_format(image['original'])
            if img_ext is None or img_ext not in image_extensions:
                continue
            title = image['title']
            img_position = image['position']  # position in the search results
            file_title = sanitize_filename(name).lower().replace(
                ' ', '').strip('_')[:6]  # trim the file title for consistency
            filename = f'{i+1}-{file_title}{img_ext}'
            data_row = [name, filename, title,
                        img_position, image['original']]
            try:
                current_path = os.getcwd()
                with open(os.path.join(current_path, filename), 'wb') as handler:
                    print(f'\nDownloading: {image["original"]}')
                    img_data = requests.get(get_image_link(
                        image['original']), headers=headers).content
                    handler.write(img_data)
                print(f'\nSuccessfully saved: {filename}')
            except KeyError as e:
                print(f'\nMapping for "original" key not found: {e}')
                continue
            except Exception as e:
                print(f'{e}')
                continue
            # log the saved image unless an identical row already exists
            try:
                if not is_already_saved(data_row, 'data-info.csv'):
                    add_row_to_csv(os.path.join(
                        current_path, 'data-info.csv'), data_row)
            except Exception as e:
                print(
                    f'\nFailed to save data-info.csv at the {i}th file, {e}')
                continue


def lookup_recent_row(csv_file):
    # find the search-result position of the most recently saved image
    most_recent_row = last_row_from_csv(csv_file)
    if most_recent_row == ['Name', 'Filename', 'Title', 'Position', 'Original URL']:
        # only the header exists, so nothing has been parsed yet;
        # start at the beginning
        last_image_index = 1
    else:
        # rows already exist, so resume from the last recorded position
        last_image_index = int(most_recent_row[3])
    return last_image_index
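
# Example of the rows this resume logic reads from data-info.csv
# (values are hypothetical):
#
#   Name,Filename,Title,Position,Original URL
#   Adolf Ax,1-adolfa.jpg,Example page title,1,https://example.com/photo.jpg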


def list_images_in_folder(number_of_images, folder_name):
    # note: this changes the working directory, so later downloads land here
    os.chdir(folder_name)
    files = os.listdir(os.getcwd())
    images_only = [file for file in files if has_img_extension(file)]
    number_of_files_in_folder = len(images_only)
    print(
        f'Status: {number_of_files_in_folder}/{number_of_images} files in {os.getcwd()}')
    if number_of_files_in_folder >= number_of_images:
        print(f'{number_of_files_in_folder} files in folder, only '
              f'{number_of_images} required.\nFolder contains enough images. '
              f'Skipping download.')
        return
    else:
        remainder = number_of_images - number_of_files_in_folder
        print(
            f'{number_of_files_in_folder} files in folder, need {remainder} more images')
        return number_of_files_in_folder


def download_images(name, number_of_images):
    if os.path.isfile('data.json') and os.path.isfile('data-info.csv'):
        # resume a previous run: reuse the cached response and skip
        # everything at or before the last recorded position
        last_image_index = lookup_recent_row('data-info.csv')
        image_results = read_json('data.json')
        image_urls = [
            image for image in image_results if image['position'] > last_image_index]
        save_images(name, image_urls, last_image_index)
    else:
        # fresh run: create the CSV log, then fetch and save new results
        write_csv_header(
            'data-info.csv', ["Name", "Filename", "Title", "Position", "Original URL"])
        image_urls = scrape_images(name, number_of_images, 1)
        save_images(name, image_urls, 0)
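

# A minimal usage sketch, not part of the original script: the names, the
# per-name image count, and the per-person folder layout are assumptions.
# It shows how the functions above compose: check what a folder already
# holds, then download the rest into it.
if __name__ == '__main__':
    names = ['Adolf Ax', 'Adolf Maurer']  # hypothetical input list
    wanted = 10  # hypothetical number of images per name
    root = os.getcwd()
    for person in names:
        folder = sanitize_filename(person)
        os.makedirs(folder, exist_ok=True)
        # list_images_in_folder chdirs into the folder, so save_images
        # writes the image files and data-info.csv there
        existing = list_images_in_folder(wanted, folder)
        if existing is not None:  # None means the folder already has enough
            download_images(person, wanted)
        os.chdir(root)  # return to the root before processing the next name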