-
Notifications
You must be signed in to change notification settings - Fork 0
/
search_image.py
129 lines (102 loc) · 3.97 KB
/
search_image.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import requests
from bs4 import BeautifulSoup
import json
import re
from urllib.parse import urlparse, quote, quote_plus
import io
import time
import os
import threading
import argparse
# Browser-impersonation headers: Google's image-search endpoint only serves
# the JSON "ichunklite" payload to requests that look like a real browser.
headers = {
    "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "Windows",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Linux; Android 12) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Mobile Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-User": "?1",
    "Sec-Fetch-Dest": "document",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-GB,en;q=0.9",
}
# Maps original-image URL -> Google image_docid. Filled by
# collect_images_from_google(), consumed by download_images().
all_images = {}
# Count of in-flight downloader threads; incremented by download_images()
# and decremented by threaded_download() when a worker finishes.
threads_running = 0  # don't change this
def get_extension(url):
    """Return the file extension (dot included) of *url*'s path component.

    Query strings and fragments are ignored via urlparse; a path with no
    extension falls back to '.jpg'.
    """
    basename = os.path.basename(urlparse(url).path.strip('/'))
    ext = os.path.splitext(basename)[1]
    return ext if ext else '.jpg'
def threaded_download(savepath, url):
    """Fetch *url* and write it to *savepath* if the response is an image.

    Runs in a worker thread started by download_images(). The module-level
    ``threads_running`` counter is ALWAYS decremented exactly once on exit
    (via ``finally``) — the original decremented in two separate places and
    skipped the decrement entirely if the file open/write raised, which
    left download_images() busy-waiting forever.
    """
    global threads_running
    try:
        try:
            # timeout guards against a stuck connection pinning this
            # thread (and its concurrency slot) indefinitely.
            r = requests.get(url, headers=headers, timeout=30)
        except Exception:
            print("Request Error : " + url)
            return
        if r.status_code == 200:
            # Only persist responses the server actually labels as images.
            if 'image' in r.headers.get('content-type', '').lower():
                with open(savepath, 'wb') as f:
                    f.write(r.content)
                print('Downloaded ' + os.path.basename(savepath))
            else:
                print("Not an Image : " + url)
        else:
            print(f"{r.status_code} Error : " + url + " ")
    finally:
        threads_running -= 1
def collect_images_from_google(query):
    """Page through Google Images' async JSON endpoint for *query*.

    Records each result's original-image URL -> image_docid in the
    module-level ``all_images`` dict. Stops once several consecutive pages
    add no new images, or when Google stops returning HTTP 200.
    """
    chunk = 0      # result offset passed as start=
    step = 0       # page index passed as ijn=
    prev_len = 0   # size of all_images after the previous page
    no_more = 0    # consecutive pages that added nothing new
    # The original interpolated the raw keyword into the URL; spaces and
    # special characters broke the request. quote_plus was imported for
    # exactly this purpose but never used.
    encoded_query = quote_plus(query)
    while True:
        r = requests.get(
            'https://www.google.com/search'
            f'?q={encoded_query}&tbm=isch&biw=1271&bih=697'
            '&async=_id:islrg_c,_fmt:json&asearch=ichunklite'
            '&ved=0ahUKEwjw_KTXkM_3AhVJxDgGHR-pDeoQtDIIPSgA'
            f'&start={chunk}&ijn={step}',
            headers=headers,
            timeout=30,
        )
        if r.status_code != 200:
            # Rate limit / captcha page — retrying the same request forever
            # would loop infinitely, so give up with what we have.
            print(f"{r.status_code} Error while collecting results")
            break
        # The payload is JSON behind an anti-XSSI prefix.
        json_data = json.loads(r.content.decode('utf8').removeprefix(")]}'"))
        try:
            results = json_data['ichunklite']['results']
        except KeyError:
            results = []
        for result in results:
            try:
                original_image = result['viewer_metadata']['original_image']['url']
            except KeyError:
                continue  # some results carry no original_image entry
            all_images[original_image] = result['image_docid']
        chunk += 100
        step += 1
        if no_more > 5:
            break
        if prev_len == len(all_images):
            no_more += 1
        else:
            print(len(all_images))
            prev_len = len(all_images)
            no_more = 0
def download_images(query):
    """Download every collected image into Images/<query>/ with worker threads.

    At most ~10 downloads run concurrently (throttled via the module-level
    ``threads_running`` counter). Files that already exist with more than
    one byte are skipped so an interrupted run can resume.
    """
    global threads_running
    print("Downloading " + str(len(all_images)) + " images")
    target_dir = os.path.join("Images", query)
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists()/os.makedirs() pair.
    os.makedirs(target_dir, exist_ok=True)
    for image_link, doc_id in all_images.items():
        image_save_path = os.path.join(target_dir, doc_id + get_extension(image_link))
        # Skip anything a previous run already downloaded successfully.
        if os.path.exists(image_save_path) and os.path.getsize(image_save_path) > 1:
            continue
        # Count the worker BEFORE starting it: the original incremented
        # after .start(), so a fast worker could decrement first and the
        # throttle would briefly see too low a count.
        threads_running += 1
        threading.Thread(
            target=threaded_download,
            kwargs={'url': image_link, 'savepath': image_save_path},
        ).start()
        while threads_running > 9:  # throttle to ~10 concurrent downloads
            time.sleep(1)
    while threads_running > 0:      # wait for the remaining workers
        time.sleep(1)
if __name__ == "__main__":
    # CLI entry point: scrape result metadata first, then fetch the files.
    cli = argparse.ArgumentParser(description="Download images from google")
    cli.add_argument(
        "-k", "--keyword",
        required=True,
        default="",
        help="Example: Help argument",
    )
    options = cli.parse_args()
    collect_images_from_google(options.keyword)
    download_images(options.keyword)