# ShopeeSpider.py
import json
import os
import requests
import scrapy
from pathlib import Path
#####################################################
#################### USER INPUTS ####################
#####################################################
# Source: http://image-net.org/challenges/LSVRC/2014/browse-synsets
# search_terms maps each search term to the set of entities that make a listing illegal
search_terms = {
    'tiger': {'tiger cat', 'tiger'}
}
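# The mapping extends naturally to more terms. The entry below is a
# hypothetical example (both labels are ILSVRC synsets), not part of the
# original configuration:
# search_terms = {
#     'tiger': {'tiger cat', 'tiger'},
#     'leopard': {'leopard', 'snow leopard'},
# }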
#####################################################
################### DEFAULT INPUTS ##################
#####################################################
# Do not change unless you know what you are doing
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36'
#####################################################
############## GENERAL SPIDER HELPERS ###############
#####################################################
# TODO: Move this section to a helper file
def download_image(image_url, download_file_path):
    with open(download_file_path, 'wb') as f:
        f.write(requests.get(image_url).content)
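
# A slightly hardened variant (a sketch, not wired into the spider): requests
# supports a timeout and raise_for_status(), so a bad response fails loudly
# instead of silently writing an HTML error page to disk.
def download_image_checked(image_url, download_file_path, timeout_seconds=10):
    response = requests.get(image_url, timeout=timeout_seconds)
    response.raise_for_status()  # Raise on 4xx/5xx instead of saving the error body
    with open(download_file_path, 'wb') as f:
        f.write(response.content)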
def is_illegal(image_file_path, illegal_entities):
    stream = os.popen(f'. predict.sh {image_file_path}')
    predictions = [line.split(':')[-1].strip() for line in stream.readlines()]
    # TODO: Add confidence threshold here
    return predictions[0] in illegal_entities  # Top-1 prediction only
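
# One way the TODO above could be addressed (a sketch only: it assumes
# predict.sh prints lines like 'score: label', which is consistent with the
# split(':')[-1] parsing above but not confirmed by the actual script output).
def is_illegal_with_threshold(image_file_path, illegal_entities, min_confidence=0.5):
    stream = os.popen(f'. predict.sh {image_file_path}')
    first_line = stream.readline().strip()
    score_part, _, label = first_line.rpartition(':')
    try:
        confidence = float(score_part)
    except ValueError:
        return False  # predict.sh did not emit the assumed 'score: label' format
    return label.strip() in illegal_entities and confidence >= min_confidence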
#####################################################
############### SHOPEE SPIDER HELPERS ###############
#####################################################
def get_shopee_search_url(search_term, page_num=1):
    batch_size = 2  # Kept small for this POC; increase if needed
    search_by = 'relevancy'  # or 'ctime' for newest-first ordering
    # newest is the result offset, so page 1 starts at offset 0
    return f'https://shopee.sg/api/v2/search_items/?by={search_by}&keyword={search_term}&limit={batch_size}&newest={(page_num - 1) * batch_size}&order=desc&page_type=search&version=2'
def get_shopee_search_referer_url(search_term, page_num=0):
    page_query_param = f'&page={page_num}' if page_num else ''
    return f'https://shopee.sg/search?keyword={search_term}{page_query_param}'

def get_shopee_item_referer_url(search_term):
    return f'https://shopee.sg/search?keyword={search_term}'

def get_shopee_item_url(item_id, shop_id):
    return f'https://shopee.sg/api/v2/item/get?itemid={item_id}&shopid={shop_id}'

def get_shopee_image_url(long_image_id):
    return f'https://cf.shopee.sg/file/{long_image_id}'
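
# These builders only interpolate into fixed templates, e.g. (hypothetical IDs):
#   get_shopee_item_url(1234, 5678)
#   -> 'https://shopee.sg/api/v2/item/get?itemid=1234&shopid=5678'
# Note: search terms are interpolated verbatim, so multi-word terms would need
# URL encoding first, e.g. urllib.parse.quote_plus(search_term).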
#####################################################
################### SHOPEE SPIDER ###################
#####################################################
class ShopeeSpider(scrapy.Spider):
    name = 'ShopeeSpider'
    allowed_domains = ['shopee.sg']
    # Shopee blocks Scrapy's default user agent and checks for a valid Referer.
    # Crawl state (has_more, page_num, page_lim, illegal_items) is stashed
    # alongside real Scrapy settings and read/mutated directly via
    # ShopeeSpider.custom_settings; good enough for a POC.
    custom_settings = {
        'AUTOTHROTTLE_ENABLED': True,
        'USER_AGENT': user_agent,
        'has_more': True,
        'page_num': 1,
        'page_lim': 2,  # Kept low because this is just a POC
        'illegal_items': []
    }

    def start_requests(self):
        # A method cannot mix `return <value>` with `yield`, so build and return
        # the full list of requests here. See:
        # https://stackoverflow.com/questions/8779964/python-yield-and-return-statements-and-scrapy-yielding-the-requests
        return [
            scrapy.Request(
                url=get_shopee_search_url(search_term),
                headers={
                    'Accept': '*/*',
                    'Content-Type': 'application/json',
                    'Referer': get_shopee_search_referer_url(search_term)
                },
                callback=self.parse,
                meta={
                    'search_term': search_term,
                    'illegal_entities': search_terms[search_term]
                }
            ) for search_term in search_terms
        ]

    def parse(self, response):
        # TODO: Remove page_lim in the future
        meta = {
            'search_term': response.meta['search_term'],
            'illegal_entities': response.meta['illegal_entities']
        }
        if not ShopeeSpider.custom_settings['has_more'] or ShopeeSpider.custom_settings['page_num'] > ShopeeSpider.custom_settings['page_lim']:
            return
        try:
            data = json.loads(response.text)
            if 'items' in data:
                for item in data['items']:
                    try:
                        item_id, shop_id = item['itemid'], item['shopid']
                        yield scrapy.Request(
                            url=get_shopee_item_url(item_id, shop_id),
                            headers={
                                'Accept': '*/*',
                                'Content-Type': 'application/json',
                                'Referer': get_shopee_item_referer_url(meta['search_term'])
                            },
                            callback=self.parse,
                            meta=meta
                        )
                    except Exception as e:
                        print(e)
                        print('Item was skipped because of missing keys or a Scrapy request failure.')
                        continue
                # Advance the page counter before queueing the next search page;
                # queueing the current page_num again would produce an identical
                # URL, which Scrapy's duplicate filter drops, stalling the crawl
                # after the first page.
                ShopeeSpider.custom_settings['has_more'] = not data.get('nomore', False)
                ShopeeSpider.custom_settings['page_num'] += 1
                try:
                    yield scrapy.Request(
                        url=get_shopee_search_url(meta['search_term'], page_num=ShopeeSpider.custom_settings['page_num']),
                        headers={
                            'Accept': '*/*',
                            'Content-Type': 'application/json',
                            'Referer': get_shopee_search_referer_url(meta['search_term'], page_num=ShopeeSpider.custom_settings['page_num'] - 1)
                        },
                        callback=self.parse,
                        meta=meta
                    )
                except Exception as e:
                    print(e)
                    print('Could not build the next search request. Consider storing search_term as a class/instance variable for one fewer point of failure.')
            elif 'item' in data:
                try:
                    item = data['item']
                    item_id, shop_id, images = item['itemid'], item['shopid'], item['images']
                    if not images:
                        print('Either there are no images to process or the correct key to query has changed.')
                        return
                    image_id = images[0]
                    download_directory = f'{ShopeeSpider.name}/{shop_id}/{item_id}'
                    download_file_path = f'{download_directory}/{image_id}.jpg'
                    Path(download_directory).mkdir(parents=True, exist_ok=True)
                    image_url = get_shopee_image_url(image_id)
                    download_image(image_url, download_file_path)
                    if is_illegal(download_file_path, meta['illegal_entities']):
                        ShopeeSpider.custom_settings['illegal_items'].append({
                            'url': get_shopee_item_url(item_id, shop_id),
                            'image_url': image_url
                        })
                except Exception as e:
                    print(e)
                    print('Item was skipped because of missing keys or a Scrapy request failure.')
                    return
        except Exception as e:
            print(e)
            print('Unable to load response.text as JSON.')
            return

    def closed(self, reason):
        print(f'ShopeeSpider.custom_settings at close: {ShopeeSpider.custom_settings}')
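
# One way to run this spider standalone: a sketch using Scrapy's CrawlerProcess
# (the original may have been launched via `scrapy runspider` instead).
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(ShopeeSpider)
    process.start()  # Blocks until the crawl finishes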