-
Notifications
You must be signed in to change notification settings - Fork 0
/
functions.py
83 lines (73 loc) · 3 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#%%
from selectorlib import Extractor
import requests
import json
from time import sleep
import random
import json
import re
user_agent_list = [
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
]
#%%
# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('search_results.yml')
def scrape(url):
headers = {
'dnt': '1',
'upgrade-insecure-requests': '1',
'user-agent': random.choice(user_agent_list),
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'sec-fetch-site': 'same-origin',
'sec-fetch-mode': 'navigate',
'sec-fetch-user': '?1',
'sec-fetch-dest': 'document',
'referer': 'https://www.amazon.com/',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
}
# Download the page using requests
print("Downloading %s"%url)
r = requests.get(url, headers=headers)
# Simple check to check if page was blocked (Usually 503)
if r.status_code > 500:
if "To discuss automated access to Amazon data please contact" in r.text:
print("Page %s was blocked by Amazon. Please try using better proxies\n"%url)
else:
print("Page %s must have been blocked by Amazon as the status code was %d"%(url,r.status_code))
return None
# Pass the HTML of the page and create
return e.extract(r.text)
#%%
# product_data = []
#%%
def load_jsonl(input_path) -> list:
with open("search_results_urls.txt",'r') as urllist, open('search_results_output.jsonl','w') as outfile:
for url in urllist.read().splitlines():
data = scrape(url)
if data:
for product in data['products']:
product['search_url'] = url
print("Saving Product: %s"%product['title'])
json.dump(product,outfile)
outfile.write("\n")
#sleep(1)
"""
Read list of objects from a JSON lines file.
"""
data = []
with open(input_path, 'r', encoding='utf-8') as f:
for line in f:
data.append(json.loads(line.rstrip('\n|\r')))
print('Loaded {} records from {}'.format(len(data), input_path))
return data
def pattern_searcher(search_str:str, search_list:str):
search_obj = re.search(search_list, search_str)
if search_obj :
return_str = search_str[search_obj.start(): search_obj.end()]
else:
return_str = 'NA'
return return_str