-
Notifications
You must be signed in to change notification settings - Fork 0
/
pinterest.py
143 lines (95 loc) · 3.61 KB
/
pinterest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import configparser
import requests
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
class SiteScraper:
    """Scrape image source URLs and recommended search words from an
    image-search results page using a Selenium-driven Chrome browser."""

    def __init__(
            self,
            search_word="car"
    ):
        """Read Selenium settings from config.ini, start Chrome, and load
        the search-results page for *search_word*.

        Expected config keys (section [selenium]): url, args
        (comma-separated Chrome flags), path (chromedriver binary),
        wait (implicit wait, seconds).
        """
        conf = configparser.ConfigParser()
        conf.read("config.ini")
        self.url = conf["selenium"]["url"]
        self.sources = []      # collected <img> src URLs
        self.searchwords = []  # collected recommended search words
        args = conf["selenium"]["args"].split(",")
        self.path = conf["selenium"]["path"]
        self.wait = conf.getint("selenium", "wait")
        self.options = self.set_options(args)
        # NOTE(review): positional path + chrome_options= are deprecated in
        # Selenium 4; kept for compatibility with the version this targets.
        self.driver = webdriver.Chrome(self.path, chrome_options=self.options)
        self.driver.implicitly_wait(self.wait)
        self.driver.get(self.url + search_word)

    def start_driver(self):
        """(Re)initialize the Chrome driver with the stored path/options."""
        self.driver = webdriver.Chrome(self.path, chrome_options=self.options)
        self.driver.implicitly_wait(self.wait)

    def set_options(self,
                    args=("headless", "--ignore-certificate-errors", "--test-type")):
        """Build a ChromeOptions object from an iterable of flag strings.

        The default is a tuple (not a list) to avoid the shared
        mutable-default-argument pitfall.
        """
        # binary_location="/usr/bin/chromium"
        options = webdriver.ChromeOptions()
        # options.binary_location=binary_location
        for arg in args:
            options.add_argument(arg)
        return options

    def run_scrape(self):
        """Collect recommended search words and image sources from the
        currently loaded page."""
        self.get_search_words()
        self.get_sources()

    def get_images(self):
        """Return the raw <img> WebElements on the current page."""
        return self.driver.find_elements_by_tag_name('img')

    def load_search(self, search_word):
        """Navigate the driver to the results page for *search_word*.

        BUG FIX: the original closed the driver's window first and then
        called get() on the window-less driver, so navigation always
        failed.  get() already replaces the current page, so navigate
        directly; if the driver is gone, restart it and retry once.
        """
        try:
            self.driver.get(self.url + search_word)
        except Exception as e:
            print(e)
            # Driver may have been closed or crashed: restart and retry.
            try:
                self.start_driver()
                self.driver.get(self.url + search_word)
            except Exception as e:
                print(e)

    def get_search_words(self):
        """Collect the recommended search words (shown at the top of some
        result pages) into self.searchwords, skipping duplicates."""
        try:
            elems = self.driver.find_elements_by_css_selector('a[title^="Search for"]')
        except Exception:
            elems = []
        for elem in elems:
            # Element title looks like: Search for "car tuning" — grab the
            # quoted phrase, then keep its second token.
            word = re.findall(r'\"(.+?)\"', elem.get_attribute("title"))[0]
            word = word.split(' ')[1]
            if word not in self.searchwords:
                self.searchwords.append(word)

    def get_sources(self):
        """Append the src URL of every image on the page to self.sources."""
        images = self.get_images()
        for img in images:
            src = img.get_attribute('src')
            self.sources.append(src)

    def close(self):
        """Shut down the browser and end the WebDriver session."""
        self.driver.quit()

    def return_search_words(self):
        """Return the list of collected recommended search words."""
        return self.searchwords

    def return_sources(self):
        """Return the list of collected image source URLs."""
        return self.sources

    def return_img_elements(self):
        """Return the current page's <img> WebElements.

        BUG FIX: the original returned self.images, an attribute never
        assigned anywhere, so this always raised AttributeError.
        """
        return self.get_images()
if __name__ == "__main__":
    # BUG FIX: the original printed the sources/search-words without ever
    # calling run_scrape(), so both lists were always empty; it also never
    # shut the browser down.
    scraper = SiteScraper()
    try:
        scraper.run_scrape()
        print(scraper.return_sources())
        print(scraper.return_search_words())
    finally:
        scraper.close()
# SELENIUM
# MANY-TO-MANY
# https://www.pinterest.co.uk/search/pins/?q=car
# THE PLAN:
# 1 . BACKGROUND PROCESSES:
# (1.1) Process for getting images and writing them into db with color vals
# (1.1.1) Use SELENIUM to simulate Chrome
# (1.1.2) Use https://www.pinterest.co.uk/search/pins/?q=car
# (1.2) Process for getting entries from there and entering them into db with
# Many-to-Many relationship
# 2 . ACTUAL API:
# just make queries to the database