#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 29 12:14:51 2018
You must install the chrome driver for this script. You can find it at:
https://sites.google.com/a/chromium.org/chromedriver/home
Move the file into C:\Windows\System32 or /usr/local/bin to "install" it
"""
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from sqlalchemy import create_engine, text
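
# The parser below expects R-style assignments in Credentials.R, e.g. (these
# values are illustrative placeholders, not the project's real credentials):
#   mysql_host <- 'localhost'
#   mysql_user <- 'dice_user'
#   mysql_password <- 'secret'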
# Read in Credentials line by line and set the equivalent variables
with open('Credentials.R') as f:
    for line in f:
        if 'mysql' in line:
            line = line.replace("'", '')
            var_name, value = line.split(' <- ')
            if var_name == 'mysql_host':
                host = value.strip()
            elif var_name == 'mysql_user':
                user = value.strip()
            elif var_name == 'mysql_password':
                password = value.strip()
links_to_scrape = []


def get_links(browser):
    """Collect the job-detail URLs from the current results page."""
    links_to_return = []
    soup = BeautifulSoup(browser.page_source, 'html5lib')
    # Grab all the job links so they can later be inserted into our table of Dice URLs
    links = soup.find_all('a', {'class': 'dice-btn-link'})
    for link in links:
        if 'jobs/detail' in link['href']:
            page_url = 'https://www.dice.com' + link['href']
            links_to_return.append(page_url)
    return links_to_return

# Start up the Selenium instance for scraping
browser = webdriver.Chrome()
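# Optional sketch: to scrape without opening a window, Chrome can be started
# headless via the standard options API (not used in the original run):
#   options = webdriver.ChromeOptions()
#   options.add_argument('--headless=new')
#   browser = webdriver.Chrome(options=options)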
browser.get('https://www.dice.com/')
# Create an engine for the MySQL database; a connection is opened at insert time below
engine = create_engine('mysql+pymysql://' + user + ':' + password + '@' + host + '/DATA607')
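# Assumes a DICE_RAW_HTML table already exists in the DATA607 schema; a
# plausible definition, with column types guessed from the INSERT at the end:
#   CREATE TABLE DICE_RAW_HTML (url VARCHAR(500) NOT NULL, scraped TINYINT DEFAULT 0);
# A UNIQUE index on url would also guard against duplicates across runs.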
# Scrape the Dice search results page by page
start_url = 'https://www.dice.com/jobs?q=%22Data+Scientist%22&l='
browser.get(start_url)
links_to_scrape = links_to_scrape + get_links(browser)
more_to_scrape = True
while more_to_scrape:
    try:
        # Wait for the results page to finish loading before looking for links
        WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.ID, 'predictsal')))
        try:
            browser.find_element(By.XPATH, '//*[@title="Go to next page"]').click()
            links_to_scrape = links_to_scrape + get_links(browser)
        except NoSuchElementException:
            # Thrown when the browser can't find the "next page" element,
            # meaning we have reached the end of the search results
            more_to_scrape = False
    except TimeoutException:
        print('Loading took too much time!')
        continue
    except Exception:
        # Second chance: refresh the browser and try again
        print('Something bad happened. Trying one more time...')
        browser.refresh()
        WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.ID, 'predictsal')))
        try:
            browser.find_element(By.XPATH, '//*[@title="Go to next page"]').click()
            links_to_scrape = links_to_scrape + get_links(browser)
            print('Success!')
        except Exception:
            print('Failed again!')
            more_to_scrape = False
# Time to close up the show; quit() ends the whole browser session
browser.quit()
# Insert the unique set of URLs, each flagged as not yet scraped
with engine.begin() as conn:
    for page_url in set(links_to_scrape):
        conn.execute(
            text("INSERT INTO DICE_RAW_HTML (url, scraped) VALUES (:url, :scraped)"),
            {'url': page_url, 'scraped': 0},
        )
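
# A downstream scraper could then claim the unprocessed rows with a query like
# (hypothetical follow-up, not part of this script):
#   SELECT url FROM DICE_RAW_HTML WHERE scraped = 0;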