# scrape_users.py
import argparse
import json
import sys
import uuid
from time import sleep

from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementNotInteractableException

from classes.UserScraper import UserScraper
from utils import init_driver, get_profile_urls, login, \
    load_config, load_queries
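
# Command-line interface: the script takes a configuration file (-c) and a
# comma-separated list of person names (-p).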
parser = argparse.ArgumentParser(
    description=("Scrape LinkedIn profiles based on the "
                 "queries specified in the conf file")
)
parser.add_argument(
    '-c', '--conf',
    type=str,
    metavar='',
    required=True,
    help='Specify the path of the configuration file'
)
parser.add_argument(
    '-p', '--persons',
    type=str,
    metavar='',
    required=True,
    help=('Specify the names of the persons you want to search, '
          'separated by commas and delimited by quotes')
)
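
# Example invocation (file name and person names are illustrative):
#   python scrape_users.py -c conf.json -p "John Doe, Jane Smith"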
args = parser.parse_args()
conf = load_config(args.conf)
queries = load_queries(args.persons)
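
# A minimal sketch of the expected configuration, inferred from the keys
# read below; the actual on-disk format depends on utils.load_config:
#   {
#       "parameters": {
#           "CHROME_PATH": "/path/to/chrome",
#           "CHROMEDRIVER_PATH": "/path/to/chromedriver",
#           "N_PAGES": 3
#       },
#       "credentials": {
#           "LINUSERNAME": "user@example.com",
#           "LINPWD": "password"
#       }
#   }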
parameters = conf["parameters"]
credentials = conf["credentials"]
CHROME_PATH = parameters["CHROME_PATH"]
CHROMEDRIVER_PATH = parameters["CHROMEDRIVER_PATH"]
N_PAGES = parameters["N_PAGES"]
LINUSERNAME = credentials["LINUSERNAME"]
LINPWD = credentials["LINPWD"]
driver = init_driver(CHROME_PATH, CHROMEDRIVER_PATH)
driver.get("https://www.linkedin.com")
login(driver, LINUSERNAME, LINPWD)
us = UserScraper(driver)
users_data = []
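
# For each person, run a Google search, harvest LinkedIn profile URLs from
# the first N_PAGES of results, then scrape every profile found.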
for query in queries:
    driver.get("https://www.google.com")
    sleep(2)
    search_query = driver.find_element(By.NAME, 'q')
    try:
        search_query.send_keys(query)
    except ElementNotInteractableException:
        print("ERROR :: Cannot send query. Google might be blocking")
        sys.exit(1)
    sleep(0.5)
    search_query.send_keys(Keys.RETURN)
    profile_urls = get_profile_urls(driver, N_PAGES)
    if len(profile_urls) == 0:
        print("\nWARNING :: Could not get any URLs for the query: " + query)
        print("Please double-check that Google is not blocking the query")
        continue
    for url in profile_urls:
        users_data.append(us.scrape_user(query, url))
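
# Persist all scraped profiles under a random (uuid4) file name so repeated
# runs do not overwrite each other; this assumes a data/ directory exists.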
filename = 'data/' + str(uuid.uuid4()) + '.json'
with open(filename, 'w') as outfile:
    json.dump(users_data, outfile, ensure_ascii=False)
print("Data saved to " + filename)
driver.quit()