scrape_linkedin.py
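"""Scrape recent LinkedIn job postings (Singapore, full-time and internship roles) and save them to jobs.json."""
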
import json
import logging

from linkedin_jobs_scraper import LinkedinScraper
from linkedin_jobs_scraper.events import Events, EventData
from linkedin_jobs_scraper.query import Query, QueryOptions, QueryFilters
from linkedin_jobs_scraper.filters import RelevanceFilters, TimeFilters, TypeFilters, ExperienceLevelFilters, RemoteFilters

# Change root logger level (default is WARN)
logging.basicConfig(level=logging.INFO)

# Accumulates every scraped job so the whole batch can be written to disk at the end
cache = []

def on_data(data: EventData):
    # Collect the fields of interest for each scraped job posting
    scraped = {
        "job_id": data.job_id,
        "link": data.link,
        "apply_link": data.apply_link,
        "title": data.title,
        "company": data.company,
        "place": data.place,
        "description": data.description,
        "description_html": data.description_html,
        "date": data.date,
        "seniority_level": data.seniority_level,
        "job_function": data.job_function,
        "employment_type": data.employment_type,
        "industries": data.industries
    }
    cache.append(scraped)

def on_error(error):
    print('[ON_ERROR]', error)

def on_end():
    print('[ON_END]')

scraper = LinkedinScraper(
    chrome_executable_path=r'C:\Program Files\Google\chromedriver.exe',  # Custom Chrome executable path (e.g. /foo/bar/bin/chromedriver)
    chrome_options=None,  # Custom Chrome options here
    headless=True,  # Overrides headless mode only if chrome_options is None
    max_workers=1,  # How many threads will be spawned to run queries concurrently (one Chrome driver for each thread)
    slow_mo=0.4,  # Slow down the scraper to avoid 'Too many requests (429)' errors
)

# Add event listeners
scraper.on(Events.DATA, on_data)
scraper.on(Events.ERROR, on_error)
scraper.on(Events.END, on_end)

queries = [
    Query(
        options=QueryOptions(
            locations=['Singapore'],
            optimize=True,  # Blocks requests for resources like images and stylesheets
            limit=100,  # Limit the number of jobs to scrape
            filters=QueryFilters(
                relevance=RelevanceFilters.RECENT,
                time=TimeFilters.MONTH,
                type=[TypeFilters.FULL_TIME, TypeFilters.INTERNSHIP],
                experience=None,
            )
        )
    ),
]

scraper.run(queries)

# Persist all collected jobs once scraping has finished
with open('./jobs.json', 'w') as f:
    json.dump(cache, f, indent=4)

print(f"Operation completed. Scraped {len(cache)} jobs")