-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
17 changed files
with
206 additions
and
145 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,32 +22,34 @@ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) | |
|
||
```python | ||
from pathlib import Path | ||
from staffspy import scrape_staff, SolverType | ||
from staffspy import LinkedInAccount, SolverType | ||
|
||
session_file = Path(__file__).resolve().parent / "session.pkl" | ||
account = LinkedInAccount( | ||
## credentials - remove these to sign in with browser | ||
username="[email protected]", | ||
password="mypassword", | ||
solver_api_key="CAP-6D6A8CE981803A309A0D531F8B4790BC", # optional but needed if hit with captcha | ||
solver_service=SolverType.CAPSOLVER, | ||
|
||
session_file=str(session_file), # save login cookies to only log in once (lasts a week or so) | ||
log_level=1, # 0 for no logs | ||
) | ||
|
||
staff = scrape_staff( | ||
## staff filters | ||
# search by company | ||
staff = account.scrape_staff( | ||
company_name="openai", | ||
search_term="software engineer", | ||
location="london", | ||
extra_profile_data=True, # fetch all past experiences, schools, & skills | ||
## | ||
|
||
## config | ||
max_results=50, # can go up to 1000 | ||
session_file=str(session_file), # save login cookies to only log in once (lasts a week or so) | ||
log_level=1, # 0 for no logs | ||
## | ||
|
||
## credentials - remove these to sign in with browser | ||
username="[email protected]", | ||
password="mypassword", | ||
solver_api_key="CAP-6D6A8CE981803A309A0D531F8B4790BC", # optional but needed if hit with captcha | ||
solver_service=SolverType.CAPSOLVER | ||
## | ||
) | ||
filename = "staff.csv" | ||
staff.to_csv(filename, index=False) | ||
# or fetch by user ids | ||
users = account.scrape_users( | ||
user_ids=['williamhgates', 'rbranson', 'jeffweiner08'] | ||
) | ||
staff.to_csv("staff.csv", index=False) | ||
users.to_csv("users.csv", index=False) | ||
``` | ||
|
||
#### Browser login | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
[tool.poetry] | ||
name = "staffspy" | ||
version = "0.2.5" | ||
version = "0.2.6" | ||
description = "Staff scraper library for LinkedIn" | ||
authors = ["Cullen Watson <[email protected]>"] | ||
readme = "README.md" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,75 +1,115 @@ | ||
import pandas as pd | ||
|
||
from staffspy.linkedin.linkedin import LinkedInScraper | ||
from staffspy.utils.models import Staff | ||
from staffspy.solvers.capsolver import CapSolver | ||
from staffspy.solvers.solver_type import SolverType | ||
from staffspy.solvers.two_captcha import TwoCaptchaSolver | ||
from staffspy.utils.utils import set_logger_level, logger, Login | ||
|
||
from staffspy.utils import set_logger_level, logger, Login | ||
|
||
class LinkedInAccount: | ||
solver_map = { | ||
SolverType.CAPSOLVER: CapSolver, | ||
SolverType.TWO_CAPTCHA: TwoCaptchaSolver | ||
} | ||
|
||
def scrape_staff( | ||
*, | ||
company_name: str = None, | ||
user_id: str = None, | ||
session_file: str = None, | ||
search_term: str = None, | ||
location: str = None, | ||
extra_profile_data: bool = False, | ||
max_results: int = 1000, | ||
log_level: int = 0, | ||
username: str = None, | ||
password: str = None, | ||
solver_api_key: str = None, | ||
solver_service: SolverType = SolverType.CAPSOLVER | ||
def __init__( | ||
self, | ||
session_file: str = None, | ||
username: str = None, | ||
password: str = None, | ||
log_level: int = 0, | ||
solver_api_key: str = None, | ||
solver_service: SolverType = SolverType.CAPSOLVER | ||
): | ||
self.session_file = session_file | ||
self.username = username | ||
self.password = password | ||
self.log_level = log_level | ||
self.solver = self.solver_map[solver_service](solver_api_key) | ||
self.session = None | ||
self.linkedin_scraper = None | ||
self.login() | ||
|
||
) -> pd.DataFrame: | ||
"""Scrape staff from Linkedin | ||
company_name - name of company to find staff from | ||
user_id - alternative to company_name, fetches the company_name from the user profile | ||
session_file - place to save cookies to only sign in once | ||
search_term - occupation / term to search for at the company | ||
location - filter for staff at a location | ||
extra_profile_data - fetches staff's experiences, schools, and more | ||
max_results - amount of results you desire | ||
log_level - level of logs, 0 for no logs, 2 for all | ||
username, password - for requests-based sign in | ||
solver_api_key,solver_service - options to bypass captcha | ||
""" | ||
set_logger_level(log_level) | ||
def login(self): | ||
set_logger_level(self.log_level) | ||
login = Login(self.username, self.password, self.solver, self.session_file) | ||
self.session = login.load_session() | ||
|
||
solver=None | ||
if solver_service == SolverType.CAPSOLVER: | ||
solver = CapSolver(solver_api_key) | ||
elif solver_service == SolverType.TWO_CAPTCHA: | ||
solver = TwoCaptchaSolver(solver_api_key) | ||
login = Login(username, password, solver, session_file) | ||
session = login.load_session() | ||
def scrape_staff( | ||
self, | ||
company_name: str = None, | ||
user_id: str = None, | ||
search_term: str = None, | ||
location: str = None, | ||
extra_profile_data: bool = False, | ||
max_results: int = 1000 | ||
) -> pd.DataFrame: | ||
"""Scrape staff from Linkedin | ||
company_name - name of company to find staff from | ||
user_id - alternative to company_name, fetches the company_name from the user profile | ||
search_term - occupation / term to search for at the company | ||
location - filter for staff at a location | ||
extra_profile_data - fetches staff's experiences, schools, and more | ||
max_results - amount of results you desire | ||
""" | ||
li_scraper = LinkedInScraper(self.session) | ||
|
||
li = LinkedInScraper(session) | ||
if not company_name: | ||
if not user_id: | ||
raise ValueError("Either company_name or user_id must be provided") | ||
company_name = li_scraper.fetch_user_profile_data_from_public_id('company_id') | ||
|
||
if not company_name: | ||
if not user_id: | ||
raise ValueError("Either company_name or user_id must be provided") | ||
staff = li_scraper.scrape_staff( | ||
company_name=company_name, | ||
extra_profile_data=extra_profile_data, | ||
search_term=search_term, | ||
location=location, | ||
max_results=max_results, | ||
) | ||
staff_dicts = [staff.to_dict() for staff in staff] | ||
staff_df = pd.DataFrame(staff_dicts) | ||
|
||
company_name = li.fetch_company_id_from_user(user_id) | ||
if staff_df.empty: | ||
return staff_df | ||
linkedin_member_df = staff_df[staff_df["name"] == "LinkedIn Member"] | ||
non_linkedin_member_df = staff_df[staff_df["name"] != "LinkedIn Member"] | ||
staff_df = pd.concat([non_linkedin_member_df, linkedin_member_df]) | ||
logger.info(f"Scraped {len(staff_df)} staff members from {company_name}") | ||
return staff_df | ||
|
||
staff = li.scrape_staff( | ||
company_name=company_name, | ||
extra_profile_data=extra_profile_data, | ||
search_term=search_term, | ||
location=location, | ||
max_results=max_results, | ||
) | ||
staff_dicts = [staff.to_dict() for staff in staff] | ||
staff_df = pd.DataFrame(staff_dicts) | ||
def scrape_users( | ||
self, | ||
user_ids: list[str] | ||
) -> pd.DataFrame: | ||
"""Scrape users from Linkedin by user IDs | ||
user_ids - list of LinkedIn user IDs | ||
""" | ||
li_scraper = LinkedInScraper(self.session) | ||
li_scraper.num_staff = len(user_ids) | ||
users = [ | ||
Staff( | ||
id='', | ||
search_term='manual', | ||
profile_id=user_id, | ||
) for user_id in user_ids | ||
] | ||
|
||
if staff_df.empty: | ||
return staff_df | ||
linkedin_member_df = staff_df[staff_df["name"] == "LinkedIn Member"] | ||
non_linkedin_member_df = staff_df[staff_df["name"] != "LinkedIn Member"] | ||
staff_df = pd.concat([non_linkedin_member_df, linkedin_member_df]) | ||
logger.info( | ||
f"Scraped {len(staff_df)} staff members, with {len(linkedin_member_df)} hidden LinkedIn Members." | ||
) | ||
return staff_df | ||
for i, user in enumerate(users,start=1): | ||
user.id = li_scraper.fetch_user_profile_data_from_public_id(user.profile_id, 'user_id') | ||
if user.id: | ||
li_scraper.fetch_all_info_for_employee( | ||
user, i | ||
) | ||
|
||
users_dicts = [user.to_dict() for user in users if user.id] | ||
users_df = pd.DataFrame(users_dicts) | ||
|
||
if users_df.empty: | ||
return users_df | ||
linkedin_member_df = users_df[users_df["name"] == "LinkedIn Member"] | ||
non_linkedin_member_df = users_df[users_df["name"] != "LinkedIn Member"] | ||
users_df = pd.concat([non_linkedin_member_df, linkedin_member_df]) | ||
logger.info(f"Scraped {len(users_df)} users") | ||
return users_df |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.