Skip to content

Commit

Permalink
enh: search by user id (#34)
Browse files Browse the repository at this point in the history
  • Loading branch information
cullenwatson authored Aug 2, 2024
1 parent 4546e64 commit 0d116a9
Show file tree
Hide file tree
Showing 17 changed files with 206 additions and 145 deletions.
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2023 Cullen Watson
Copyright (c) 2024 Cullen Watson

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
38 changes: 20 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,33 +21,35 @@ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/)
### Usage

```python
from staffspy import scrape_staff, SolverType
from staffspy import LinkedInAccount, SolverType
from pathlib import Path

session_file = Path(__file__).resolve().parent / "session.pkl"
account = LinkedInAccount(
## credentials - remove these to sign in with browser
username="[email protected]",
password="mypassword",
solver_api_key="CAP-6D6A8CE981803A309A0D531F8B4790BC", # optional but needed if hit with captcha
solver_service=SolverType.CAPSOLVER,

session_file=str(session_file), # save login cookies to only log in once (lasts a week or so)
log_level=1, # 0 for no logs
)

staff = scrape_staff(
## staff filters
# search by company
staff = account.scrape_staff(
company_name="openai",
search_term="software engineer",
location="london",
extra_profile_data=True, # fetch all past experiences, schools, & skills
##

## config
max_results=50, # can go up to 1000
session_file=str(session_file), # save login cookies to only log in once (lasts a week or so)
log_level=1, # 0 for no logs
##

## credentials - remove these to sign in with browser
username="[email protected]",
password="mypassword",
solver_api_key="CAP-6D6A8CE981803A309A0D531F8B4790BC", # optional but needed if hit with captcha
solver_service=SolverType.CAPSOLVER
##
)
filename = "staff.csv"
staff.to_csv(filename, index=False)
# or fetch by user ids
users = account.scrape_users(
user_ids=['williamhgates', 'rbranson', 'jeffweiner08']
)
staff.to_csv("staff.csv", index=False)
users.to_csv("users.csv", index=False)
```

#### Browser login
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "staffspy"
version = "0.2.5"
version = "0.2.6"
description = "Staff scraper library for LinkedIn"
authors = ["Cullen Watson <[email protected]>"]
readme = "README.md"
Expand Down
158 changes: 99 additions & 59 deletions staffspy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,75 +1,115 @@
import pandas as pd

from staffspy.linkedin.linkedin import LinkedInScraper
from staffspy.utils.models import Staff
from staffspy.solvers.capsolver import CapSolver
from staffspy.solvers.solver_type import SolverType
from staffspy.solvers.two_captcha import TwoCaptchaSolver
from staffspy.utils.utils import set_logger_level, logger, Login

from staffspy.utils import set_logger_level, logger, Login

class LinkedInAccount:
solver_map = {
SolverType.CAPSOLVER: CapSolver,
SolverType.TWO_CAPTCHA: TwoCaptchaSolver
}

def scrape_staff(
*,
company_name: str = None,
user_id: str = None,
session_file: str = None,
search_term: str = None,
location: str = None,
extra_profile_data: bool = False,
max_results: int = 1000,
log_level: int = 0,
username: str = None,
password: str = None,
solver_api_key: str = None,
solver_service: SolverType = SolverType.CAPSOLVER
def __init__(
    self,
    session_file: str = None,
    username: str = None,
    password: str = None,
    log_level: int = 0,
    solver_api_key: str = None,
    solver_service: SolverType = SolverType.CAPSOLVER
):
    """Set up a LinkedIn account and sign in immediately.

    session_file - path used to persist login cookies so sign-in happens only once
    username, password - credentials for requests-based sign-in; omit to sign in via browser
    log_level - 0 for no logs; higher values are more verbose
    solver_api_key, solver_service - captcha solver configuration (optional, used if a captcha is hit)
    """
    self.session_file = session_file
    self.username = username
    self.password = password
    self.log_level = log_level
    # solver_map (class attribute) maps SolverType -> solver class; instantiate
    # the chosen solver with the provided API key
    self.solver = self.solver_map[solver_service](solver_api_key)
    self.session = None
    self.linkedin_scraper = None
    # log in eagerly so the account is ready to scrape right after construction
    self.login()

) -> pd.DataFrame:
"""Scrape staff from Linkedin
company_name - name of company to find staff frame
user_id - alternative to company_name, fetches the company_name from the user profile
session_file - place to save cookies to only sign in once
search_term - occupation / term to search for at the company
location - filter for staff at a location
extra_profile_data - fetches staff's experiences, schools, and more
max_results - amount of results you desire
log_level - level of logs, 0 for no logs, 2 for all
username, password - for requests-based sign in
solver_api_key,solver_service - options to bypass captcha
"""
set_logger_level(log_level)
def login(self):
    """Configure logging and load an authenticated LinkedIn session.

    Delegates to Login, which handles credential/browser sign-in, captcha
    solving via self.solver, and cookie persistence through session_file.
    """
    set_logger_level(self.log_level)
    login = Login(self.username, self.password, self.solver, self.session_file)
    self.session = login.load_session()

solver=None
if solver_service == SolverType.CAPSOLVER:
solver = CapSolver(solver_api_key)
elif solver_service == SolverType.TWO_CAPTCHA:
solver = TwoCaptchaSolver(solver_api_key)
login = Login(username, password, solver, session_file)
session = login.load_session()
def scrape_staff(
self,
company_name: str = None,
user_id: str = None,
search_term: str = None,
location: str = None,
extra_profile_data: bool = False,
max_results: int = 1000
) -> pd.DataFrame:
"""Scrape staff from Linkedin
company_name - name of company to find staff frame
user_id - alternative to company_name, fetches the company_name from the user profile
search_term - occupation / term to search for at the company
location - filter for staff at a location
extra_profile_data - fetches staff's experiences, schools, and more
max_results - amount of results you desire
"""
li_scraper = LinkedInScraper(self.session)

li = LinkedInScraper(session)
if not company_name:
if not user_id:
raise ValueError("Either company_name or user_id must be provided")
company_name = li_scraper.fetch_user_profile_data_from_public_id('company_id')

if not company_name:
if not user_id:
raise ValueError("Either company_name or user_id must be provided")
staff = li_scraper.scrape_staff(
company_name=company_name,
extra_profile_data=extra_profile_data,
search_term=search_term,
location=location,
max_results=max_results,
)
staff_dicts = [staff.to_dict() for staff in staff]
staff_df = pd.DataFrame(staff_dicts)

company_name = li.fetch_company_id_from_user(user_id)
if staff_df.empty:
return staff_df
linkedin_member_df = staff_df[staff_df["name"] == "LinkedIn Member"]
non_linkedin_member_df = staff_df[staff_df["name"] != "LinkedIn Member"]
staff_df = pd.concat([non_linkedin_member_df, linkedin_member_df])
logger.info(f"Scraped {len(staff_df)} staff members from {company_name}")
return staff_df

staff = li.scrape_staff(
company_name=company_name,
extra_profile_data=extra_profile_data,
search_term=search_term,
location=location,
max_results=max_results,
)
staff_dicts = [staff.to_dict() for staff in staff]
staff_df = pd.DataFrame(staff_dicts)
def scrape_users(
self,
user_ids: list[str]
) -> pd.DataFrame:
"""Scrape users from Linkedin by user IDs
user_ids - list of LinkedIn user IDs
"""
li_scraper = LinkedInScraper(self.session)
li_scraper.num_staff = len(user_ids)
users = [
Staff(
id='',
search_term='manual',
profile_id=user_id,
) for user_id in user_ids
]

if staff_df.empty:
return staff_df
linkedin_member_df = staff_df[staff_df["name"] == "LinkedIn Member"]
non_linkedin_member_df = staff_df[staff_df["name"] != "LinkedIn Member"]
staff_df = pd.concat([non_linkedin_member_df, linkedin_member_df])
logger.info(
f"Scraped {len(staff_df)} staff members, with {len(linkedin_member_df)} hidden LinkedIn Members."
)
return staff_df
for i, user in enumerate(users,start=1):
user.id = li_scraper.fetch_user_profile_data_from_public_id(user.profile_id, 'user_id')
if user.id:
li_scraper.fetch_all_info_for_employee(
user, i
)

users_dicts = [user.to_dict() for user in users if user.id]
users_df = pd.DataFrame(users_dicts)

if users_df.empty:
return users_df
linkedin_member_df = users_df[users_df["name"] == "LinkedIn Member"]
non_linkedin_member_df = users_df[users_df["name"] != "LinkedIn Member"]
users_df = pd.concat([non_linkedin_member_df, linkedin_member_df])
logger.info(f"Scraped {len(users_df)} users")
return users_df
4 changes: 2 additions & 2 deletions staffspy/linkedin/certifications.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import json
import logging

from staffspy.exceptions import TooManyRequests
from staffspy.models import Certification
from staffspy.utils.exceptions import TooManyRequests
from staffspy.utils.models import Certification

logger = logging.getLogger(__name__)

Expand Down
14 changes: 11 additions & 3 deletions staffspy/linkedin/employee.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
import logging
import re

import staffspy.utils as utils
from staffspy.exceptions import TooManyRequests
import staffspy.utils.utils as utils
from staffspy.utils.exceptions import TooManyRequests
from staffspy.utils.models import Staff

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -40,7 +41,7 @@ def fetch_employee(self, base_staff, domain):
self.parse_emp(base_staff, employee_json)
return True

def parse_emp(self, emp, emp_dict):
def parse_emp(self, emp: Staff, emp_dict: dict):
"""Parse the employee data from the employee profile."""
try:
photo_data = emp_dict["profilePicture"]["displayImageReference"][
Expand All @@ -53,7 +54,14 @@ def parse_emp(self, emp, emp_dict):
profile_photo = None

emp.profile_id = emp_dict["publicIdentifier"]
try:
emp.headline = emp_dict.get('headline')
if not emp.headline:
emp.headline = emp_dict['memberRelationship']['memberRelationshipData']['noInvitation']['targetInviteeResolutionResult']['headline']
except:
pass
emp.is_connection = next(iter(emp_dict['memberRelationship']['memberRelationshipUnion'])) == 'connection'
emp.open_to_work = emp_dict['profilePicture'].get('frameType')=='OPEN_TO_WORK'

emp.profile_link = f'https://www.linkedin.com/in/{emp_dict["publicIdentifier"]}'

Expand Down
2 changes: 1 addition & 1 deletion staffspy/linkedin/employee_bio.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
import logging

from staffspy.exceptions import TooManyRequests
from staffspy.utils.exceptions import TooManyRequests

logger = logging.getLogger(__name__)

Expand Down
6 changes: 3 additions & 3 deletions staffspy/linkedin/experiences.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import json
import logging

import staffspy.utils as utils
from staffspy.exceptions import TooManyRequests
from staffspy.models import Experience
import staffspy.utils.utils as utils
from staffspy.utils.exceptions import TooManyRequests
from staffspy.utils.models import Experience

logger = logging.getLogger(__name__)

Expand Down
Loading

0 comments on commit 0d116a9

Please sign in to comment.