Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enh: search by user id #34

Merged
merged 1 commit into from
Aug 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2023 Cullen Watson
Copyright (c) 2024 Cullen Watson

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
38 changes: 20 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,33 +21,35 @@ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/)
### Usage

```python
from staffspy import scrape_staff, SolverType
from staffspy import LinkedInAccount, SolverType
from pathlib import Path

session_file = Path(__file__).resolve().parent / "session.pkl"
account = LinkedInAccount(
## credentials - remove these to sign in with browser
username="[email protected]",
password="mypassword",
solver_api_key="CAP-6D6A8CE981803A309A0D531F8B4790BC", # optional but needed if hit with captcha
solver_service=SolverType.CAPSOLVER,

session_file=str(session_file), # save login cookies to only log in once (lasts a week or so)
log_level=1, # 0 for no logs
)

staff = scrape_staff(
## staff filters
# search by company
staff = account.scrape_staff(
company_name="openai",
search_term="software engineer",
location="london",
extra_profile_data=True, # fetch all past experiences, schools, & skills
##

## config
max_results=50, # can go up to 1000
session_file=str(session_file), # save login cookies to only log in once (lasts a week or so)
log_level=1, # 0 for no logs
##

## credentials - remove these to sign in with browser
username="[email protected]",
password="mypassword",
solver_api_key="CAP-6D6A8CE981803A309A0D531F8B4790BC", # optional but needed if hit with captcha
solver_service=SolverType.CAPSOLVER
##
)
filename = "staff.csv"
staff.to_csv(filename, index=False)
# or fetch by user ids
users = account.scrape_users(
user_ids=['williamhgates', 'rbranson', 'jeffweiner08']
)
staff.to_csv("staff.csv", index=False)
users.to_csv("users.csv", index=False)
```

#### Browser login
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "staffspy"
version = "0.2.5"
version = "0.2.6"
description = "Staff scraper library for LinkedIn"
authors = ["Cullen Watson <[email protected]>"]
readme = "README.md"
Expand Down
158 changes: 99 additions & 59 deletions staffspy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,75 +1,115 @@
import pandas as pd

from staffspy.linkedin.linkedin import LinkedInScraper
from staffspy.utils.models import Staff
from staffspy.solvers.capsolver import CapSolver
from staffspy.solvers.solver_type import SolverType
from staffspy.solvers.two_captcha import TwoCaptchaSolver
from staffspy.utils.utils import set_logger_level, logger, Login

from staffspy.utils import set_logger_level, logger, Login

class LinkedInAccount:
solver_map = {
SolverType.CAPSOLVER: CapSolver,
SolverType.TWO_CAPTCHA: TwoCaptchaSolver
}

def scrape_staff(
*,
company_name: str = None,
user_id: str = None,
session_file: str = None,
search_term: str = None,
location: str = None,
extra_profile_data: bool = False,
max_results: int = 1000,
log_level: int = 0,
username: str = None,
password: str = None,
solver_api_key: str = None,
solver_service: SolverType = SolverType.CAPSOLVER
def __init__(
self,
session_file: str = None,
username: str = None,
password: str = None,
log_level: int = 0,
solver_api_key: str = None,
solver_service: SolverType = SolverType.CAPSOLVER
):
self.session_file = session_file
self.username = username
self.password = password
self.log_level = log_level
self.solver = self.solver_map[solver_service](solver_api_key)
self.session = None
self.linkedin_scraper = None
self.login()

) -> pd.DataFrame:
"""Scrape staff from Linkedin
company_name - name of company to find staff from
user_id - alternative to company_name, fetches the company_name from the user profile
session_file - place to save cookies to only sign in once
search_term - occupation / term to search for at the company
location - filter for staff at a location
extra_profile_data - fetches staff's experiences, schools, and more
max_results - amount of results you desire
log_level - level of logs, 0 for no logs, 2 for all
username, password - for requests-based sign in
solver_api_key,solver_service - options to bypass captcha
"""
set_logger_level(log_level)
def login(self):
set_logger_level(self.log_level)
login = Login(self.username, self.password, self.solver, self.session_file)
self.session = login.load_session()

solver=None
if solver_service == SolverType.CAPSOLVER:
solver = CapSolver(solver_api_key)
elif solver_service == SolverType.TWO_CAPTCHA:
solver = TwoCaptchaSolver(solver_api_key)
login = Login(username, password, solver, session_file)
session = login.load_session()
def scrape_staff(
self,
company_name: str = None,
user_id: str = None,
search_term: str = None,
location: str = None,
extra_profile_data: bool = False,
max_results: int = 1000
) -> pd.DataFrame:
"""Scrape staff from Linkedin
company_name - name of company to find staff from
user_id - alternative to company_name, fetches the company_name from the user profile
search_term - occupation / term to search for at the company
location - filter for staff at a location
extra_profile_data - fetches staff's experiences, schools, and more
max_results - amount of results you desire
"""
li_scraper = LinkedInScraper(self.session)

li = LinkedInScraper(session)
if not company_name:
if not user_id:
raise ValueError("Either company_name or user_id must be provided")
company_name = li_scraper.fetch_user_profile_data_from_public_id('company_id')

if not company_name:
if not user_id:
raise ValueError("Either company_name or user_id must be provided")
staff = li_scraper.scrape_staff(
company_name=company_name,
extra_profile_data=extra_profile_data,
search_term=search_term,
location=location,
max_results=max_results,
)
staff_dicts = [staff.to_dict() for staff in staff]
staff_df = pd.DataFrame(staff_dicts)

company_name = li.fetch_company_id_from_user(user_id)
if staff_df.empty:
return staff_df
linkedin_member_df = staff_df[staff_df["name"] == "LinkedIn Member"]
non_linkedin_member_df = staff_df[staff_df["name"] != "LinkedIn Member"]
staff_df = pd.concat([non_linkedin_member_df, linkedin_member_df])
logger.info(f"Scraped {len(staff_df)} staff members from {company_name}")
return staff_df

staff = li.scrape_staff(
company_name=company_name,
extra_profile_data=extra_profile_data,
search_term=search_term,
location=location,
max_results=max_results,
)
staff_dicts = [staff.to_dict() for staff in staff]
staff_df = pd.DataFrame(staff_dicts)
def scrape_users(
self,
user_ids: list[str]
) -> pd.DataFrame:
"""Scrape users from Linkedin by user IDs
user_ids - list of LinkedIn user IDs
"""
li_scraper = LinkedInScraper(self.session)
li_scraper.num_staff = len(user_ids)
users = [
Staff(
id='',
search_term='manual',
profile_id=user_id,
) for user_id in user_ids
]

if staff_df.empty:
return staff_df
linkedin_member_df = staff_df[staff_df["name"] == "LinkedIn Member"]
non_linkedin_member_df = staff_df[staff_df["name"] != "LinkedIn Member"]
staff_df = pd.concat([non_linkedin_member_df, linkedin_member_df])
logger.info(
f"Scraped {len(staff_df)} staff members, with {len(linkedin_member_df)} hidden LinkedIn Members."
)
return staff_df
for i, user in enumerate(users,start=1):
user.id = li_scraper.fetch_user_profile_data_from_public_id(user.profile_id, 'user_id')
if user.id:
li_scraper.fetch_all_info_for_employee(
user, i
)

users_dicts = [user.to_dict() for user in users if user.id]
users_df = pd.DataFrame(users_dicts)

if users_df.empty:
return users_df
linkedin_member_df = users_df[users_df["name"] == "LinkedIn Member"]
non_linkedin_member_df = users_df[users_df["name"] != "LinkedIn Member"]
users_df = pd.concat([non_linkedin_member_df, linkedin_member_df])
logger.info(f"Scraped {len(users_df)} users")
return users_df
4 changes: 2 additions & 2 deletions staffspy/linkedin/certifications.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import json
import logging

from staffspy.exceptions import TooManyRequests
from staffspy.models import Certification
from staffspy.utils.exceptions import TooManyRequests
from staffspy.utils.models import Certification

logger = logging.getLogger(__name__)

Expand Down
14 changes: 11 additions & 3 deletions staffspy/linkedin/employee.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
import logging
import re

import staffspy.utils as utils
from staffspy.exceptions import TooManyRequests
import staffspy.utils.utils as utils
from staffspy.utils.exceptions import TooManyRequests
from staffspy.utils.models import Staff

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -40,7 +41,7 @@ def fetch_employee(self, base_staff, domain):
self.parse_emp(base_staff, employee_json)
return True

def parse_emp(self, emp, emp_dict):
def parse_emp(self, emp: Staff, emp_dict: dict):
"""Parse the employee data from the employee profile."""
try:
photo_data = emp_dict["profilePicture"]["displayImageReference"][
Expand All @@ -53,7 +54,14 @@ def parse_emp(self, emp, emp_dict):
profile_photo = None

emp.profile_id = emp_dict["publicIdentifier"]
try:
emp.headline = emp_dict.get('headline')
if not emp.headline:
emp.headline = emp_dict['memberRelationship']['memberRelationshipData']['noInvitation']['targetInviteeResolutionResult']['headline']
except:
pass
emp.is_connection = next(iter(emp_dict['memberRelationship']['memberRelationshipUnion'])) == 'connection'
emp.open_to_work = emp_dict['profilePicture'].get('frameType')=='OPEN_TO_WORK'

emp.profile_link = f'https://www.linkedin.com/in/{emp_dict["publicIdentifier"]}'

Expand Down
2 changes: 1 addition & 1 deletion staffspy/linkedin/employee_bio.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
import logging

from staffspy.exceptions import TooManyRequests
from staffspy.utils.exceptions import TooManyRequests

logger = logging.getLogger(__name__)

Expand Down
6 changes: 3 additions & 3 deletions staffspy/linkedin/experiences.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import json
import logging

import staffspy.utils as utils
from staffspy.exceptions import TooManyRequests
from staffspy.models import Experience
import staffspy.utils.utils as utils
from staffspy.utils.exceptions import TooManyRequests
from staffspy.utils.models import Experience

logger = logging.getLogger(__name__)

Expand Down
Loading
Loading