Skip to content

Commit

Permalink
enh: search by user id (#34)
Browse files Browse the repository at this point in the history
  • Loading branch information
cullenwatson authored Aug 2, 2024
1 parent 4546e64 commit 0d116a9
Show file tree
Hide file tree
Showing 17 changed files with 206 additions and 145 deletions.
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2023 Cullen Watson
Copyright (c) 2024 Cullen Watson

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down
38 changes: 20 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,33 +21,35 @@ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/)
### Usage

```python
from staffspy import scrape_staff, SolverType
from staffspy import LinkedInAccount, SolverType
from pathlib import Path

session_file = Path(__file__).resolve().parent / "session.pkl"
account = LinkedInAccount(
## credentials - remove these to sign in with browser
username="[email protected]",
password="mypassword",
solver_api_key="CAP-6D6A8CE981803A309A0D531F8B4790BC", # optional but needed if hit with captcha
solver_service=SolverType.CAPSOLVER,

session_file=str(session_file), # save login cookies to only log in once (lasts a week or so)
log_level=1, # 0 for no logs
)

staff = scrape_staff(
## staff filters
# search by company
staff = account.scrape_staff(
company_name="openai",
search_term="software engineer",
location="london",
extra_profile_data=True, # fetch all past experiences, schools, & skills
##

## config
max_results=50, # can go up to 1000
session_file=str(session_file), # save login cookies to only log in once (lasts a week or so)
log_level=1, # 0 for no logs
##

## credentials - remove these to sign in with browser
username="[email protected]",
password="mypassword",
solver_api_key="CAP-6D6A8CE981803A309A0D531F8B4790BC", # optional but needed if hit with captcha
solver_service=SolverType.CAPSOLVER
##
)
filename = "staff.csv"
staff.to_csv(filename, index=False)
# or fetch by user ids
users = account.scrape_users(
user_ids=['williamhgates', 'rbranson', 'jeffweiner08']
)
staff.to_csv("staff.csv", index=False)
users.to_csv("users.csv", index=False)
```

#### Browser login
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "staffspy"
version = "0.2.5"
version = "0.2.6"
description = "Staff scraper library for LinkedIn"
authors = ["Cullen Watson <[email protected]>"]
readme = "README.md"
Expand Down
158 changes: 99 additions & 59 deletions staffspy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,75 +1,115 @@
import pandas as pd

from staffspy.linkedin.linkedin import LinkedInScraper
from staffspy.utils.models import Staff
from staffspy.solvers.capsolver import CapSolver
from staffspy.solvers.solver_type import SolverType
from staffspy.solvers.two_captcha import TwoCaptchaSolver
from staffspy.utils.utils import set_logger_level, logger, Login

from staffspy.utils import set_logger_level, logger, Login

class LinkedInAccount:
solver_map = {
SolverType.CAPSOLVER: CapSolver,
SolverType.TWO_CAPTCHA: TwoCaptchaSolver
}

def scrape_staff(
*,
company_name: str = None,
user_id: str = None,
session_file: str = None,
search_term: str = None,
location: str = None,
extra_profile_data: bool = False,
max_results: int = 1000,
log_level: int = 0,
username: str = None,
password: str = None,
solver_api_key: str = None,
solver_service: SolverType = SolverType.CAPSOLVER
def __init__(
    self,
    session_file: str = None,
    username: str = None,
    password: str = None,
    log_level: int = 0,
    solver_api_key: str = None,
    solver_service: SolverType = SolverType.CAPSOLVER
):
    """Set up a LinkedIn account and sign in immediately.

    session_file - path used to persist login cookies so sign-in happens only once
    username, password - credentials for requests-based sign-in; omit to sign in via browser
    log_level - 0 for no logs; higher values are more verbose
    solver_api_key, solver_service - captcha solver configuration (optional, used if a captcha is hit)
    """
    self.session_file = session_file
    self.username = username
    self.password = password
    self.log_level = log_level
    # solver_map (class attribute) maps SolverType -> solver class; instantiate
    # the chosen solver with the provided API key
    self.solver = self.solver_map[solver_service](solver_api_key)
    self.session = None
    self.linkedin_scraper = None
    # log in eagerly so the account is ready to scrape right after construction
    self.login()

) -> pd.DataFrame:
"""Scrape staff from Linkedin
company_name - name of company to find staff frame
user_id - alternative to company_name, fetches the company_name from the user profile
session_file - place to save cookies to only sign in once
search_term - occupation / term to search for at the company
location - filter for staff at a location
extra_profile_data - fetches staff's experiences, schools, and more
max_results - amount of results you desire
log_level - level of logs, 0 for no logs, 2 for all
username, password - for requests-based sign in
solver_api_key,solver_service - options to bypass captcha
"""
set_logger_level(log_level)
def login(self):
    """Configure logging and load an authenticated LinkedIn session.

    Delegates to Login, which handles credential/browser sign-in, captcha
    solving via self.solver, and cookie persistence through session_file.
    """
    set_logger_level(self.log_level)
    login = Login(self.username, self.password, self.solver, self.session_file)
    self.session = login.load_session()

solver=None
if solver_service == SolverType.CAPSOLVER:
solver = CapSolver(solver_api_key)
elif solver_service == SolverType.TWO_CAPTCHA:
solver = TwoCaptchaSolver(solver_api_key)
login = Login(username, password, solver, session_file)
session = login.load_session()
def scrape_staff(
self,
company_name: str = None,
user_id: str = None,
search_term: str = None,
location: str = None,
extra_profile_data: bool = False,
max_results: int = 1000
) -> pd.DataFrame:
"""Scrape staff from Linkedin
company_name - name of company to find staff frame
user_id - alternative to company_name, fetches the company_name from the user profile
search_term - occupation / term to search for at the company
location - filter for staff at a location
extra_profile_data - fetches staff's experiences, schools, and more
max_results - amount of results you desire
"""
li_scraper = LinkedInScraper(self.session)

li = LinkedInScraper(session)
if not company_name:
if not user_id:
raise ValueError("Either company_name or user_id must be provided")
company_name = li_scraper.fetch_user_profile_data_from_public_id('company_id')

if not company_name:
if not user_id:
raise ValueError("Either company_name or user_id must be provided")
staff = li_scraper.scrape_staff(
company_name=company_name,
extra_profile_data=extra_profile_data,
search_term=search_term,
location=location,
max_results=max_results,
)
staff_dicts = [staff.to_dict() for staff in staff]
staff_df = pd.DataFrame(staff_dicts)

company_name = li.fetch_company_id_from_user(user_id)
if staff_df.empty:
return staff_df
linkedin_member_df = staff_df[staff_df["name"] == "LinkedIn Member"]
non_linkedin_member_df = staff_df[staff_df["name"] != "LinkedIn Member"]
staff_df = pd.concat([non_linkedin_member_df, linkedin_member_df])
logger.info(f"Scraped {len(staff_df)} staff members from {company_name}")
return staff_df

staff = li.scrape_staff(
company_name=company_name,
extra_profile_data=extra_profile_data,
search_term=search_term,
location=location,
max_results=max_results,
)
staff_dicts = [staff.to_dict() for staff in staff]
staff_df = pd.DataFrame(staff_dicts)
def scrape_users(
self,
user_ids: list[str]
) -> pd.DataFrame:
"""Scrape users from Linkedin by user IDs
user_ids - list of LinkedIn user IDs
"""
li_scraper = LinkedInScraper(self.session)
li_scraper.num_staff = len(user_ids)
users = [
Staff(
id='',
search_term='manual',
profile_id=user_id,
) for user_id in user_ids
]

if staff_df.empty:
return staff_df
linkedin_member_df = staff_df[staff_df["name"] == "LinkedIn Member"]
non_linkedin_member_df = staff_df[staff_df["name"] != "LinkedIn Member"]
staff_df = pd.concat([non_linkedin_member_df, linkedin_member_df])
logger.info(
f"Scraped {len(staff_df)} staff members, with {len(linkedin_member_df)} hidden LinkedIn Members."
)
return staff_df
for i, user in enumerate(users,start=1):
user.id = li_scraper.fetch_user_profile_data_from_public_id(user.profile_id, 'user_id')
if user.id:
li_scraper.fetch_all_info_for_employee(
user, i
)

users_dicts = [user.to_dict() for user in users if user.id]
users_df = pd.DataFrame(users_dicts)

if users_df.empty:
return users_df
linkedin_member_df = users_df[users_df["name"] == "LinkedIn Member"]
non_linkedin_member_df = users_df[users_df["name"] != "LinkedIn Member"]
users_df = pd.concat([non_linkedin_member_df, linkedin_member_df])
logger.info(f"Scraped {len(users_df)} users")
return users_df
4 changes: 2 additions & 2 deletions staffspy/linkedin/certifications.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import json
import logging

from staffspy.exceptions import TooManyRequests
from staffspy.models import Certification
from staffspy.utils.exceptions import TooManyRequests
from staffspy.utils.models import Certification

logger = logging.getLogger(__name__)

Expand Down
14 changes: 11 additions & 3 deletions staffspy/linkedin/employee.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
import logging
import re

import staffspy.utils as utils
from staffspy.exceptions import TooManyRequests
import staffspy.utils.utils as utils
from staffspy.utils.exceptions import TooManyRequests
from staffspy.utils.models import Staff

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -40,7 +41,7 @@ def fetch_employee(self, base_staff, domain):
self.parse_emp(base_staff, employee_json)
return True

def parse_emp(self, emp, emp_dict):
def parse_emp(self, emp: Staff, emp_dict: dict):
"""Parse the employee data from the employee profile."""
try:
photo_data = emp_dict["profilePicture"]["displayImageReference"][
Expand All @@ -53,7 +54,14 @@ def parse_emp(self, emp, emp_dict):
profile_photo = None

emp.profile_id = emp_dict["publicIdentifier"]
try:
emp.headline = emp_dict.get('headline')
if not emp.headline:
emp.headline = emp_dict['memberRelationship']['memberRelationshipData']['noInvitation']['targetInviteeResolutionResult']['headline']
except:
pass
emp.is_connection = next(iter(emp_dict['memberRelationship']['memberRelationshipUnion'])) == 'connection'
emp.open_to_work = emp_dict['profilePicture'].get('frameType')=='OPEN_TO_WORK'

emp.profile_link = f'https://www.linkedin.com/in/{emp_dict["publicIdentifier"]}'

Expand Down
2 changes: 1 addition & 1 deletion staffspy/linkedin/employee_bio.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
import logging

from staffspy.exceptions import TooManyRequests
from staffspy.utils.exceptions import TooManyRequests

logger = logging.getLogger(__name__)

Expand Down
6 changes: 3 additions & 3 deletions staffspy/linkedin/experiences.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import json
import logging

import staffspy.utils as utils
from staffspy.exceptions import TooManyRequests
from staffspy.models import Experience
import staffspy.utils.utils as utils
from staffspy.utils.exceptions import TooManyRequests
from staffspy.utils.models import Experience

logger = logging.getLogger(__name__)

Expand Down
Loading

0 comments on commit 0d116a9

Please sign in to comment.