From ed61c1dd6413747a55c613c652d3dbd875331802 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Sun, 29 Sep 2024 14:21:26 -0500 Subject: [PATCH] Browser type (#43) --- README.md | 11 ++++++- pyproject.toml | 2 +- staffspy/__init__.py | 71 ++++++++++++++++++++++++----------------- staffspy/utils/utils.py | 59 ++++++++++++++++++++++++---------- 4 files changed, 94 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index bfeb416..36ed4e3 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ _Python version >= [3.10](https://www.python.org/downloads/release/python-3100/) ```python from pathlib import Path -from staffspy import LinkedInAccount, SolverType +from staffspy import LinkedInAccount, SolverType, DriverType, BrowserType session_file = Path(__file__).resolve().parent / "session.pkl" account = LinkedInAccount( @@ -31,6 +31,12 @@ account = LinkedInAccount( # password="mypassword", # solver_api_key="your-api-key", # solver_service=SolverType.TWO_CAPTCHA, + + # if issues with webdriver, specify + # driver_type=DriverType( + # browser_type=BrowserType.CHROME, + # executable_path="/Users/pc/chromedriver-mac-arm64/chromedriver" + # ), session_file=str(session_file), # save login cookies to only log in once (lasts a week or so) log_level=1, # 0 for no logs @@ -84,6 +90,9 @@ Optional ├── password (str): | linkedin account password | +├── driver_type (DriverType): +| signs in with the given BrowserType (Chrome, Firefox) and executable_path +| ├── solver_service (SolverType): | solves the captcha using the desired service - either CapSolver, or 2Captcha (worse of the two) | diff --git a/pyproject.toml b/pyproject.toml index 115c8c9..9fd1c7b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "staffspy" -version = "0.2.12" +version = "0.2.13" description = "Staff scraper library for LinkedIn" authors = ["Cullen Watson "] readme = "README.md" diff --git a/staffspy/__init__.py b/staffspy/__init__.py index 3fabaa8..3a21f0f 
100644 --- a/staffspy/__init__.py +++ b/staffspy/__init__.py @@ -6,45 +6,54 @@ from staffspy.solvers.solver_type import SolverType from staffspy.solvers.two_captcha import TwoCaptchaSolver from staffspy.utils.utils import set_logger_level, logger, Login +from staffspy.utils.driver_type import DriverType, BrowserType class LinkedInAccount: solver_map = { SolverType.CAPSOLVER: CapSolver, - SolverType.TWO_CAPTCHA: TwoCaptchaSolver + SolverType.TWO_CAPTCHA: TwoCaptchaSolver, } def __init__( - self, - session_file: str = None, - username: str = None, - password: str = None, - log_level: int = 0, - solver_api_key: str = None, - solver_service: SolverType = SolverType.CAPSOLVER + self, + session_file: str = None, + username: str = None, + password: str = None, + log_level: int = 0, + solver_api_key: str = None, + solver_service: SolverType = SolverType.CAPSOLVER, + driver_type: DriverType = None, ): self.session_file = session_file self.username = username self.password = password self.log_level = log_level self.solver = self.solver_map[solver_service](solver_api_key) + self.driver_type = driver_type self.session = None self.linkedin_scraper = None self.login() def login(self): set_logger_level(self.log_level) - login = Login(self.username, self.password, self.solver, self.session_file) + login = Login( + self.username, + self.password, + self.solver, + self.session_file, + self.driver_type, + ) self.session = login.load_session() def scrape_staff( - self, - company_name: str = None, - user_id: str = None, - search_term: str = None, - location: str = None, - extra_profile_data: bool = False, - max_results: int = 1000 + self, + company_name: str = None, + user_id: str = None, + search_term: str = None, + location: str = None, + extra_profile_data: bool = False, + max_results: int = 1000, ) -> pd.DataFrame: """Scrape staff from Linkedin company_name - name of company to find staff frame @@ -59,7 +68,9 @@ def scrape_staff( if not company_name: if not user_id: raise ValueError("Either 
company_name or user_id must be provided") - company_name = li_scraper.fetch_user_profile_data_from_public_id('company_id') + company_name = li_scraper.fetch_user_profile_data_from_public_id( + "company_id" + ) staff = li_scraper.scrape_staff( company_name=company_name, @@ -76,13 +87,12 @@ def scrape_staff( linkedin_member_df = staff_df[staff_df["name"] == "LinkedIn Member"] non_linkedin_member_df = staff_df[staff_df["name"] != "LinkedIn Member"] staff_df = pd.concat([non_linkedin_member_df, linkedin_member_df]) - logger.info(f"Scraped {len(staff_df)} staff members from {company_name}, with {len(linkedin_member_df)} hidden LinkedIn users") + logger.info( + f"Scraped {len(staff_df)} staff members from {company_name}, with {len(linkedin_member_df)} hidden LinkedIn users" + ) return staff_df - def scrape_users( - self, - user_ids: list[str] - ) -> pd.DataFrame: + def scrape_users(self, user_ids: list[str]) -> pd.DataFrame: """Scrape users from Linkedin by user IDs user_ids - list of LinkedIn user IDs """ @@ -90,18 +100,19 @@ def scrape_users( li_scraper.num_staff = len(user_ids) users = [ Staff( - id='', - search_term='manual', + id="", + search_term="manual", profile_id=user_id, - ) for user_id in user_ids + ) + for user_id in user_ids ] - for i, user in enumerate(users,start=1): - user.id = li_scraper.fetch_user_profile_data_from_public_id(user.profile_id, 'user_id') + for i, user in enumerate(users, start=1): + user.id = li_scraper.fetch_user_profile_data_from_public_id( + user.profile_id, "user_id" + ) if user.id: - li_scraper.fetch_all_info_for_employee( - user, i - ) + li_scraper.fetch_all_info_for_employee(user, i) users_dicts = [user.to_dict() for user in users if user.id] users_df = pd.DataFrame(users_dicts) diff --git a/staffspy/utils/utils.py b/staffspy/utils/utils.py index 9a8d101..c8f7e02 100644 --- a/staffspy/utils/utils.py +++ b/staffspy/utils/utils.py @@ -3,16 +3,18 @@ import pickle import re from datetime import datetime +from typing import Optional 
from urllib.parse import quote -from dateutil.parser import parse import requests import tldextract from bs4 import BeautifulSoup +from dateutil.parser import parse from tenacity import stop_after_attempt, retry_if_exception_type, retry, RetryError -from staffspy.utils.exceptions import BlobException from staffspy.solvers.solver import Solver +from staffspy.utils.driver_type import DriverType, BrowserType +from staffspy.utils.exceptions import BlobException logger = logging.getLogger("StaffSpy") logger.propagate = False @@ -50,32 +52,55 @@ def create_emails(first, last, domain): return emails -def get_webdriver(): +def get_webdriver(driver_type: Optional[DriverType] = None): try: from selenium import webdriver - from selenium.common.exceptions import WebDriverException + from selenium.webdriver.chrome.service import Service as ChromeService + from selenium.webdriver.firefox.service import Service as FirefoxService except ImportError as e: raise Exception( "install package `pip install staffspy[browser]` to login with browser" ) - for browser in [webdriver.Chrome, webdriver.Firefox]: - try: - return browser() - except WebDriverException: - continue + if driver_type: + if str(driver_type.browser_type) == str(BrowserType.CHROME): + if driver_type.executable_path: + service = ChromeService(executable_path=driver_type.executable_path) + return webdriver.Chrome(service=service) + else: + return webdriver.Chrome() + elif str(driver_type.browser_type) == str(BrowserType.FIREFOX): + if driver_type.executable_path: + service = FirefoxService(executable_path=driver_type.executable_path) + return webdriver.Firefox(service=service) + else: + return webdriver.Firefox() + else: + for browser in [webdriver.Chrome, webdriver.Firefox]: + try: + return browser() + except Exception: + continue return None class Login: - def __init__(self, username: str, password: str, solver: Solver, session_file: str): - self.username, self.password, self.solver, self.session_file = ( - username, - 
password, - solver, - session_file, - ) + def __init__( + self, + username: str, + password: str, + solver: Solver, + session_file: str, + driver_type: DriverType = None, + ): + ( + self.username, + self.password, + self.solver, + self.session_file, + self.driver_type, + ) = (username, password, solver, session_file, driver_type) def solve_captcha(self, session, data, payload): url = data["challenge_url"] @@ -204,7 +229,7 @@ def login_requests(self): def login_browser(self): """Backup login method""" - driver = get_webdriver() + driver = get_webdriver(self.driver_type) if driver is None: logger.debug("No browser found for selenium")