From 65ac58fe89761091c405a72efadbbe13e94218a6 Mon Sep 17 00:00:00 2001
From: Cullen
Date: Tue, 4 Jun 2024 19:32:19 -0500
Subject: [PATCH 1/3] enh: potential email

---
 poetry.lock                   | 37 ++++++++++++++++++++++++++++++++++-
 pyproject.toml                |  1 +
 staffspy/linkedin/__init__.py |  9 +++++++--
 staffspy/models.py            |  4 +++-
 staffspy/utils.py             | 13 ++++++++++++
 5 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index de2ace5..ae93ef0 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2202,6 +2202,20 @@ urllib3 = ">=1.21.1,<3"
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
 use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
 
+[[package]]
+name = "requests-file"
+version = "2.1.0"
+description = "File transport adapter for Requests"
+optional = false
+python-versions = "*"
+files = [
+    {file = "requests_file-2.1.0-py2.py3-none-any.whl", hash = "sha256:cf270de5a4c5874e84599fc5778303d496c10ae5e870bfa378818f35d21bda5c"},
+    {file = "requests_file-2.1.0.tar.gz", hash = "sha256:0f549a3f3b0699415ac04d167e9cb39bccfb730cb832b4d20be3d9867356e658"},
+]
+
+[package.dependencies]
+requests = ">=1.0.0"
+
 [[package]]
 name = "rfc3339-validator"
 version = "0.1.4"
@@ -2471,6 +2485,27 @@ webencodings = ">=0.4"
 doc = ["sphinx", "sphinx_rtd_theme"]
 test = ["pytest", "ruff"]
 
+[[package]]
+name = "tldextract"
+version = "5.1.2"
+description = "Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "tldextract-5.1.2-py3-none-any.whl", hash = "sha256:4dfc4c277b6b97fa053899fcdb892d2dc27295851ab5fac4e07797b6a21b2e46"},
+    {file = "tldextract-5.1.2.tar.gz", hash = "sha256:c9e17f756f05afb5abac04fe8f766e7e70f9fe387adb1859f0f52408ee060200"},
+]
+
+[package.dependencies]
+filelock = ">=3.0.8"
+idna = "*"
+requests = ">=2.1.0"
+requests-file = ">=1.4"
+
+[package.extras]
+release = ["build", "twine"]
+testing = ["black", "mypy", "pytest", "pytest-gitignore", "pytest-mock", "responses", "ruff", "syrupy", "tox", "types-filelock", "types-requests"]
+
 [[package]]
 name = "tomli"
 version = "2.0.1"
@@ -2721,4 +2756,4 @@ h11 = ">=0.9.0,<1"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "18f4b026a10b97aac4b4ecc6b70f3df7c75fb78aed5f15ac2178b31bb30dde0d"
+content-hash = "f5ea2868d347488b9e1f8f1d39bd0232a72b95048b0808255a3973dc271e072c"

diff --git a/pyproject.toml b/pyproject.toml
index d5a58b5..cefb4e8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,6 +11,7 @@ selenium = "^4.21.0"
 pydantic = "^2.7.2"
 pandas = "^2.2.2"
 requests = "^2.32.3"
+tldextract = "^5.1.2"
 
 
 [tool.poetry.group.dev.dependencies]

diff --git a/staffspy/linkedin/__init__.py b/staffspy/linkedin/__init__.py
index 4d03479..6c86cd0 100644
--- a/staffspy/linkedin/__init__.py
+++ b/staffspy/linkedin/__init__.py
@@ -22,8 +22,8 @@ class LinkedInScraper:
     def __init__(self, session_file):
         self.session = utils.load_session(session_file)
         self.company_id = self.staff_count = self.num_staff = self.company_name = (
-            self.max_results
-        ) = self.search_term = None
+            self.domain
+        ) = self.max_results = self.search_term = None
 
     def get_company_id(self, company_name):
         res = self.session.get(f"{self.company_id_ep}{company_name}")
@@ -40,6 +40,7 @@ def get_company_id(self, company_name):
             logger.debug(res.text[:200])
             sys.exit()
         company = response_json["elements"][0]
+        self.domain = utils.extract_base_domain(company["companyPageUrl"])
         staff_count = company["staffCount"]
         company_id = company["trackingInfo"]["objectUrn"].split(":")[-1]
         logger.info(f"Found company {company_name} with {staff_count} staff")
@@ -95,6 +96,10 @@ def parse_emp(self, emp, emp_dict):
         emp.profile_photo = profile_photo
         emp.first_name = emp_dict["firstName"]
         emp.last_name = emp_dict["lastName"]
+        emp.potential_email = utils.create_email(
+            emp.first_name, emp.last_name, self.domain
+        )
+
         emp.followers = emp_dict.get("followingState", {}).get("followerCount")
         emp.connections = emp_dict["connections"]["paging"]["total"]
         emp.location = emp_dict["geoLocation"]["geo"]["defaultLocalizedName"]

diff --git a/staffspy/models.py b/staffspy/models.py
index a77bc12..5cc98bd 100644
--- a/staffspy/models.py
+++ b/staffspy/models.py
@@ -23,7 +23,7 @@ class Skill(BaseModel):
     def to_dict(self):
         return {
             "name": self.name,
-            "endorsements": self.endorsements,
+            "endorsements": self.endorsements if self.endorsements else 0,
         }
 
 
@@ -75,6 +75,7 @@ class Staff(BaseModel):
     profile_link: str | None = None
     first_name: str | None = None
     last_name: str | None = None
+    potential_email: str | None = None
     followers: int | None = None
     connections: int | None = None
     location: str | None = None
@@ -98,6 +99,7 @@ def to_dict(self):
             "profile_id": self.profile_id,
             "first_name": self.first_name,
             "last_name": self.last_name,
+            "potential_email": self.potential_email,
             "company": self.company,
             "school": self.school,
             "location": self.location,

diff --git a/staffspy/utils.py b/staffspy/utils.py
index 735a919..97109c5 100644
--- a/staffspy/utils.py
+++ b/staffspy/utils.py
@@ -5,6 +5,7 @@
 from datetime import datetime
 
 import requests
+import tldextract
 from selenium import webdriver
 from selenium.common.exceptions import WebDriverException
 
@@ -26,6 +27,18 @@ def set_csrf_token(session):
     return session
 
 
+def extract_base_domain(url: str):
+    extracted = tldextract.extract(url)
+    base_domain = "{}.{}".format(extracted.domain, extracted.suffix)
+    return base_domain
+
+
+def create_email(first, last, domain):
+    first = "".join(filter(str.isalpha, first))
+    last = "".join(filter(str.isalpha, last))
+    return f"{first.lower()}.{last.lower()}@{domain}"
+
+
 def get_webdriver():
     for browser in [webdriver.Firefox, webdriver.Chrome]:
         try:
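Taken together, the two new helpers in staffspy/utils.py turn a company page URL plus an employee's name into a guessed address. A minimal usage sketch (the URL and name here are illustrative, not from the codebase; note that tldextract fetches its public-suffix snapshot on first use):

    from staffspy import utils

    domain = utils.extract_base_domain("https://www.acme-widgets.co.uk/about")
    # keeps registered domain + public suffix -> "acme-widgets.co.uk"

    email = utils.create_email("Mary-Jane", "O'Brien", domain)
    # non-alphabetic characters are stripped, then lowercased:
    # "maryjane.obrien@acme-widgets.co.uk"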
From e4f3163c5d760eda38a5ca38961ed360a5a10b69 Mon Sep 17 00:00:00 2001
From: Cullen
Date: Tue, 4 Jun 2024 19:33:21 -0500
Subject: [PATCH 2/3] enh: potential email

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index cefb4e8..37900a0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "staffspy"
-version = "0.1.4"
+version = "0.1.5"
 description = "Staff scraper library for LinkedIn"
 authors = ["Cullen Watson "]
 readme = "README.md"

From 835fd82d01aa11e16e5a24c1a6a974b77b11148b Mon Sep 17 00:00:00 2001
From: Cullen
Date: Sun, 9 Jun 2024 14:04:46 -0500
Subject: [PATCH 3/3] enh: location filter

---
 pyproject.toml                |  2 +-
 staffspy/__init__.py          |  4 ++
 staffspy/exceptions.py        |  8 ++++
 staffspy/linkedin/__init__.py | 73 +++++++++++++++++++++++++++++------
 4 files changed, 74 insertions(+), 13 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 37900a0..6cd5d9d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "staffspy"
-version = "0.1.5"
+version = "0.1.6"
 description = "Staff scraper library for LinkedIn"
 authors = ["Cullen Watson "]
 readme = "README.md"

diff --git a/staffspy/__init__.py b/staffspy/__init__.py
index a3ba867..a5ff3e0 100644
--- a/staffspy/__init__.py
+++ b/staffspy/__init__.py
@@ -10,6 +10,7 @@ def scrape_staff(
     company_name: str,
     session_file: str = None,
     search_term: str = None,
+    location: str = None,
     extra_profile_data: bool = False,
     max_results: int = 1000,
     log_level: int = 0,
@@ -21,11 +22,14 @@ def scrape_staff(
         company_name=company_name,
         extra_profile_data=extra_profile_data,
         search_term=search_term,
+        location=location,
         max_results=max_results,
     )
     staff_dicts = [staff.to_dict() for staff in staff]
     staff_df = pd.DataFrame(staff_dicts)
 
+    if staff_df.empty:
+        return staff_df
     linkedin_member_df = staff_df[staff_df["name"] == "LinkedIn Member"]
     non_linkedin_member_df = staff_df[staff_df["name"] != "LinkedIn Member"]
     staff_df = pd.concat([non_linkedin_member_df, linkedin_member_df])
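With location threaded through the public entry point, a call might look like the sketch below (company, search term, and location are illustrative; a valid session file is assumed, as in the rest of the library):

    from staffspy import scrape_staff

    staff_df = scrape_staff(
        company_name="openai",      # illustrative
        search_term="engineer",
        location="San Francisco",   # resolved to a LinkedIn geo urn internally
        max_results=50,
    )
    if staff_df.empty:              # the new early return when nothing matched
        print("no results")
    else:
        print(staff_df[["name", "potential_email"]].head())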
diff --git a/staffspy/exceptions.py b/staffspy/exceptions.py
index 0b8eb89..aa0f449 100644
--- a/staffspy/exceptions.py
+++ b/staffspy/exceptions.py
@@ -3,3 +3,11 @@
 
 class TooManyRequests(RequestException):
     """Too many requests."""
+
+
+class BadCookies(RequestException):
+    """Login expiration."""
+
+
+class GeoUrnNotFound(RequestException):
+    """Could not find geo urn for given location."""

diff --git a/staffspy/linkedin/__init__.py b/staffspy/linkedin/__init__.py
index 6c86cd0..2655492 100644
--- a/staffspy/linkedin/__init__.py
+++ b/staffspy/linkedin/__init__.py
@@ -6,24 +6,25 @@
 import staffspy.utils as utils
 from staffspy.utils import logger
-from staffspy.exceptions import TooManyRequests
+from staffspy.exceptions import TooManyRequests, BadCookies, GeoUrnNotFound
 from staffspy.models import Staff, Experience, Certification, Skill, School
 
 
 class LinkedInScraper:
     company_id_ep = "https://www.linkedin.com/voyager/api/organization/companies?q=universalName&universalName="
-    employees_ep = "https://www.linkedin.com/voyager/api/graphql?variables=(start:{offset},query:(flagshipSearchIntent:SEARCH_SRP,{search}queryParameters:List((key:currentCompany,value:List({company_id})),(key:resultType,value:List(PEOPLE))),includeFiltersInResponse:false),count:{count})&queryId=voyagerSearchDashClusters.66adc6056cf4138949ca5dcb31bb1749"
+    employees_ep = "https://www.linkedin.com/voyager/api/graphql?variables=(start:{offset},query:(flagshipSearchIntent:SEARCH_SRP,{search}queryParameters:List((key:currentCompany,value:List({company_id})),{location}(key:resultType,value:List(PEOPLE))),includeFiltersInResponse:false),count:{count})&queryId=voyagerSearchDashClusters.66adc6056cf4138949ca5dcb31bb1749"
     employee_ep = "https://www.linkedin.com/voyager/api/voyagerIdentityDashProfiles?count=1&decorationId=com.linkedin.voyager.dash.deco.identity.profile.TopCardComplete-138&memberIdentity={employee_id}&q=memberIdentity"
     skills_ep = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerIdentityDashProfileComponents.277ba7d7b9afffb04683953cede751fb&queryName=ProfileComponentsBySectionType&variables=(tabIndex:0,sectionType:skills,profileUrn:urn%3Ali%3Afsd_profile%3A{employee_id},count:50)"
     experience_ep = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerIdentityDashProfileComponents.277ba7d7b9afffb04683953cede751fb&queryName=ProfileComponentsBySectionType&variables=(tabIndex:0,sectionType:experience,profileUrn:urn%3Ali%3Afsd_profile%3A{employee_id},count:50)"
     certifications_ep = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerIdentityDashProfileComponents.277ba7d7b9afffb04683953cede751fb&queryName=ProfileComponentsBySectionType&variables=(tabIndex:0,sectionType:certifications,profileUrn:urn%3Ali%3Afsd_profile%3A{employee_id},count:50)"
     schools_ep = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerIdentityDashProfileComponents.277ba7d7b9afffb04683953cede751fb&queryName=ProfileComponentsBySectionType&variables=(tabIndex:0,sectionType:education,profileUrn:urn%3Ali%3Afsd_profile%3A{employee_id},count:50)"
+    urn_ep = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSearchDashReusableTypeahead.57a4fa1dd92d3266ed968fdbab2d7bf5&queryName=SearchReusableTypeaheadByType&variables=(query:(showFullLastNameForConnections:false,typeaheadFilterQuery:(geoSearchTypes:List(MARKET_AREA,COUNTRY_REGION,ADMIN_DIVISION_1,CITY))),keywords:{location},type:GEO,start:0)"
 
     def __init__(self, session_file):
         self.session = utils.load_session(session_file)
         self.company_id = self.staff_count = self.num_staff = self.company_name = (
             self.domain
-        ) = self.max_results = self.search_term = None
+        ) = self.max_results = self.search_term = self.location = None
 
     def get_company_id(self, company_name):
         res = self.session.get(f"{self.company_id_ep}{company_name}")
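The only change to employees_ep is the new {location} slot inside queryParameters; fetch_staff (next hunk) fills it with a (key:geoUrn,...) clause once a geo urn has been resolved, and with an empty string otherwise. A sketch of the substitution with hypothetical ids, for illustration only:

    from staffspy.linkedin import LinkedInScraper

    ep = LinkedInScraper.employees_ep.format(
        offset=0,
        search="keywords:engineer,",                    # "" when no search term
        company_id="1441",                              # hypothetical company id
        location="(key:geoUrn,value:List(90000084)),",  # "" when no location
        count=50,
    )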
"https://www.linkedin.com/voyager/api/graphql?queryId=voyagerIdentityDashProfileComponents.277ba7d7b9afffb04683953cede751fb&queryName=ProfileComponentsBySectionType&variables=(tabIndex:0,sectionType:certifications,profileUrn:urn%3Ali%3Afsd_profile%3A{employee_id},count:50)" schools_ep = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerIdentityDashProfileComponents.277ba7d7b9afffb04683953cede751fb&queryName=ProfileComponentsBySectionType&variables=(tabIndex:0,sectionType:education,profileUrn:urn%3Ali%3Afsd_profile%3A{employee_id},count:50)" + urn_ep = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSearchDashReusableTypeahead.57a4fa1dd92d3266ed968fdbab2d7bf5&queryName=SearchReusableTypeaheadByType&variables=(query:(showFullLastNameForConnections:false,typeaheadFilterQuery:(geoSearchTypes:List(MARKET_AREA,COUNTRY_REGION,ADMIN_DIVISION_1,CITY))),keywords:{location},type:GEO,start:0)" def __init__(self, session_file): self.session = utils.load_session(session_file) self.company_id = self.staff_count = self.num_staff = self.company_name = ( self.domain - ) = self.max_results = self.search_term = None + ) = self.max_results = self.search_term = self.location = None def get_company_id(self, company_name): res = self.session.get(f"{self.company_id_ep}{company_name}") @@ -304,11 +305,17 @@ def fetch_staff(self, offset, company_id): company_id=company_id, count=min(50, self.max_results), search=f"keywords:{quote(self.search_term)}," if self.search_term else "", + location=( + f"(key:geoUrn,value:List({self.location}))," if self.location else "" + ), ) + print(self.location) res = self.session.get(ep) logger.debug(f"employees, status code - {res.status_code}") - if res.status_code == 429: - return TooManyRequests("429 Too Many Requests") + if res.status_code == 400: + raise BadCookies("Outdated login, delete the session file to log in again") + elif res.status_code == 429: + raise TooManyRequests("429 Too Many Requests") if not res.ok: return try: @@ -328,25 +335,67 @@ def fetch_staff(self, offset, company_id): ) return new_staff + def fetch_urn(self, location: str): + ep = self.urn_ep.format(location=quote(location)) + res = self.session.get(ep) + try: + res_json = res.json() + except json.decoder.JSONDecodeError: + logger.debug(res.text[:200]) + raise GeoUrnNotFound("Failed to find geo id") + + try: + elems = res_json["data"]["searchDashReusableTypeaheadByType"]["elements"] + except (KeyError, IndexError, TypeError): + logger.debug(res_json) + raise GeoUrnNotFound("Failed to find geo id") + + geo_id = None + if elems: + urn = elems[0]["trackingUrn"] + m = re.search("urn:li:geo:(.+)", urn) + if m: + geo_id = m.group(1) + if not geo_id: + raise GeoUrnNotFound("Failed to find geo id") + self.location = geo_id + def scrape_staff( self, company_name: str, search_term: str, + location: str, extra_profile_data: bool, max_results: int, ): self.search_term = search_term self.company_name = company_name self.max_results = max_results + company_id, staff_count = self.get_company_id(company_name) staff_list: list[Staff] = [] self.num_staff = min(staff_count, max_results, 1000) - for offset in range(0, self.num_staff, 50): - staff = self.fetch_staff(offset, company_id) - if not staff: - break - staff_list += staff - logger.info(f"Found {len(staff_list)} staff") + + if location: + try: + self.fetch_urn(location) + except GeoUrnNotFound as e: + logger.error(str(e)) + return staff_list[:max_results] + + try: + for offset in range(0, self.num_staff, 50): + staff = self.fetch_staff(offset, 
@@ -328,25 +335,67 @@ def fetch_staff(self, offset, company_id):
         )
         return new_staff
 
+    def fetch_urn(self, location: str):
+        ep = self.urn_ep.format(location=quote(location))
+        res = self.session.get(ep)
+        try:
+            res_json = res.json()
+        except json.decoder.JSONDecodeError:
+            logger.debug(res.text[:200])
+            raise GeoUrnNotFound("Failed to find geo id")
+
+        try:
+            elems = res_json["data"]["searchDashReusableTypeaheadByType"]["elements"]
+        except (KeyError, IndexError, TypeError):
+            logger.debug(res_json)
+            raise GeoUrnNotFound("Failed to find geo id")
+
+        geo_id = None
+        if elems:
+            urn = elems[0]["trackingUrn"]
+            m = re.search("urn:li:geo:(.+)", urn)
+            if m:
+                geo_id = m.group(1)
+        if not geo_id:
+            raise GeoUrnNotFound("Failed to find geo id")
+        self.location = geo_id
+
     def scrape_staff(
         self,
         company_name: str,
         search_term: str,
+        location: str,
         extra_profile_data: bool,
         max_results: int,
     ):
         self.search_term = search_term
         self.company_name = company_name
         self.max_results = max_results
+
         company_id, staff_count = self.get_company_id(company_name)
         staff_list: list[Staff] = []
         self.num_staff = min(staff_count, max_results, 1000)
-        for offset in range(0, self.num_staff, 50):
-            staff = self.fetch_staff(offset, company_id)
-            if not staff:
-                break
-            staff_list += staff
-            logger.info(f"Found {len(staff_list)} staff")
+
+        if location:
+            try:
+                self.fetch_urn(location)
+            except GeoUrnNotFound as e:
+                logger.error(str(e))
+                return staff_list[:max_results]
+
+        try:
+            for offset in range(0, self.num_staff, 50):
+                staff = self.fetch_staff(offset, company_id)
+                if not staff:
+                    break
+                staff_list += staff
+            logger.info(
+                f"Found {len(staff_list)} staff at {company_name}{f' in {location}' if location else ''}"
+            )
+        except (BadCookies, TooManyRequests) as e:
+            logger.error(str(e))
+            return staff_list[:max_results]
+
         reduced_staff_list = staff_list[:max_results]
 
         non_restricted = list(
@@ -380,7 +429,7 @@ def fetch_all_info_for_employee(employee: Staff, index: int):
         try:
             for i, employee in enumerate(non_restricted, start=1):
                 fetch_all_info_for_employee(employee, i)
-        except TooManyRequests as e:
+        except (BadCookies, TooManyRequests) as e:
             logger.error(str(e))
 
         return reduced_staff_list
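A note on the new failure behavior: BadCookies and TooManyRequests now abort the run but are caught inside scrape_staff, so the caller receives whatever was collected rather than a crash. A sketch of one way a caller might back off and retry (names are illustrative; a short result can of course also simply mean a small company):

    import time
    from staffspy import scrape_staff

    df = scrape_staff(company_name="openai", max_results=200)  # illustrative
    if len(df) < 200:    # possibly a partial run after a 429 or an expired session
        time.sleep(600)  # crude back-off before a single retry
        df = scrape_staff(company_name="openai", max_results=200)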