From 65ac58fe89761091c405a72efadbbe13e94218a6 Mon Sep 17 00:00:00 2001
From: Cullen
Date: Tue, 4 Jun 2024 19:32:19 -0500
Subject: [PATCH 1/3] enh: potential email

---
 poetry.lock                   | 37 ++++++++++++++++++++++++++++++++++-
 pyproject.toml                |  1 +
 staffspy/linkedin/__init__.py |  9 +++++++--
 staffspy/models.py            |  4 +++-
 staffspy/utils.py             | 13 ++++++++++++
 5 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index de2ace5..ae93ef0 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2202,6 +2202,20 @@ urllib3 = ">=1.21.1,<3"
 socks = ["PySocks (>=1.5.6,!=1.5.7)"]
 use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
 
+[[package]]
+name = "requests-file"
+version = "2.1.0"
+description = "File transport adapter for Requests"
+optional = false
+python-versions = "*"
+files = [
+    {file = "requests_file-2.1.0-py2.py3-none-any.whl", hash = "sha256:cf270de5a4c5874e84599fc5778303d496c10ae5e870bfa378818f35d21bda5c"},
+    {file = "requests_file-2.1.0.tar.gz", hash = "sha256:0f549a3f3b0699415ac04d167e9cb39bccfb730cb832b4d20be3d9867356e658"},
+]
+
+[package.dependencies]
+requests = ">=1.0.0"
+
 [[package]]
 name = "rfc3339-validator"
 version = "0.1.4"
@@ -2471,6 +2485,27 @@ webencodings = ">=0.4"
 doc = ["sphinx", "sphinx_rtd_theme"]
 test = ["pytest", "ruff"]
 
+[[package]]
+name = "tldextract"
+version = "5.1.2"
+description = "Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well."
+optional = false
+python-versions = ">=3.8"
+files = [
+    {file = "tldextract-5.1.2-py3-none-any.whl", hash = "sha256:4dfc4c277b6b97fa053899fcdb892d2dc27295851ab5fac4e07797b6a21b2e46"},
+    {file = "tldextract-5.1.2.tar.gz", hash = "sha256:c9e17f756f05afb5abac04fe8f766e7e70f9fe387adb1859f0f52408ee060200"},
+]
+
+[package.dependencies]
+filelock = ">=3.0.8"
+idna = "*"
+requests = ">=2.1.0"
+requests-file = ">=1.4"
+
+[package.extras]
+release = ["build", "twine"]
+testing = ["black", "mypy", "pytest", "pytest-gitignore", "pytest-mock", "responses", "ruff", "syrupy", "tox", "types-filelock", "types-requests"]
+
 [[package]]
 name = "tomli"
 version = "2.0.1"
@@ -2721,4 +2756,4 @@ h11 = ">=0.9.0,<1"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "18f4b026a10b97aac4b4ecc6b70f3df7c75fb78aed5f15ac2178b31bb30dde0d"
+content-hash = "f5ea2868d347488b9e1f8f1d39bd0232a72b95048b0808255a3973dc271e072c"

diff --git a/pyproject.toml b/pyproject.toml
index d5a58b5..cefb4e8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,6 +11,7 @@ selenium = "^4.21.0"
 pydantic = "^2.7.2"
 pandas = "^2.2.2"
 requests = "^2.32.3"
+tldextract = "^5.1.2"
 
 
 [tool.poetry.group.dev.dependencies]

diff --git a/staffspy/linkedin/__init__.py b/staffspy/linkedin/__init__.py
index 4d03479..6c86cd0 100644
--- a/staffspy/linkedin/__init__.py
+++ b/staffspy/linkedin/__init__.py
@@ -22,8 +22,8 @@ class LinkedInScraper:
     def __init__(self, session_file):
         self.session = utils.load_session(session_file)
         self.company_id = self.staff_count = self.num_staff = self.company_name = (
-            self.max_results
-        ) = self.search_term = None
+            self.domain
+        ) = self.max_results = self.search_term = None
 
     def get_company_id(self, company_name):
         res = self.session.get(f"{self.company_id_ep}{company_name}")
@@ -40,6 +40,7 @@ def get_company_id(self, company_name):
             logger.debug(res.text[:200])
             sys.exit()
         company = response_json["elements"][0]
+        self.domain = utils.extract_base_domain(company["companyPageUrl"])
         staff_count = company["staffCount"]
         company_id = company["trackingInfo"]["objectUrn"].split(":")[-1]
         logger.info(f"Found company {company_name} with {staff_count} staff")
@@ -95,6 +96,10 @@ def parse_emp(self, emp, emp_dict):
         emp.profile_photo = profile_photo
         emp.first_name = emp_dict["firstName"]
         emp.last_name = emp_dict["lastName"]
+        emp.potential_email = utils.create_email(
+            emp.first_name, emp.last_name, self.domain
+        )
+
         emp.followers = emp_dict.get("followingState", {}).get("followerCount")
         emp.connections = emp_dict["connections"]["paging"]["total"]
         emp.location = emp_dict["geoLocation"]["geo"]["defaultLocalizedName"]

diff --git a/staffspy/models.py b/staffspy/models.py
index a77bc12..5cc98bd 100644
--- a/staffspy/models.py
+++ b/staffspy/models.py
@@ -23,7 +23,7 @@ class Skill(BaseModel):
     def to_dict(self):
         return {
             "name": self.name,
-            "endorsements": self.endorsements,
+            "endorsements": self.endorsements if self.endorsements else 0,
         }
 
 
@@ -75,6 +75,7 @@ class Staff(BaseModel):
     profile_link: str | None = None
     first_name: str | None = None
     last_name: str | None = None
+    potential_email: str | None = None
     followers: int | None = None
     connections: int | None = None
     location: str | None = None
@@ -98,6 +99,7 @@ def to_dict(self):
             "profile_id": self.profile_id,
             "first_name": self.first_name,
             "last_name": self.last_name,
+            "potential_email": self.potential_email,
             "company": self.company,
             "school": self.school,
             "location": self.location,

diff --git a/staffspy/utils.py b/staffspy/utils.py
index 735a919..97109c5 100644
--- a/staffspy/utils.py
+++ b/staffspy/utils.py
@@ -5,6 +5,7 @@
 from datetime import datetime
 
 import requests
+import tldextract
 from selenium import webdriver
 from selenium.common.exceptions import WebDriverException
 
@@ -26,6 +27,18 @@ def set_csrf_token(session):
     return session
 
 
+def extract_base_domain(url: str):
+    extracted = tldextract.extract(url)
+    base_domain = "{}.{}".format(extracted.domain, extracted.suffix)
+    return base_domain
+
+
+def create_email(first, last, domain):
+    first = "".join(filter(str.isalpha, first))
+    last = "".join(filter(str.isalpha, last))
+    return f"{first.lower()}.{last.lower()}@{domain}"
+
+
 def get_webdriver():
     for browser in [webdriver.Firefox, webdriver.Chrome]:
         try:
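Taken together, the two new helpers in staffspy/utils.py turn a company page URL plus an employee's name into a guessed address. A minimal usage sketch (the URL and name here are illustrative, not from the codebase; note that tldextract fetches its public-suffix snapshot on first use):

    from staffspy import utils

    domain = utils.extract_base_domain("https://www.acme-widgets.co.uk/about")
    # keeps registered domain + public suffix -> "acme-widgets.co.uk"

    email = utils.create_email("Mary-Jane", "O'Brien", domain)
    # non-alphabetic characters are stripped, then lowercased:
    # "maryjane.obrien@acme-widgets.co.uk"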
From e4f3163c5d760eda38a5ca38961ed360a5a10b69 Mon Sep 17 00:00:00 2001
From: Cullen
Date: Tue, 4 Jun 2024 19:33:21 -0500
Subject: [PATCH 2/3] enh: potential email

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index cefb4e8..37900a0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "staffspy"
-version = "0.1.4"
+version = "0.1.5"
 description = "Staff scraper library for LinkedIn"
 authors = ["Cullen Watson "]
 readme = "README.md"

From 835fd82d01aa11e16e5a24c1a6a974b77b11148b Mon Sep 17 00:00:00 2001
From: Cullen
Date: Sun, 9 Jun 2024 14:04:46 -0500
Subject: [PATCH 3/3] enh: location filter

---
 pyproject.toml                |  2 +-
 staffspy/__init__.py          |  4 ++
 staffspy/exceptions.py        |  8 ++++
 staffspy/linkedin/__init__.py | 73 +++++++++++++++++++++++++++++------
 4 files changed, 74 insertions(+), 13 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 37900a0..6cd5d9d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "staffspy"
-version = "0.1.5"
+version = "0.1.6"
 description = "Staff scraper library for LinkedIn"
 authors = ["Cullen Watson "]
 readme = "README.md"

diff --git a/staffspy/__init__.py b/staffspy/__init__.py
index a3ba867..a5ff3e0 100644
--- a/staffspy/__init__.py
+++ b/staffspy/__init__.py
@@ -10,6 +10,7 @@ def scrape_staff(
     company_name: str,
     session_file: str = None,
     search_term: str = None,
+    location: str = None,
     extra_profile_data: bool = False,
     max_results: int = 1000,
     log_level: int = 0,
@@ -21,11 +22,14 @@ def scrape_staff(
         company_name=company_name,
         extra_profile_data=extra_profile_data,
         search_term=search_term,
+        location=location,
         max_results=max_results,
     )
     staff_dicts = [staff.to_dict() for staff in staff]
     staff_df = pd.DataFrame(staff_dicts)
 
+    if staff_df.empty:
+        return staff_df
     linkedin_member_df = staff_df[staff_df["name"] == "LinkedIn Member"]
     non_linkedin_member_df = staff_df[staff_df["name"] != "LinkedIn Member"]
     staff_df = pd.concat([non_linkedin_member_df, linkedin_member_df])
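With location threaded through the public entry point, a call might look like the sketch below (company, search term, and location are illustrative; a valid session file is assumed, as in the rest of the library):

    from staffspy import scrape_staff

    staff_df = scrape_staff(
        company_name="openai",      # illustrative
        search_term="engineer",
        location="San Francisco",   # resolved to a LinkedIn geo urn internally
        max_results=50,
    )
    if staff_df.empty:              # the new early return when nothing matched
        print("no results")
    else:
        print(staff_df[["name", "potential_email"]].head())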
diff --git a/staffspy/exceptions.py b/staffspy/exceptions.py
index 0b8eb89..aa0f449 100644
--- a/staffspy/exceptions.py
+++ b/staffspy/exceptions.py
@@ -3,3 +3,11 @@
 
 class TooManyRequests(RequestException):
     """Too many requests."""
+
+
+class BadCookies(RequestException):
+    """Login expiration."""
+
+
+class GeoUrnNotFound(RequestException):
+    """Could not find geo urn for given location."""

diff --git a/staffspy/linkedin/__init__.py b/staffspy/linkedin/__init__.py
index 6c86cd0..2655492 100644
--- a/staffspy/linkedin/__init__.py
+++ b/staffspy/linkedin/__init__.py
@@ -6,24 +6,25 @@
 import staffspy.utils as utils
 from staffspy.utils import logger
-from staffspy.exceptions import TooManyRequests
+from staffspy.exceptions import TooManyRequests, BadCookies, GeoUrnNotFound
 from staffspy.models import Staff, Experience, Certification, Skill, School
 
 
 class LinkedInScraper:
     company_id_ep = "https://www.linkedin.com/voyager/api/organization/companies?q=universalName&universalName="
-    employees_ep = "https://www.linkedin.com/voyager/api/graphql?variables=(start:{offset},query:(flagshipSearchIntent:SEARCH_SRP,{search}queryParameters:List((key:currentCompany,value:List({company_id})),(key:resultType,value:List(PEOPLE))),includeFiltersInResponse:false),count:{count})&queryId=voyagerSearchDashClusters.66adc6056cf4138949ca5dcb31bb1749"
+    employees_ep = "https://www.linkedin.com/voyager/api/graphql?variables=(start:{offset},query:(flagshipSearchIntent:SEARCH_SRP,{search}queryParameters:List((key:currentCompany,value:List({company_id})),{location}(key:resultType,value:List(PEOPLE))),includeFiltersInResponse:false),count:{count})&queryId=voyagerSearchDashClusters.66adc6056cf4138949ca5dcb31bb1749"
     employee_ep = "https://www.linkedin.com/voyager/api/voyagerIdentityDashProfiles?count=1&decorationId=com.linkedin.voyager.dash.deco.identity.profile.TopCardComplete-138&memberIdentity={employee_id}&q=memberIdentity"
     skills_ep = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerIdentityDashProfileComponents.277ba7d7b9afffb04683953cede751fb&queryName=ProfileComponentsBySectionType&variables=(tabIndex:0,sectionType:skills,profileUrn:urn%3Ali%3Afsd_profile%3A{employee_id},count:50)"
     experience_ep = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerIdentityDashProfileComponents.277ba7d7b9afffb04683953cede751fb&queryName=ProfileComponentsBySectionType&variables=(tabIndex:0,sectionType:experience,profileUrn:urn%3Ali%3Afsd_profile%3A{employee_id},count:50)"
     certifications_ep = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerIdentityDashProfileComponents.277ba7d7b9afffb04683953cede751fb&queryName=ProfileComponentsBySectionType&variables=(tabIndex:0,sectionType:certifications,profileUrn:urn%3Ali%3Afsd_profile%3A{employee_id},count:50)"
     schools_ep = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerIdentityDashProfileComponents.277ba7d7b9afffb04683953cede751fb&queryName=ProfileComponentsBySectionType&variables=(tabIndex:0,sectionType:education,profileUrn:urn%3Ali%3Afsd_profile%3A{employee_id},count:50)"
+    urn_ep = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSearchDashReusableTypeahead.57a4fa1dd92d3266ed968fdbab2d7bf5&queryName=SearchReusableTypeaheadByType&variables=(query:(showFullLastNameForConnections:false,typeaheadFilterQuery:(geoSearchTypes:List(MARKET_AREA,COUNTRY_REGION,ADMIN_DIVISION_1,CITY))),keywords:{location},type:GEO,start:0)"
 
     def __init__(self, session_file):
         self.session = utils.load_session(session_file)
         self.company_id = self.staff_count = self.num_staff = self.company_name = (
             self.domain
-        ) = self.max_results = self.search_term = None
+        ) = self.max_results = self.search_term = self.location = None
 
     def get_company_id(self, company_name):
         res = self.session.get(f"{self.company_id_ep}{company_name}")
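The only change to employees_ep is the new {location} slot inside queryParameters; fetch_staff (next hunk) fills it with a (key:geoUrn,...) clause once a geo urn has been resolved, and with an empty string otherwise. A sketch of the substitution with hypothetical ids, for illustration only:

    from staffspy.linkedin import LinkedInScraper

    ep = LinkedInScraper.employees_ep.format(
        offset=0,
        search="keywords:engineer,",                    # "" when no search term
        company_id="1441",                              # hypothetical company id
        location="(key:geoUrn,value:List(90000084)),",  # "" when no location
        count=50,
    )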
"https://www.linkedin.com/voyager/api/graphql?queryId=voyagerIdentityDashProfileComponents.277ba7d7b9afffb04683953cede751fb&queryName=ProfileComponentsBySectionType&variables=(tabIndex:0,sectionType:certifications,profileUrn:urn%3Ali%3Afsd_profile%3A{employee_id},count:50)" schools_ep = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerIdentityDashProfileComponents.277ba7d7b9afffb04683953cede751fb&queryName=ProfileComponentsBySectionType&variables=(tabIndex:0,sectionType:education,profileUrn:urn%3Ali%3Afsd_profile%3A{employee_id},count:50)" + urn_ep = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSearchDashReusableTypeahead.57a4fa1dd92d3266ed968fdbab2d7bf5&queryName=SearchReusableTypeaheadByType&variables=(query:(showFullLastNameForConnections:false,typeaheadFilterQuery:(geoSearchTypes:List(MARKET_AREA,COUNTRY_REGION,ADMIN_DIVISION_1,CITY))),keywords:{location},type:GEO,start:0)" def __init__(self, session_file): self.session = utils.load_session(session_file) self.company_id = self.staff_count = self.num_staff = self.company_name = ( self.domain - ) = self.max_results = self.search_term = None + ) = self.max_results = self.search_term = self.location = None def get_company_id(self, company_name): res = self.session.get(f"{self.company_id_ep}{company_name}") @@ -304,11 +305,17 @@ def fetch_staff(self, offset, company_id): company_id=company_id, count=min(50, self.max_results), search=f"keywords:{quote(self.search_term)}," if self.search_term else "", + location=( + f"(key:geoUrn,value:List({self.location}))," if self.location else "" + ), ) + print(self.location) res = self.session.get(ep) logger.debug(f"employees, status code - {res.status_code}") - if res.status_code == 429: - return TooManyRequests("429 Too Many Requests") + if res.status_code == 400: + raise BadCookies("Outdated login, delete the session file to log in again") + elif res.status_code == 429: + raise TooManyRequests("429 Too Many Requests") if not res.ok: return try: @@ -328,25 +335,67 @@ def fetch_staff(self, offset, company_id): ) return new_staff + def fetch_urn(self, location: str): + ep = self.urn_ep.format(location=quote(location)) + res = self.session.get(ep) + try: + res_json = res.json() + except json.decoder.JSONDecodeError: + logger.debug(res.text[:200]) + raise GeoUrnNotFound("Failed to find geo id") + + try: + elems = res_json["data"]["searchDashReusableTypeaheadByType"]["elements"] + except (KeyError, IndexError, TypeError): + logger.debug(res_json) + raise GeoUrnNotFound("Failed to find geo id") + + geo_id = None + if elems: + urn = elems[0]["trackingUrn"] + m = re.search("urn:li:geo:(.+)", urn) + if m: + geo_id = m.group(1) + if not geo_id: + raise GeoUrnNotFound("Failed to find geo id") + self.location = geo_id + def scrape_staff( self, company_name: str, search_term: str, + location: str, extra_profile_data: bool, max_results: int, ): self.search_term = search_term self.company_name = company_name self.max_results = max_results + company_id, staff_count = self.get_company_id(company_name) staff_list: list[Staff] = [] self.num_staff = min(staff_count, max_results, 1000) - for offset in range(0, self.num_staff, 50): - staff = self.fetch_staff(offset, company_id) - if not staff: - break - staff_list += staff - logger.info(f"Found {len(staff_list)} staff") + + if location: + try: + self.fetch_urn(location) + except GeoUrnNotFound as e: + logger.error(str(e)) + return staff_list[:max_results] + + try: + for offset in range(0, self.num_staff, 50): + staff = self.fetch_staff(offset, 
@@ -328,25 +335,67 @@ def fetch_staff(self, offset, company_id):
         )
         return new_staff
 
+    def fetch_urn(self, location: str):
+        ep = self.urn_ep.format(location=quote(location))
+        res = self.session.get(ep)
+        try:
+            res_json = res.json()
+        except json.decoder.JSONDecodeError:
+            logger.debug(res.text[:200])
+            raise GeoUrnNotFound("Failed to find geo id")
+
+        try:
+            elems = res_json["data"]["searchDashReusableTypeaheadByType"]["elements"]
+        except (KeyError, IndexError, TypeError):
+            logger.debug(res_json)
+            raise GeoUrnNotFound("Failed to find geo id")
+
+        geo_id = None
+        if elems:
+            urn = elems[0]["trackingUrn"]
+            m = re.search("urn:li:geo:(.+)", urn)
+            if m:
+                geo_id = m.group(1)
+        if not geo_id:
+            raise GeoUrnNotFound("Failed to find geo id")
+        self.location = geo_id
+
     def scrape_staff(
         self,
         company_name: str,
         search_term: str,
+        location: str,
         extra_profile_data: bool,
         max_results: int,
     ):
         self.search_term = search_term
         self.company_name = company_name
         self.max_results = max_results
+
         company_id, staff_count = self.get_company_id(company_name)
         staff_list: list[Staff] = []
         self.num_staff = min(staff_count, max_results, 1000)
-        for offset in range(0, self.num_staff, 50):
-            staff = self.fetch_staff(offset, company_id)
-            if not staff:
-                break
-            staff_list += staff
-            logger.info(f"Found {len(staff_list)} staff")
+
+        if location:
+            try:
+                self.fetch_urn(location)
+            except GeoUrnNotFound as e:
+                logger.error(str(e))
+                return staff_list[:max_results]
+
+        try:
+            for offset in range(0, self.num_staff, 50):
+                staff = self.fetch_staff(offset, company_id)
+                if not staff:
+                    break
+                staff_list += staff
+            logger.info(
+                f"Found {len(staff_list)} staff at {company_name}{f' in {location}' if location else ''}"
+            )
+        except (BadCookies, TooManyRequests) as e:
+            logger.error(str(e))
+            return staff_list[:max_results]
+
         reduced_staff_list = staff_list[:max_results]
 
         non_restricted = list(
@@ -380,7 +429,7 @@ def fetch_all_info_for_employee(employee: Staff, index: int):
         try:
             for i, employee in enumerate(non_restricted, start=1):
                 fetch_all_info_for_employee(employee, i)
-        except TooManyRequests as e:
+        except (BadCookies, TooManyRequests) as e:
             logger.error(str(e))
 
         return reduced_staff_list
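A note on the new failure behavior: BadCookies and TooManyRequests now abort the run but are caught inside scrape_staff, so the caller receives whatever was collected rather than a crash. A sketch of one way a caller might back off and retry (names are illustrative; a short result can of course also simply mean a small company):

    import time
    from staffspy import scrape_staff

    df = scrape_staff(company_name="openai", max_results=200)  # illustrative
    if len(df) < 200:    # possibly a partial run after a 429 or an expired session
        time.sleep(600)  # crude back-off before a single retry
        df = scrape_staff(company_name="openai", max_results=200)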