Commit 1d7adb9: tracking
honzajavorek committed Feb 16, 2024
1 parent 2c6021c commit 1d7adb9
Showing 2 changed files with 46 additions and 16 deletions.
4 changes: 3 additions & 1 deletion juniorguru_plucker/actors.py
@@ -2,7 +2,7 @@
 from typing import Generator, Type

 import nest_asyncio
-from apify import Actor
+from apify import Actor, Configuration
 from apify.scrapy.utils import apply_apify_settings
 from scrapy import Item, Spider
 from scrapy.settings import BaseSettings, Settings
@@ -13,6 +13,8 @@


 async def run_actor(settings: Settings, spider_class: Type[Spider]) -> None:
+    config = Configuration.get_global_configuration()
+    config.purge_on_start = True
     async with Actor:
         Actor.log.info(f"Spider {spider_class.name}")
         actor_input = await Actor.get_input() or {}
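A note on the actors.py change above: Configuration.get_global_configuration() and purge_on_start come from the Apify SDK, and setting the flag before entering the Actor context should make the SDK purge the default storages (request queue, dataset, key-value store) at startup, so repeated runs start from a clean slate instead of resuming stale state. Below is a minimal sketch of a hypothetical entrypoint exercising run_actor(); the Settings() contents and the asyncio wiring are assumptions for illustration, not part of this commit:

import asyncio

from scrapy.settings import Settings

from juniorguru_plucker.actors import run_actor
from juniorguru_plucker.jobs_jobscz.spider import Spider

# run_actor() now flips purge_on_start=True on the global configuration
# before opening the Actor context, so each run begins with empty storages
asyncio.run(run_actor(Settings(), Spider))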
58 changes: 43 additions & 15 deletions juniorguru_plucker/jobs_jobscz/spider.py
@@ -1,4 +1,6 @@
+import hashlib
 import json
+import logging
 import re
 import uuid
 from datetime import date, datetime
@@ -53,7 +55,9 @@ class Spider(BaseSpider):
     def parse(self, response: HtmlResponse) -> Generator[Request, None, None]:
         card_xpath = "//article[contains(@class, 'SearchResultCard')]"
         for n, card in enumerate(response.xpath(card_xpath), start=1):
-            url = card.css('a[data-link="jd-detail"]::attr(href)').get()
+            url = cast(str, card.css('a[data-link="jd-detail"]::attr(href)').get())
+            track_id = get_track_id(url)
+
             loader = Loader(item=Job(), response=response)
             card_loader = loader.nested_xpath(f"{card_xpath}[{n}]")
             card_loader.add_value("source", self.name)
@@ -69,8 +73,12 @@ def parse(self, response: HtmlResponse) -> Generator[Request, None, None]:
             card_loader.add_value("source_urls", response.url)
             card_loader.add_value("source_urls", url)
             item = loader.load_item()
+
+            self.track_logger(track_id).debug(f"Parsing card for {url}")
             yield response.follow(
-                url, callback=self.parse_job, cb_kwargs=dict(item=item)
+                url,
+                callback=self.parse_job,
+                cb_kwargs=dict(item=item, track_id=track_id),
             )
         urls = [
             response.urljoin(relative_url)
@@ -80,25 +88,26 @@ def parse(self, response: HtmlResponse) -> Generator[Request, None, None]:
         yield from response.follow_all(urls, callback=self.parse)

     def parse_job(
-        self, response: HtmlResponse, item: Job
+        self, response: HtmlResponse, item: Job, track_id: str
     ) -> Generator[Job | Request, None, None]:
+        self.track_logger(track_id).debug("Parsing job page")
         loader = Loader(item=item, response=response)
         loader.add_value("url", response.url)
         loader.add_value("source_urls", response.url)

         if "www.jobs.cz" not in response.url:
-            yield from self.parse_job_widget_data(response, item)
+            yield from self.parse_job_widget_data(response, item, track_id)
         else:
-            # standard
+            self.track_logger(track_id).debug("Parsing as standard job page")
             for label in self.employment_types_labels:
                 loader.add_xpath(
                     "employment_types",
                     f"//span[contains(text(), {label!r})]/following-sibling::p/text()",
                 )
             loader.add_css("description_html", '[data-jobad="body"]')

-            # company
             if response.css('[class*="CompanyProfileNavigation"]').get():
+                self.track_logger(track_id).debug("Parsing as company job page")
                 loader.add_css(
                     "company_logo_urls",
                     ".CompanyProfileNavigation__logo img::attr(src)",
@@ -113,20 +122,22 @@ def parse_job(
         yield loader.load_item()

     def parse_job_widget_data(
-        self, response: HtmlResponse, item: Job
+        self, response: HtmlResponse, item: Job, track_id: str
     ) -> Generator[Request, None, None]:
         try:
-            self.logger.debug("Looking for widget data in the HTML")
+            self.track_logger(track_id).debug("Looking for widget data in the HTML")
             widget_data = json.loads(response.css("script::text").re(WIDGET_DATA_RE)[0])
         except IndexError:
-            self.logger.debug("Looking for widget data in attached JavaScript")
+            self.track_logger(track_id).debug(
+                "Looking for widget data in attached JavaScript"
+            )
             script_url = response.css(
                 'script[src*="assets/js/script.min.js"]::attr(src)'
             ).get()
             yield response.follow(
                 script_url,
                 callback=self.parse_job_widget_script,
-                cb_kwargs=dict(item=item, html_response=response),
+                cb_kwargs=dict(item=item, html_response=response, track_id=track_id),
             )
         else:
             yield from self.parse_job_widget(
@@ -135,10 +146,15 @@ def parse_job_widget_data(
                 widget_host=widget_data["host"],
                 widget_api_key=widget_data["apiKey"],
                 widget_id=widget_data["widgetId"],
+                track_id=track_id,
             )

     def parse_job_widget_script(
-        self, script_response: TextResponse, html_response: HtmlResponse, item: Job
+        self,
+        script_response: TextResponse,
+        html_response: HtmlResponse,
+        item: Job,
+        track_id: str,
     ) -> Generator[Request, None, None]:
         if match := re.search(WIDGET_DATA_SCRIPT_RE, script_response.text):
             data_text = re.sub(r"\'", r"\\'", match.group(1))
@@ -153,6 +169,7 @@ def parse_job_widget_script(
                 widget_host=data["host"],
                 widget_api_key=widget_data["apiKey"],
                 widget_id=widget_data["id"],
+                track_id=track_id,
             )
         else:
             raise NotImplementedError("Widget data not found")
@@ -164,14 +181,15 @@ def parse_job_widget(
         widget_host: str,
         widget_api_key: str,
         widget_id: str,
+        track_id: str,
     ) -> Generator[Request, None, None]:
-        params = get_params(response.url)
-
         loader = Loader(item=item, response=response)
         loader.add_value("url", response.url)
         loader.add_value("company_url", f"https://{widget_host}")
         loader.add_value("source_urls", response.url)

+        self.track_logger(track_id).debug("Requesting data from job widget API")
+        params = get_params(response.url)
         yield Request(
             "https://api.capybara.lmc.cz/api/graphql/widget",
             method="POST",
@@ -202,12 +220,13 @@ def parse_job_widget(
                 )
             ),
             callback=self.parse_job_widget_api,
-            cb_kwargs=dict(item=loader.load_item()),
+            cb_kwargs=dict(item=loader.load_item(), track_id=track_id),
         )

     def parse_job_widget_api(
-        self, response: TextResponse, item: Job
+        self, response: TextResponse, item: Job, track_id: str
     ) -> Generator[Job, None, None]:
+        self.track_logger(track_id).debug("Parsing job widget API response")
         payload = cast(dict, response.json())
         job_ad = payload["data"]["widget"]["jobAd"]

@@ -227,6 +246,15 @@ def parse_job_widget_api(

         yield loader.load_item()

+    def track_logger(self, track_id: str) -> logging.LoggerAdapter:
+        logger = logging.getLogger(f"{self.name}.{track_id}")
+        return logging.LoggerAdapter(logger, {"spider": self, "track_id": track_id})
+
+
+@lru_cache
+def get_track_id(seed: str) -> str:
+    return hashlib.sha1(seed.encode()).hexdigest()[:10]
+
+
 @lru_cache
 def load_gql(path: str | Path) -> str:
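How the tracking pieces added in spider.py fit together: get_track_id() hashes a job URL into a short, stable identifier, the identifier travels through every Scrapy callback via cb_kwargs, and track_logger() returns a per-identifier LoggerAdapter, so all log lines belonging to one job posting share a logger name and can be filtered together. A self-contained sketch of the same pattern outside Scrapy follows; the example URL and the logger namespace are illustrative assumptions, not values from the commit:

import hashlib
import logging
from functools import lru_cache

logging.basicConfig(level=logging.DEBUG)


@lru_cache
def get_track_id(seed: str) -> str:
    # SHA-1 serves as a stable fingerprint here, not a security measure;
    # 10 hex characters are plenty to tell jobs apart in logs
    return hashlib.sha1(seed.encode()).hexdigest()[:10]


url = "https://example.com/job/123"  # illustrative URL
track_id = get_track_id(url)  # same URL always yields the same id (lru_cache included)

# one logger per tracked job, namespaced under the spider name
logger = logging.getLogger(f"jobs_jobscz.{track_id}")
adapter = logging.LoggerAdapter(logger, {"track_id": track_id})
adapter.debug(f"Parsing card for {url}")
# emits something like: DEBUG:jobs_jobscz.<track_id>:Parsing card for https://example.com/job/123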
