From 1d7adb9076fe72a077f8ed6937af32d30b73a959 Mon Sep 17 00:00:00 2001
From: Honza Javorek
Date: Fri, 16 Feb 2024 15:42:07 +0100
Subject: [PATCH] add per-job track IDs for tracing scraping in logs

Derive a short, stable track ID from each job card URL and pass it
through the whole callback chain, so that log lines belonging to one
job can be correlated across requests. Also purge the default storages
on start, so that repeated runs don't mix old and new results.
---
 juniorguru_plucker/actors.py             |  4 +-
 juniorguru_plucker/jobs_jobscz/spider.py | 58 ++++++++++++++++++------
 2 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/juniorguru_plucker/actors.py b/juniorguru_plucker/actors.py
index 5d167a3..e5628ed 100644
--- a/juniorguru_plucker/actors.py
+++ b/juniorguru_plucker/actors.py
@@ -2,7 +2,7 @@
 from typing import Generator, Type
 
 import nest_asyncio
-from apify import Actor
+from apify import Actor, Configuration
 from apify.scrapy.utils import apply_apify_settings
 from scrapy import Item, Spider
 from scrapy.settings import BaseSettings, Settings
@@ -13,6 +13,8 @@
 
 
 async def run_actor(settings: Settings, spider_class: Type[Spider]) -> None:
+    config = Configuration.get_global_configuration()
+    config.purge_on_start = True
     async with Actor:
         Actor.log.info(f"Spider {spider_class.name}")
         actor_input = await Actor.get_input() or {}
diff --git a/juniorguru_plucker/jobs_jobscz/spider.py b/juniorguru_plucker/jobs_jobscz/spider.py
index 072b4df..f790fe5 100644
--- a/juniorguru_plucker/jobs_jobscz/spider.py
+++ b/juniorguru_plucker/jobs_jobscz/spider.py
@@ -1,4 +1,6 @@
+import hashlib
 import json
+import logging
 import re
 import uuid
 from datetime import date, datetime
@@ -53,7 +55,9 @@ class Spider(BaseSpider):
     def parse(self, response: HtmlResponse) -> Generator[Request, None, None]:
         card_xpath = "//article[contains(@class, 'SearchResultCard')]"
         for n, card in enumerate(response.xpath(card_xpath), start=1):
-            url = card.css('a[data-link="jd-detail"]::attr(href)').get()
+            url = cast(str, card.css('a[data-link="jd-detail"]::attr(href)').get())
+            track_id = get_track_id(url)
+
             loader = Loader(item=Job(), response=response)
             card_loader = loader.nested_xpath(f"{card_xpath}[{n}]")
             card_loader.add_value("source", self.name)
@@ -69,8 +73,12 @@ def parse(self, response: HtmlResponse) -> Generator[Request, None, None]:
             card_loader.add_value("source_urls", response.url)
             card_loader.add_value("source_urls", url)
             item = loader.load_item()
+
+            self.track_logger(track_id).debug(f"Parsing card for {url}")
             yield response.follow(
-                url, callback=self.parse_job, cb_kwargs=dict(item=item)
+                url,
+                callback=self.parse_job,
+                cb_kwargs=dict(item=item, track_id=track_id),
             )
         urls = [
             response.urljoin(relative_url)
@@ -80,16 +88,17 @@ def parse(self, response: HtmlResponse) -> Generator[Request, None, None]:
         yield from response.follow_all(urls, callback=self.parse)
 
     def parse_job(
-        self, response: HtmlResponse, item: Job
+        self, response: HtmlResponse, item: Job, track_id: str
     ) -> Generator[Job | Request, None, None]:
+        self.track_logger(track_id).debug("Parsing job page")
         loader = Loader(item=item, response=response)
         loader.add_value("url", response.url)
         loader.add_value("source_urls", response.url)
 
         if "www.jobs.cz" not in response.url:
-            yield from self.parse_job_widget_data(response, item)
+            yield from self.parse_job_widget_data(response, item, track_id)
         else:
-            # standard
+            self.track_logger(track_id).debug("Parsing as standard job page")
             for label in self.employment_types_labels:
                 loader.add_xpath(
                     "employment_types",
@@ -97,8 +106,8 @@ def parse_job(
                 )
             loader.add_css("description_html", '[data-jobad="body"]')
 
-            # company
             if response.css('[class*="CompanyProfileNavigation"]').get():
+                self.track_logger(track_id).debug("Parsing as company job page")
                 loader.add_css(
                     "company_logo_urls",
                     ".CompanyProfileNavigation__logo img::attr(src)",
@@ -113,20 +122,22 @@ def parse_job(
         yield loader.load_item()
 
     def parse_job_widget_data(
-        self, response: HtmlResponse, item: Job
+        self, response: HtmlResponse, item: Job, track_id: str
     ) -> Generator[Request, None, None]:
         try:
-            self.logger.debug("Looking for widget data in the HTML")
+            self.track_logger(track_id).debug("Looking for widget data in the HTML")
             widget_data = json.loads(response.css("script::text").re(WIDGET_DATA_RE)[0])
         except IndexError:
-            self.logger.debug("Looking for widget data in attached JavaScript")
+            self.track_logger(track_id).debug(
+                "Looking for widget data in attached JavaScript"
+            )
             script_url = response.css(
                 'script[src*="assets/js/script.min.js"]::attr(src)'
             ).get()
             yield response.follow(
                 script_url,
                 callback=self.parse_job_widget_script,
-                cb_kwargs=dict(item=item, html_response=response),
+                cb_kwargs=dict(item=item, html_response=response, track_id=track_id),
             )
         else:
             yield from self.parse_job_widget(
@@ -135,10 +146,15 @@ def parse_job_widget_data(
                 widget_host=widget_data["host"],
                 widget_api_key=widget_data["apiKey"],
                 widget_id=widget_data["widgetId"],
+                track_id=track_id,
             )
 
     def parse_job_widget_script(
-        self, script_response: TextResponse, html_response: HtmlResponse, item: Job
+        self,
+        script_response: TextResponse,
+        html_response: HtmlResponse,
+        item: Job,
+        track_id: str,
     ) -> Generator[Request, None, None]:
         if match := re.search(WIDGET_DATA_SCRIPT_RE, script_response.text):
             data_text = re.sub(r"\'", r"\\'", match.group(1))
@@ -153,6 +169,7 @@ def parse_job_widget_script(
                 widget_host=data["host"],
                 widget_api_key=widget_data["apiKey"],
                 widget_id=widget_data["id"],
+                track_id=track_id,
             )
         else:
             raise NotImplementedError("Widget data not found")
@@ -164,14 +181,15 @@ def parse_job_widget(
         self,
         response: HtmlResponse,
         item: Job,
         widget_host: str,
         widget_api_key: str,
         widget_id: str,
+        track_id: str,
     ) -> Generator[Request, None, None]:
-        params = get_params(response.url)
-
         loader = Loader(item=item, response=response)
         loader.add_value("url", response.url)
         loader.add_value("company_url", f"https://{widget_host}")
         loader.add_value("source_urls", response.url)
 
+        self.track_logger(track_id).debug("Requesting data from job widget API")
+        params = get_params(response.url)
         yield Request(
             "https://api.capybara.lmc.cz/api/graphql/widget",
             method="POST",
@@ -202,12 +220,13 @@ def parse_job_widget(
                 )
             ),
             callback=self.parse_job_widget_api,
-            cb_kwargs=dict(item=loader.load_item()),
+            cb_kwargs=dict(item=loader.load_item(), track_id=track_id),
         )
 
     def parse_job_widget_api(
-        self, response: TextResponse, item: Job
+        self, response: TextResponse, item: Job, track_id: str
     ) -> Generator[Job, None, None]:
+        self.track_logger(track_id).debug("Parsing job widget API response")
         payload = cast(dict, response.json())
         job_ad = payload["data"]["widget"]["jobAd"]
@@ -227,6 +246,15 @@ def parse_job_widget_api(
 
         yield loader.load_item()
 
+    def track_logger(self, track_id: str) -> logging.LoggerAdapter:
+        logger = logging.getLogger(f"{self.name}.{track_id}")
+        return logging.LoggerAdapter(logger, {"spider": self, "track_id": track_id})
+
+
+@lru_cache
+def get_track_id(seed: str) -> str:
+    return hashlib.sha1(seed.encode()).hexdigest()[:10]
+
 
 @lru_cache
 def load_gql(path: str | Path) -> str: