From 0559d739b1605f1636036558c68f5c0921ea10b7 Mon Sep 17 00:00:00 2001
From: Till Prochaska <1512805+tillprochaska@users.noreply.github.com>
Date: Sun, 8 Dec 2024 16:09:27 +0100
Subject: [PATCH] =?UTF-8?q?Refactor:=20Don=E2=80=99t=20compute=20checksums?=
 =?UTF-8?q?=20in=20scraper?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I initially added some code to `BaseScraper` that automatically computes
the response checksum and exposes it as a property. This is only ever
used by pipelines, so in order to remove some indirection, I’m moving it
to a pipelines helper.
---
 backend/howtheyvote/pipelines/common.py   |  6 ++++++
 backend/howtheyvote/pipelines/rcv_list.py |  6 +++---
 backend/howtheyvote/scrapers/common.py    | 15 +++++----------
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/backend/howtheyvote/pipelines/common.py b/backend/howtheyvote/pipelines/common.py
index 774eb1b0..e4022999 100644
--- a/backend/howtheyvote/pipelines/common.py
+++ b/backend/howtheyvote/pipelines/common.py
@@ -1,7 +1,9 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
+import hashlib
 from typing import Any

+from requests import Response
 from structlog import get_logger

 from ..models import PipelineStatus
@@ -59,3 +61,7 @@ def run(self) -> PipelineResult:
     @abstractmethod
     def _run(self) -> None:
         raise NotImplementedError
+
+
+def compute_response_checksum(response: Response) -> str:
+    return hashlib.sha256(response.content).hexdigest()
diff --git a/backend/howtheyvote/pipelines/rcv_list.py b/backend/howtheyvote/pipelines/rcv_list.py
index f2e942f6..004854fa 100644
--- a/backend/howtheyvote/pipelines/rcv_list.py
+++ b/backend/howtheyvote/pipelines/rcv_list.py
@@ -27,7 +27,7 @@
 )
 from ..sharepics import generate_vote_sharepic
 from ..store import Aggregator, BulkWriter, index_records, map_vote, map_vote_group
-from .common import BasePipeline, DataUnavailableError, DataUnchangedError
+from .common import BasePipeline, DataUnavailableError, DataUnchangedError, compute_response_checksum

 log = get_logger(__name__)

@@ -98,13 +98,13 @@ def _scrape_rcv_list(self) -> None:

         if (
             self.last_run_checksum is not None
-            and self.last_run_checksum == scraper.response_checksum
+            and self.last_run_checksum == compute_response_checksum(scraper.response)
         ):
             raise DataUnchangedError(
                 "The data source hasn't changed since the last pipeline run."
             )

-        self.checksum = scraper.response_checksum
+        self.checksum = compute_response_checksum(scraper.response)

         writer = BulkWriter()
         writer.add(fragments)
diff --git a/backend/howtheyvote/scrapers/common.py b/backend/howtheyvote/scrapers/common.py
index 82b70779..02f11a0c 100644
--- a/backend/howtheyvote/scrapers/common.py
+++ b/backend/howtheyvote/scrapers/common.py
@@ -1,4 +1,3 @@
-import hashlib
 import html
 import random
 import time
@@ -95,18 +94,17 @@ def get_url(
 class BaseScraper(ABC, Generic[ResourceType]):
     REQUEST_MAX_RETRIES: int = 0

-    response_checksum: str | None
+    response: Response | None

     def __init__(self, request_cache: RequestCache | None = None, **kwargs: Any) -> None:
         self._request_cache = request_cache
         self._log = log.bind(scraper=type(self).__name__, **kwargs)
-        self.response_checksum = None
+        self.response = None

     def run(self) -> Any:
         self._log.info("Running scraper")
-        self._response = self._fetch()
-        self.response_checksum = self._compute_checksum(self._response)
-        doc = self._parse(self._response)
+        self.response = self._fetch()
+        doc = self._parse(self.response)
         return self._extract_data(doc)

     @abstractmethod
@@ -132,7 +130,7 @@ def _fragment(
             model=model.__name__,
             source_id=source_id,
             source_name=type(self).__name__,
-            source_url=self._response.request.url,
+            source_url=self.response.request.url,
             group_key=group_key,
             data=data,
         )
@@ -168,9 +166,6 @@ def _headers(self) -> dict[str, str]:
             "user-agent": random.choice(USER_AGENTS),
         }

-    def _compute_checksum(self, response: Response) -> str:
-        return hashlib.sha256(response.content).hexdigest()
-

 class BeautifulSoupScraper(BaseScraper[BeautifulSoup]):
     BS_PARSER: str = "lxml"
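
For illustration only (not part of the patch): a minimal sketch of how a pipeline can combine the new `compute_response_checksum` helper with the scraper's `response` attribute after this change. The `source_unchanged` function is a hypothetical name; only the helper, `BaseScraper.response`, and the module paths come from the patch.

    from howtheyvote.pipelines.common import compute_response_checksum
    from howtheyvote.scrapers.common import BaseScraper

    def source_unchanged(scraper: BaseScraper, last_run_checksum: str | None) -> bool:
        # Illustrative helper: after scraper.run() has been called, the raw
        # response is available on scraper.response, and the pipeline (not the
        # scraper) hashes it to decide whether the upstream data changed since
        # the previous run.
        if scraper.response is None or last_run_checksum is None:
            return False
        return compute_response_checksum(scraper.response) == last_run_checksum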