Skip to content

Commit

Permalink
Refactor: Don’t compute checksums in scraper
Browse files Browse the repository at this point in the history
I initially added some code to `BaseScraper` that automatically computes the response checksum and exposes it as a property. This is only ever used by pipelines, so in order to remove some indirection, I’m moving it to a pipelines helper.
  • Loading branch information
tillprochaska committed Dec 8, 2024
1 parent 2a969eb commit 0559d73
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 13 deletions.
6 changes: 6 additions & 0 deletions backend/howtheyvote/pipelines/common.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
import hashlib
from typing import Any

from requests import Response
from structlog import get_logger

from ..models import PipelineStatus
Expand Down Expand Up @@ -59,3 +61,7 @@ def run(self) -> PipelineResult:
@abstractmethod
def _run(self) -> None:
raise NotImplementedError


def compute_response_checksum(response: Response) -> str:
    """Return the SHA-256 hex digest of the raw response body.

    Used by pipelines to detect whether a scraped data source has
    changed since the previous run.
    """
    digest = hashlib.sha256()
    digest.update(response.content)
    return digest.hexdigest()
6 changes: 3 additions & 3 deletions backend/howtheyvote/pipelines/rcv_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
)
from ..sharepics import generate_vote_sharepic
from ..store import Aggregator, BulkWriter, index_records, map_vote, map_vote_group
from .common import BasePipeline, DataUnavailableError, DataUnchangedError
from .common import BasePipeline, DataUnavailableError, DataUnchangedError, compute_response_checksum

log = get_logger(__name__)

Expand Down Expand Up @@ -98,13 +98,13 @@ def _scrape_rcv_list(self) -> None:

if (
self.last_run_checksum is not None
and self.last_run_checksum == scraper.response_checksum
and self.last_run_checksum == compute_response_checksum(scraper.response)
):
raise DataUnchangedError(
"The data source hasn't changed since the last pipeline run."
)

self.checksum = scraper.response_checksum
self.checksum = compute_response_checksum(scraper.response)

writer = BulkWriter()
writer.add(fragments)
Expand Down
15 changes: 5 additions & 10 deletions backend/howtheyvote/scrapers/common.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import hashlib
import html
import random
import time
Expand Down Expand Up @@ -95,18 +94,17 @@ def get_url(

class BaseScraper(ABC, Generic[ResourceType]):
REQUEST_MAX_RETRIES: int = 0
response_checksum: str | None
response: Response | None

def __init__(self, request_cache: RequestCache | None = None, **kwargs: Any) -> None:
self._request_cache = request_cache
self._log = log.bind(scraper=type(self).__name__, **kwargs)
self.response_checksum = None
self.response = None

def run(self) -> Any:
self._log.info("Running scraper")
self._response = self._fetch()
self.response_checksum = self._compute_checksum(self._response)
doc = self._parse(self._response)
self.response = self._fetch()
doc = self._parse(self.response)
return self._extract_data(doc)

@abstractmethod
Expand All @@ -132,7 +130,7 @@ def _fragment(
model=model.__name__,
source_id=source_id,
source_name=type(self).__name__,
source_url=self._response.request.url,
source_url=self.response.request.url,
group_key=group_key,
data=data,
)
Expand Down Expand Up @@ -168,9 +166,6 @@ def _headers(self) -> dict[str, str]:
"user-agent": random.choice(USER_AGENTS),
}

def _compute_checksum(self, response: Response) -> str:
return hashlib.sha256(response.content).hexdigest()


class BeautifulSoupScraper(BaseScraper[BeautifulSoup]):
BS_PARSER: str = "lxml"
Expand Down

0 comments on commit 0559d73

Please sign in to comment.