From 3c05563181d58f3c32a81f28d4c0e31e57c29090 Mon Sep 17 00:00:00 2001 From: Till Prochaska <1512805+tillprochaska@users.noreply.github.com> Date: Sat, 7 Dec 2024 17:59:14 +0100 Subject: [PATCH] Allow passing a checksum of a previous run to `RCVListPipeline` and exit early if data has not changed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows us to repeatedly run the pipeline to check if the data has been updated, while stopping the pipeline as soon as possible if it remains unchanged. In the case of this pipeline, that means we send one request for the RCV list in XML format, but we do not send requests to fetch pages from EUR-Lex, OEIL, etc. for each of the votes if the RCV list hasn’t changed. --- backend/howtheyvote/pipelines/common.py | 4 + backend/howtheyvote/pipelines/rcv_list.py | 24 ++++- backend/howtheyvote/scrapers/common.py | 7 ++ ...cv-list_pv-9-2024-04-24-rcv-fr-evening.xml | 92 +++++++++++++++++++ .../rcv-list_pv-9-2024-04-24-rcv-fr-noon.xml | 83 +++++++++++++++++ backend/tests/pipelines/test_rcv_list.py | 66 ++++++++++++- 6 files changed, 272 insertions(+), 4 deletions(-) create mode 100644 backend/tests/pipelines/data/rcv-list_pv-9-2024-04-24-rcv-fr-evening.xml create mode 100644 backend/tests/pipelines/data/rcv-list_pv-9-2024-04-24-rcv-fr-noon.xml diff --git a/backend/howtheyvote/pipelines/common.py b/backend/howtheyvote/pipelines/common.py index 26cf91486..ae5505b1e 100644 --- a/backend/howtheyvote/pipelines/common.py +++ b/backend/howtheyvote/pipelines/common.py @@ -4,3 +4,7 @@ class PipelineError(Exception): class DataUnavailableError(PipelineError): pass + + +class DataUnchangedError(PipelineError): + pass diff --git a/backend/howtheyvote/pipelines/rcv_list.py b/backend/howtheyvote/pipelines/rcv_list.py index 9c11ffcff..af7c0f4af 100644 --- a/backend/howtheyvote/pipelines/rcv_list.py +++ b/backend/howtheyvote/pipelines/rcv_list.py @@ -27,7 +27,7 @@ ) from ..sharepics import 
generate_vote_sharepic from ..store import Aggregator, BulkWriter, index_records, map_vote, map_vote_group -from .common import DataUnavailableError, PipelineError +from .common import DataUnavailableError, DataUnchangedError, PipelineError log = get_logger(__name__) @@ -37,9 +37,16 @@ class RCVListPipeline: extracted votes and scrapes additional information such as data about legislative procedures.""" - def __init__(self, term: int, date: datetime.date): + def __init__( + self, + term: int, + date: datetime.date, + last_run_checksum: str | None = None, + ): self.term = term self.date = date + self.last_run_checksum = last_run_checksum + self.checksum: str | None = None self._vote_ids: set[str] = set() self._vote_group_ids: set[str] = set() self._request_cache: RequestCache = LRUCache(maxsize=25) @@ -106,9 +113,20 @@ def _scrape_rcv_list(self) -> None: date=self.date, active_members=active_members, ) + fragments = scraper.run() + + if ( + self.last_run_checksum is not None + and self.last_run_checksum == scraper.response_checksum + ): + raise DataUnchangedError( + "The data source hasn't changed since the last pipeline run." 
+ ) + + self.checksum = scraper.response_checksum writer = BulkWriter() - writer.add(scraper.run()) + writer.add(fragments) writer.flush() self._vote_ids = writer.get_touched() diff --git a/backend/howtheyvote/scrapers/common.py b/backend/howtheyvote/scrapers/common.py index 3b9e9db92..82b707792 100644 --- a/backend/howtheyvote/scrapers/common.py +++ b/backend/howtheyvote/scrapers/common.py @@ -1,3 +1,4 @@ +import hashlib import html import random import time @@ -94,14 +95,17 @@ def get_url( class BaseScraper(ABC, Generic[ResourceType]): REQUEST_MAX_RETRIES: int = 0 + response_checksum: str | None def __init__(self, request_cache: RequestCache | None = None, **kwargs: Any) -> None: self._request_cache = request_cache self._log = log.bind(scraper=type(self).__name__, **kwargs) + self.response_checksum = None def run(self) -> Any: self._log.info("Running scraper") self._response = self._fetch() + self.response_checksum = self._compute_checksum(self._response) doc = self._parse(self._response) return self._extract_data(doc) @@ -164,6 +168,9 @@ def _headers(self) -> dict[str, str]: "user-agent": random.choice(USER_AGENTS), } + def _compute_checksum(self, response: Response) -> str: + return hashlib.sha256(response.content).hexdigest() + class BeautifulSoupScraper(BaseScraper[BeautifulSoup]): BS_PARSER: str = "lxml" diff --git a/backend/tests/pipelines/data/rcv-list_pv-9-2024-04-24-rcv-fr-evening.xml b/backend/tests/pipelines/data/rcv-list_pv-9-2024-04-24-rcv-fr-evening.xml new file mode 100644 index 000000000..86b6e13d6 --- /dev/null +++ b/backend/tests/pipelines/data/rcv-list_pv-9-2024-04-24-rcv-fr-evening.xml @@ -0,0 +1,92 @@ + + + + + AVERTISSEMENT + NOTICE + HINWEIS + + + Les corrections et intentions de vote sont mentionnées dans ce document sous les points de vote correspondants. Elles sont publiées pour information uniquement et ne modifient en rien le résultat de vote tel qu’annoncé en plénière. 
Pendant la session, les demandes de corrections et intentions de vote reçues avant 18h30 sont publiées le jour même. Les demandes ultérieures sont publiées à mesure des mises à jour successives de ce document, pendant une durée maximale de deux semaines. Signification des sigles: + (pour), - (contre), 0 (abstention) + + Corrections to votes and voting intentions appear below in the section relating to the vote concerned. They are published for information purposes only and do not alter the result of the vote as announced in plenary. During the part-session, requests for corrections to votes and voting intentions received before 18.30 will be published the same day. Subsequent requests will be included in this document each time it is updated in the two weeks following the part-session. Key to symbols: + (in favour), - (against), 0 (abstentions) + + In diesem Dokument sind unter den betreffenden Abstimmungspunkten die Berichtigungen des Stimmverhaltens und das beabsichtigte Stimmverhalten aufgeführt. Diese Angaben dienen ausschließlich der Information; keinesfalls wird durch sie das Abstimmungsergebnis geändert, das im Plenum bekannt gegeben wurde. Während der Tagung werden Anträge zu Berichtigungen des Stimmverhaltens und zum beabsichtigten Stimmverhalten, die bis 18.30 Uhr eingehen, am selben Tag veröffentlicht. Später eingehende Anträge werden sukzessive veröffentlicht, indem dieses Dokument während höchstens zwei Wochen regelmäßig aktualisiert wird. 
Zeichenerklärung: + (dafür), - (dagegen), 0 (Enthaltung) + + + + + ПРОТОКОЛРезултат от поименни гласувания - Приложение 2 + ZÁPISVýsledek jmenovitého hlasování - Příloha 2 + PROTOKOLResultat af afstemningerne ved navneopråb - Bilag 2 + PROTOKOLLErgebnis der namentlichen Abstimmungen - Anlage 2 + ΣΥΝΟΠΤIΚΑ ΠΡΑΚΤIΚΑΑποτέλεσμα των ψηφοφοριών με ονομαστική κλήση - Παράρτηµα 2 + MINUTESResult of roll-call votes - Annex 2 + ACTAResultados de las votaciones nominales - Anexo 2 + PROTOKOLLNimelise hääletuse tulemused - lisa 2 + PÖYTÄKIRJANimenhuutoäänestysten tulokset - Liite 2 + PROCÈS-VERBALRésultat des votes par appel nominal - Annexe 2 + MIONTUAIRISCÍTorthaí na vótála le glaoch rolla - Iarscríbhinn 2 + ZAPISNIKRezultat poimeničnog glasovanja - Prilog 2 + JEGYZŐKÖNYVA név szerinti szavazások eredménye - melléklet 2 + PROCESSO VERBALERisultato delle votazioni per appello nominale - Allegato 2 + PROTOKOLASVardinio balsavimo rezultatai - priedas 2 + PROTOKOLSRezultāti balsošanai pēc saraksta - pielikums 2 + MINUTIRiżultat tal-votazzjoni bis-sejħa tal-ismijiet - Anness 2 + NOTULENUitslag van de hoofdelijke stemmingen - Bijlage 2 + PROTOKÓŁWyniki głosowań imiennych - Załącznik 2 + ATAResultados das votações nominais - Anexo 2 + PROCES-VERBALRezultatul voturilor prin apel nominal - Anexa 2 + ZÁPISNICAVýsledok hlasovania podľa mien - Príloha 2 + ZAPISNIKIzid poimenskega glasovanja - Priloga 2 + PROTOKOLLResultat av omröstningarna med namnupprop - Bilaga 2 + + + A9-0163/2024 - Gabriele Bischoff - Article 10, § 6, alinéa 2 - Am 1 + + + Adamowicz + + + + + C9-0120/2024 - Rejet - Am 13= 23= + + + Adamowicz + + + + + Amendments to Parliament’s Rules of Procedure concerning the training on preventing conflict and harassment in the workplace and on good office management + Good agricultural and environmental condition standards, schemes for climate, environment and animal welfare + + + + BERICHTIGUNGEN DES STIMMVERHALTENS UND BEABSICHTIGTES STIMMVERHALTEN + 
RÄTTELSER/AVSIKTSFÖRKLARINGAR TILL AVGIVNA RÖSTER + ÄÄNESTYSKÄYTTÄYTYMISTÄ JA ÄÄNESTYSAIKEITA KOSKEVAT ILMOITUKSET + CORREÇÕES E INTENÇÕES DE VOTO + ПОПРАВКИ В ПОДАДЕНИТЕ ГЛАСОВЕ И НАМЕРЕНИЯ ЗА ГЛАСУВАНЕ + KORREZZJONIJIET U INTENZJONIJIET GĦALL-VOT + ΔΙΟΡΘΩΣΕΙΣ ΚΑΙ ΠΡΟΘΕΣΕΙΣ ΨΗΦΟΥ + BALSAVIMO PATAISYMAI IR KETINIMAI + CORRECTIONS TO VOTES AND VOTING INTENTIONS + BALSOJUMU LABOJUMI UN NODOMI BALSOT + IZMJENE DANIH GLASOVA I NAMJERE GLASAČA + CORREZIONI E INTENZIONI DI VOTO + CORRECTIONS ET INTENTIONS DE VOTE + SZAVAZATOK HELYESBÍTÉSEI ÉS SZAVAZÁSI SZÁNDÉKOK + CORRECCIONES E INTENCIONES DE VOTO + HÄÄLETUSE PARANDUSED JA HÄÄLETUSKAVATSUSED + OPRAVY HLASOVÁNÍ A SDĚLENÍ O ÚMYSLU HLASOVAT + OPRAVY HLASOVANIA A ZÁMERY PRI HLASOVANÍ + POPRAVKI IN NAMERE GLASOVANJA + CEARTÚCHÁIN AR AN VÓTA AGUS INTINNÍ VÓTÁLA + KOREKTY GŁOSOWANIA I ZAMIAR GŁOSOWANIA + CORECTĂRI ŞI INTENŢII DE VOT + STEMMERETTELSER OG -INTENTIONER + RECTIFICATIES STEMGEDRAG/ VOORGENOMEN STEMGEDRAG + + + diff --git a/backend/tests/pipelines/data/rcv-list_pv-9-2024-04-24-rcv-fr-noon.xml b/backend/tests/pipelines/data/rcv-list_pv-9-2024-04-24-rcv-fr-noon.xml new file mode 100644 index 000000000..42f4481af --- /dev/null +++ b/backend/tests/pipelines/data/rcv-list_pv-9-2024-04-24-rcv-fr-noon.xml @@ -0,0 +1,83 @@ + + + + + AVERTISSEMENT + NOTICE + HINWEIS + + + Les corrections et intentions de vote sont mentionnées dans ce document sous les points de vote correspondants. Elles sont publiées pour information uniquement et ne modifient en rien le résultat de vote tel qu’annoncé en plénière. Pendant la session, les demandes de corrections et intentions de vote reçues avant 18h30 sont publiées le jour même. Les demandes ultérieures sont publiées à mesure des mises à jour successives de ce document, pendant une durée maximale de deux semaines. Signification des sigles: + (pour), - (contre), 0 (abstention) + + Corrections to votes and voting intentions appear below in the section relating to the vote concerned. 
They are published for information purposes only and do not alter the result of the vote as announced in plenary. During the part-session, requests for corrections to votes and voting intentions received before 18.30 will be published the same day. Subsequent requests will be included in this document each time it is updated in the two weeks following the part-session. Key to symbols: + (in favour), - (against), 0 (abstentions) + + In diesem Dokument sind unter den betreffenden Abstimmungspunkten die Berichtigungen des Stimmverhaltens und das beabsichtigte Stimmverhalten aufgeführt. Diese Angaben dienen ausschließlich der Information; keinesfalls wird durch sie das Abstimmungsergebnis geändert, das im Plenum bekannt gegeben wurde. Während der Tagung werden Anträge zu Berichtigungen des Stimmverhaltens und zum beabsichtigten Stimmverhalten, die bis 18.30 Uhr eingehen, am selben Tag veröffentlicht. Später eingehende Anträge werden sukzessive veröffentlicht, indem dieses Dokument während höchstens zwei Wochen regelmäßig aktualisiert wird. 
Zeichenerklärung: + (dafür), - (dagegen), 0 (Enthaltung) + + + + + ПРОТОКОЛРезултат от поименни гласувания - Приложение 2 + ZÁPISVýsledek jmenovitého hlasování - Příloha 2 + PROTOKOLResultat af afstemningerne ved navneopråb - Bilag 2 + PROTOKOLLErgebnis der namentlichen Abstimmungen - Anlage 2 + ΣΥΝΟΠΤIΚΑ ΠΡΑΚΤIΚΑΑποτέλεσμα των ψηφοφοριών με ονομαστική κλήση - Παράρτηµα 2 + MINUTESResult of roll-call votes - Annex 2 + ACTAResultados de las votaciones nominales - Anexo 2 + PROTOKOLLNimelise hääletuse tulemused - lisa 2 + PÖYTÄKIRJANimenhuutoäänestysten tulokset - Liite 2 + PROCÈS-VERBALRésultat des votes par appel nominal - Annexe 2 + MIONTUAIRISCÍTorthaí na vótála le glaoch rolla - Iarscríbhinn 2 + ZAPISNIKRezultat poimeničnog glasovanja - Prilog 2 + JEGYZŐKÖNYVA név szerinti szavazások eredménye - melléklet 2 + PROCESSO VERBALERisultato delle votazioni per appello nominale - Allegato 2 + PROTOKOLASVardinio balsavimo rezultatai - priedas 2 + PROTOKOLSRezultāti balsošanai pēc saraksta - pielikums 2 + MINUTIRiżultat tal-votazzjoni bis-sejħa tal-ismijiet - Anness 2 + NOTULENUitslag van de hoofdelijke stemmingen - Bijlage 2 + PROTOKÓŁWyniki głosowań imiennych - Załącznik 2 + ATAResultados das votações nominais - Anexo 2 + PROCES-VERBALRezultatul voturilor prin apel nominal - Anexa 2 + ZÁPISNICAVýsledok hlasovania podľa mien - Príloha 2 + ZAPISNIKIzid poimenskega glasovanja - Priloga 2 + PROTOKOLLResultat av omröstningarna med namnupprop - Bilaga 2 + + + A9-0163/2024 - Gabriele Bischoff - Article 10, § 6, alinéa 2 - Am 1 + + + Adamowicz + + + + + Amendments to Parliament’s Rules of Procedure concerning the training on preventing conflict and harassment in the workplace and on good office management + + + + BERICHTIGUNGEN DES STIMMVERHALTENS UND BEABSICHTIGTES STIMMVERHALTEN + RÄTTELSER/AVSIKTSFÖRKLARINGAR TILL AVGIVNA RÖSTER + ÄÄNESTYSKÄYTTÄYTYMISTÄ JA ÄÄNESTYSAIKEITA KOSKEVAT ILMOITUKSET + CORREÇÕES E INTENÇÕES DE VOTO + ПОПРАВКИ В ПОДАДЕНИТЕ ГЛАСОВЕ И НАМЕРЕНИЯ ЗА 
ГЛАСУВАНЕ + KORREZZJONIJIET U INTENZJONIJIET GĦALL-VOT + ΔΙΟΡΘΩΣΕΙΣ ΚΑΙ ΠΡΟΘΕΣΕΙΣ ΨΗΦΟΥ + BALSAVIMO PATAISYMAI IR KETINIMAI + CORRECTIONS TO VOTES AND VOTING INTENTIONS + BALSOJUMU LABOJUMI UN NODOMI BALSOT + IZMJENE DANIH GLASOVA I NAMJERE GLASAČA + CORREZIONI E INTENZIONI DI VOTO + CORRECTIONS ET INTENTIONS DE VOTE + SZAVAZATOK HELYESBÍTÉSEI ÉS SZAVAZÁSI SZÁNDÉKOK + CORRECCIONES E INTENCIONES DE VOTO + HÄÄLETUSE PARANDUSED JA HÄÄLETUSKAVATSUSED + OPRAVY HLASOVÁNÍ A SDĚLENÍ O ÚMYSLU HLASOVAT + OPRAVY HLASOVANIA A ZÁMERY PRI HLASOVANÍ + POPRAVKI IN NAMERE GLASOVANJA + CEARTÚCHÁIN AR AN VÓTA AGUS INTINNÍ VÓTÁLA + KOREKTY GŁOSOWANIA I ZAMIAR GŁOSOWANIA + CORECTĂRI ŞI INTENŢII DE VOT + STEMMERETTELSER OG -INTENTIONER + RECTIFICATIES STEMGEDRAG/ VOORGENOMEN STEMGEDRAG + + + diff --git a/backend/tests/pipelines/test_rcv_list.py b/backend/tests/pipelines/test_rcv_list.py index 4cbc363ca..a03809f4a 100644 --- a/backend/tests/pipelines/test_rcv_list.py +++ b/backend/tests/pipelines/test_rcv_list.py @@ -1,11 +1,75 @@ import datetime import pytest +from sqlalchemy import select -from howtheyvote.pipelines import DataUnavailableError, RCVListPipeline +from howtheyvote.models import Group, GroupMembership, Member, Vote +from howtheyvote.pipelines import DataUnavailableError, DataUnchangedError, RCVListPipeline + +from ..scrapers.helpers import load_fixture def test_run_source_not_available(responses, db_session): with pytest.raises(DataUnavailableError): pipe = RCVListPipeline(term=9, date=datetime.date(2024, 4, 10)) pipe.run() + + +def test_run_data_unchanged(responses, db_session): + responses.get( + "https://www.europarl.europa.eu/doceo/document/PV-9-2024-04-24-RCV_FR.xml", + body=load_fixture("../../pipelines/data/rcv-list_pv-9-2024-04-24-rcv-fr-noon.xml"), + ) + + member = Member( + id=197490, + first_name="Magdalena", + last_name="ADAMOWICZ", + group_memberships=[ + GroupMembership( + term=9, + start_date=datetime.datetime(2019, 7, 2), + end_date=datetime.datetime(2024, 
7, 15), + group=Group["EPP"], + ), + ], + ) + db_session.add(member) + db_session.commit() + + # Run the pipeline for the first time + pipe = RCVListPipeline( + term=9, + date=datetime.date(2024, 4, 24), + ) + pipe.run() + last_run_checksum = pipe.checksum + + vote_ids = list(db_session.execute(select(Vote.id)).scalars()) + assert vote_ids == [168834] + + # Run the pipeline again and provide the checksum of the first run + with pytest.raises(DataUnchangedError): + pipe = RCVListPipeline( + term=9, + date=datetime.date(2024, 4, 24), + last_run_checksum=last_run_checksum, + ) + pipe.run() + + # Simulate that the source data has been updated in the meantime + responses.get( + "https://www.europarl.europa.eu/doceo/document/PV-9-2024-04-24-RCV_FR.xml", + body=load_fixture("../../pipelines/data/rcv-list_pv-9-2024-04-24-rcv-fr-evening.xml"), + ) + + # Run the pipeline again and provide the checksum of the first run + pipe = RCVListPipeline( + term=9, + date=datetime.date(2024, 4, 24), + last_run_checksum=last_run_checksum, + ) + pipe.run() + + vote_ids = list(db_session.execute(select(Vote.id)).scalars()) + assert vote_ids == [168834, 168864]