From 73e0ffe1ad6f78c337f585782dfc2c614aa8b7f8 Mon Sep 17 00:00:00 2001 From: Till Prochaska <1512805+tillprochaska@users.noreply.github.com> Date: Sun, 17 Nov 2024 14:02:39 +0100 Subject: [PATCH 1/8] Optionally disable all HTTP request mocks --- backend/tests/conftest.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py index 26bd85e14..2b1aa172c 100644 --- a/backend/tests/conftest.py +++ b/backend/tests/conftest.py @@ -5,8 +5,10 @@ executed and rollback any changes after execution. In order to test API routes, we make use of Flask’s built-in test client.""" +import os + import pytest -import responses as responses_lib +from responses import FirstMatchRegistry, RequestsMock from howtheyvote.db import Session, engine, migrate, session_factory from howtheyvote.meili import configure_indexes, delete_indexes @@ -55,8 +57,33 @@ def api(app): yield app.test_client() +class DummyRegistry(FirstMatchRegistry): + """A registry that ignores any responses that are added.""" + + def add(self, response): + return response + + @pytest.fixture def responses(): """Allows mocking HTTP requests made with requests.""" - with responses_lib.RequestsMock() as r: - yield r + + mock_requests = os.environ.get("HTV_TEST_MOCK_REQUESTS", "true").lower() in ["true", "1"] + + if not mock_requests: + # In most cases, we want HTTP requests in tests to be mocked. The `responses` package + # doesn’t seem to provide a global configuration option to disable all mocks and pass + # through the request. + # + # When calling `responses.get("http://...", body="Lorem ipsum")` in a test to register + # a mock response, the mock is stored in a registry. When the tested code then tries to send + # a matching request, `responses` tries to find a matching mock in the registry. 
To + # disable all mocks, we simply pass a dummy registry that never actually registers any + # mocks and allow all unmatched requests to pass to the original source. + with RequestsMock(registry=DummyRegistry) as r: + r.add_passthru("http") + yield r + else: + # Return a "normal" requests mock that fails any request that isn’t explicitly mocked. + with RequestsMock() as r: + yield r From 52120e164abdce55f90ef01838be97379d0a047b Mon Sep 17 00:00:00 2001 From: Till Prochaska <1512805+tillprochaska@users.noreply.github.com> Date: Sun, 17 Nov 2024 15:59:13 +0100 Subject: [PATCH 2/8] Run scraper tests against live data sources every Tuesday night --- .github/workflows/scheduled.yml | 48 +++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 .github/workflows/scheduled.yml diff --git a/.github/workflows/scheduled.yml b/.github/workflows/scheduled.yml new file mode 100644 index 000000000..08943857e --- /dev/null +++ b/.github/workflows/scheduled.yml @@ -0,0 +1,48 @@ +name: Scheduled scraper tests + +on: + pull_request: {} + workflow_dispatch: {} + schedule: + - cron: "0 0 * * 2" # every Tuesday at 00:00 + +jobs: + build: + runs-on: ubuntu-latest + + defaults: + run: + working-directory: ./backend + + services: + meilisearch: + image: "getmeili/meilisearch:v1.3.1" + ports: ["7700:7700"] + env: + MEILI_MASTER_KEY: "1234567890" + + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Install poetry + run: pipx install poetry + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: "poetry" + cache-dependency-path: "./backend/poetry.lock" + + - name: Install dependencies + run: poetry install + + - name: Run tests against live data sources + run: make test + env: + HTV_TEST_MOCK_REQUESTS: "false" + HTV_BACKEND_DATABASE_URI: "sqlite:///${{ github.workspace }}/storage/database/database.sqlite3" + HTV_BACKEND_USERS_DATABASE_URI: "sqlite:///${{ github.workspace 
}}/storage/database/users.sqlite3" + MEILI_MASTER_KEY: "1234567890" + MEILI_URL: "http://localhost:7700" From 3e72aa8c0aff52a2a651bc88238a2da151e05ab6 Mon Sep 17 00:00:00 2001 From: Till Prochaska <1512805+tillprochaska@users.noreply.github.com> Date: Sun, 17 Nov 2024 16:40:48 +0100 Subject: [PATCH 3/8] Update procedure scraper URLs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This isn’t a breaking change in practice as the old URLs still work and redirect to the new URLs. --- backend/howtheyvote/scrapers/votes.py | 5 +---- ...html => oeil-procedure-file_2022-2201-ini.html} | 0 ...html => oeil-procedure-file_2022-2852-rsp.html} | 0 ...html => oeil-procedure-file_2023-2019-ini.html} | 0 backend/tests/scrapers/test_votes.py | 14 +++++++------- 5 files changed, 8 insertions(+), 11 deletions(-) rename backend/tests/scrapers/data/votes/{procedure_ficheprocedure-2022-2201-ini.html => oeil-procedure-file_2022-2201-ini.html} (100%) rename backend/tests/scrapers/data/votes/{procedure_ficheprocedure-2022-2852-rsp.html => oeil-procedure-file_2022-2852-rsp.html} (100%) rename backend/tests/scrapers/data/votes/{procedure_ficheprocedure-2023-2019-ini.html => oeil-procedure-file_2023-2019-ini.html} (100%) diff --git a/backend/howtheyvote/scrapers/votes.py b/backend/howtheyvote/scrapers/votes.py index f92a4475a..8fdb7dbd3 100644 --- a/backend/howtheyvote/scrapers/votes.py +++ b/backend/howtheyvote/scrapers/votes.py @@ -380,10 +380,7 @@ def _procedure_reference(self, doc: BeautifulSoup) -> str | None: class ProcedureScraper(BeautifulSoupScraper): BS_PARSER = "lxml" - BASE_URL = ( - "https://oeil.secure.europarl.europa.eu/" - "oeil/popups/ficheprocedure.do?lang=en&reference=" - ) + BASE_URL = "https://oeil.secure.europarl.europa.eu/oeil/en/procedure-file?reference=" TITLE_PREFIXES = ["Resolution on", "Motion"] diff --git a/backend/tests/scrapers/data/votes/procedure_ficheprocedure-2022-2201-ini.html 
b/backend/tests/scrapers/data/votes/oeil-procedure-file_2022-2201-ini.html similarity index 100% rename from backend/tests/scrapers/data/votes/procedure_ficheprocedure-2022-2201-ini.html rename to backend/tests/scrapers/data/votes/oeil-procedure-file_2022-2201-ini.html diff --git a/backend/tests/scrapers/data/votes/procedure_ficheprocedure-2022-2852-rsp.html b/backend/tests/scrapers/data/votes/oeil-procedure-file_2022-2852-rsp.html similarity index 100% rename from backend/tests/scrapers/data/votes/procedure_ficheprocedure-2022-2852-rsp.html rename to backend/tests/scrapers/data/votes/oeil-procedure-file_2022-2852-rsp.html diff --git a/backend/tests/scrapers/data/votes/procedure_ficheprocedure-2023-2019-ini.html b/backend/tests/scrapers/data/votes/oeil-procedure-file_2023-2019-ini.html similarity index 100% rename from backend/tests/scrapers/data/votes/procedure_ficheprocedure-2023-2019-ini.html rename to backend/tests/scrapers/data/votes/oeil-procedure-file_2023-2019-ini.html diff --git a/backend/tests/scrapers/test_votes.py b/backend/tests/scrapers/test_votes.py index d3c953faf..d276c3013 100644 --- a/backend/tests/scrapers/test_votes.py +++ b/backend/tests/scrapers/test_votes.py @@ -224,8 +224,8 @@ def test_rcv_list_scraper_timestamp_from_text(responses): def test_procedure_scraper(responses): responses.get( - "https://oeil.secure.europarl.europa.eu/oeil/popups/ficheprocedure.do?lang=en&reference=2023/2019(INI)", - body=load_fixture("votes/procedure_ficheprocedure-2023-2019-ini.html"), + "https://oeil.secure.europarl.europa.eu/oeil/en/procedure-file?reference=2023/2019(INI)", + body=load_fixture("votes/oeil-procedure-file_2023-2019-ini.html"), ) scraper = ProcedureScraper(vote_id=162214, procedure_reference="2023/2019(INI)") @@ -236,7 +236,7 @@ def test_procedure_scraper(responses): source_name="ProcedureScraper", source_id=162214, group_key=162214, - 
source_url="https://oeil.secure.europarl.europa.eu/oeil/popups/ficheprocedure.do?lang=en&reference=2023/2019(INI)", + source_url="https://oeil.secure.europarl.europa.eu/oeil/en/procedure-file?reference=2023/2019(INI)", data={ "procedure_title": "Implementation of the 2018 Geoblocking Regulation in the Digital Single Market", "geo_areas": [], @@ -248,8 +248,8 @@ def test_procedure_scraper(responses): def test_procedure_scraper_geo_areas(responses): responses.get( - "https://oeil.secure.europarl.europa.eu/oeil/popups/ficheprocedure.do?lang=en&reference=2022/2852(RSP)", - body=load_fixture("votes/procedure_ficheprocedure-2022-2852-rsp.html"), + "https://oeil.secure.europarl.europa.eu/oeil/en/procedure-file?reference=2022/2852(RSP)", + body=load_fixture("votes/oeil-procedure-file_2022-2852-rsp.html"), ) scraper = ProcedureScraper(vote_id=149218, procedure_reference="2022/2852(RSP)") @@ -259,8 +259,8 @@ def test_procedure_scraper_geo_areas(responses): def test_procedure_scraper_geo_areas_fuzzy(responses): responses.get( - "https://oeil.secure.europarl.europa.eu/oeil/popups/ficheprocedure.do?lang=en&reference=2022/2201(INI)", - body=load_fixture("votes/procedure_ficheprocedure-2022-2201-ini.html"), + "https://oeil.secure.europarl.europa.eu/oeil/en/procedure-file?reference=2022/2201(INI)", + body=load_fixture("votes/oeil-procedure-file_2022-2201-ini.html"), ) scraper = ProcedureScraper(vote_id=155056, procedure_reference="2022/2201(INI)") From b10164e0446c4f642050884af6a500373e61d2cc Mon Sep 17 00:00:00 2001 From: Till Prochaska <1512805+tillprochaska@users.noreply.github.com> Date: Sun, 17 Nov 2024 17:04:50 +0100 Subject: [PATCH 4/8] Fix extraction of geographic areas after OEIL markup change --- backend/howtheyvote/scrapers/votes.py | 16 +- .../oeil-procedure-file_2022-2201-ini.html | 4216 ++++++++--------- .../oeil-procedure-file_2022-2852-rsp.html | 3361 ++++++------- backend/tests/scrapers/test_votes.py | 2 +- 4 files changed, 3318 insertions(+), 4277 deletions(-) 
diff --git a/backend/howtheyvote/scrapers/votes.py b/backend/howtheyvote/scrapers/votes.py index 8fdb7dbd3..eba7258e3 100644 --- a/backend/howtheyvote/scrapers/votes.py +++ b/backend/howtheyvote/scrapers/votes.py @@ -434,23 +434,23 @@ def _title(self, doc: BeautifulSoup) -> str | None: return normalized_title[:1].upper() + normalized_title[1:] def _geo_areas(self, doc: BeautifulSoup) -> list[str]: - start = doc.select_one( - '#basic-information-data strong:-soup-contains("Geographical area")' + # The website unfortunately doesn't use semantic markup, so we have + # to rely on visual properties + wrapper = doc.select_one( + '#section1 p.font-weight-bold:-soup-contains("Geographical area") + p' ) - if not start: + if not wrapper: return [] geo_areas = [] - for sibling in start.next_siblings: - if isinstance(sibling, Tag) and sibling.name == "strong": - break + for node in wrapper.children: + country_name = node.get_text(strip=True) - if not sibling.get_text(strip=True): + if not country_name: continue - country_name = sibling.get_text(strip=True) country = Country.from_label(country_name, fuzzy=True) if not country: diff --git a/backend/tests/scrapers/data/votes/oeil-procedure-file_2022-2201-ini.html b/backend/tests/scrapers/data/votes/oeil-procedure-file_2022-2201-ini.html index 23eccd29e..430ca2391 100644 --- a/backend/tests/scrapers/data/votes/oeil-procedure-file_2022-2201-ini.html +++ b/backend/tests/scrapers/data/votes/oeil-procedure-file_2022-2201-ini.html @@ -1,2376 +1,1924 @@ - - - -
-