From 73e0ffe1ad6f78c337f585782dfc2c614aa8b7f8 Mon Sep 17 00:00:00 2001 From: Till Prochaska <1512805+tillprochaska@users.noreply.github.com> Date: Sun, 17 Nov 2024 14:02:39 +0100 Subject: [PATCH 1/8] Optionally disable all HTTP request mocks --- backend/tests/conftest.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/backend/tests/conftest.py b/backend/tests/conftest.py index 26bd85e14..2b1aa172c 100644 --- a/backend/tests/conftest.py +++ b/backend/tests/conftest.py @@ -5,8 +5,10 @@ executed and rollback any changes after execution. In order to test API routes, we make use of Flask’s built-in test client.""" +import os + import pytest -import responses as responses_lib +from responses import FirstMatchRegistry, RequestsMock from howtheyvote.db import Session, engine, migrate, session_factory from howtheyvote.meili import configure_indexes, delete_indexes @@ -55,8 +57,33 @@ def api(app): yield app.test_client() +class DummyRegistry(FirstMatchRegistry): + """A registry that ignores any requests that are added.""" + + def add(self, response): + return response + + @pytest.fixture def responses(): """Allows mocking HTTP requests made with requests.""" - with responses_lib.RequestsMock() as r: - yield r + + mock_requests = os.environ.get("HTV_TEST_MOCK_REQUESTS", "true").lower() in ["true", "1"] + + if not mock_requests: + # In most cases, we want HTTP requests in tests to be mocked. The `responses` package + # doesn’t seem to provide a global configuration option to disable all mocks and pass + # through the request. + # + # When calling `responses.get("http://...", body="Lorem ipsum")` in a test to register + # a mock response, the mock is stored in a registry. When the tested then tries to send + # a matching request, `responses` tries to find a matching mock in the registry. To + # disable all mocks, we simply pass a dummy registry that never actually registers any + # mocks and allow all unmatched requests to pass to the original source. + with RequestsMock(registry=DummyRegistry) as r: + r.add_passthru("http") + yield r + else: + # Return a "normal" requests mock that fails any request that isn’t explicitly mocked. + with RequestsMock() as r: + yield r From 52120e164abdce55f90ef01838be97379d0a047b Mon Sep 17 00:00:00 2001 From: Till Prochaska <1512805+tillprochaska@users.noreply.github.com> Date: Sun, 17 Nov 2024 15:59:13 +0100 Subject: [PATCH 2/8] Run scraper tests against live data sources every Tuesday night --- .github/workflows/scheduled.yml | 48 +++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 .github/workflows/scheduled.yml diff --git a/.github/workflows/scheduled.yml b/.github/workflows/scheduled.yml new file mode 100644 index 000000000..08943857e --- /dev/null +++ b/.github/workflows/scheduled.yml @@ -0,0 +1,48 @@ +name: Scheduled scraper tests + +on: + pull_request: {} + workflow_dispatch: {} + schedule: + - cron: "0 0 * * 2" # every Tuesday at 00:00 + +jobs: + build: + runs-on: ubuntu-latest + + defaults: + run: + working-directory: ./backend + + services: + meilisearch: + image: "getmeili/meilisearch:v1.3.1" + ports: ["7700:7700"] + env: + MEILI_MASTER_KEY: "1234567890" + + steps: + - name: Checkout repo + uses: actions/checkout@v4 + + - name: Install poetry + run: pipx install poetry + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: "poetry" + cache-dependency-path: "./backend/poetry.lock" + + - name: Install dependencies + run: poetry install + + - name: Run tests against live data sources + run: make test + env: + HTV_TEST_MOCK_REQUESTS: "false" + HTV_BACKEND_DATABASE_URI: "sqlite:///${{ github.workspace }}/storage/database/database.sqlite3" + HTV_BACKEND_USERS_DATABASE_URI: "sqlite:///${{ github.workspace }}/storage/database/users.sqlite3" + MEILI_MASTER_KEY: "1234567890" + MEILI_URL: "http://localhost:7700" From 3e72aa8c0aff52a2a651bc88238a2da151e05ab6 Mon Sep 17 00:00:00 2001 From: Till Prochaska <1512805+tillprochaska@users.noreply.github.com> Date: Sun, 17 Nov 2024 16:40:48 +0100 Subject: [PATCH 3/8] Update procedure scraper URLs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This isn’t a breaking change in practice as the old URLs still work and redirect to the new URLs. --- backend/howtheyvote/scrapers/votes.py | 5 +---- ...html => oeil-procedure-file_2022-2201-ini.html} | 0 ...html => oeil-procedure-file_2022-2852-rsp.html} | 0 ...html => oeil-procedure-file_2023-2019-ini.html} | 0 backend/tests/scrapers/test_votes.py | 14 +++++++------- 5 files changed, 8 insertions(+), 11 deletions(-) rename backend/tests/scrapers/data/votes/{procedure_ficheprocedure-2022-2201-ini.html => oeil-procedure-file_2022-2201-ini.html} (100%) rename backend/tests/scrapers/data/votes/{procedure_ficheprocedure-2022-2852-rsp.html => oeil-procedure-file_2022-2852-rsp.html} (100%) rename backend/tests/scrapers/data/votes/{procedure_ficheprocedure-2023-2019-ini.html => oeil-procedure-file_2023-2019-ini.html} (100%) diff --git a/backend/howtheyvote/scrapers/votes.py b/backend/howtheyvote/scrapers/votes.py index f92a4475a..8fdb7dbd3 100644 --- a/backend/howtheyvote/scrapers/votes.py +++ b/backend/howtheyvote/scrapers/votes.py @@ -380,10 +380,7 @@ def _procedure_reference(self, doc: BeautifulSoup) -> str | None: class ProcedureScraper(BeautifulSoupScraper): BS_PARSER = "lxml" - BASE_URL = ( - "https://oeil.secure.europarl.europa.eu/" - "oeil/popups/ficheprocedure.do?lang=en&reference=" - ) + BASE_URL = "https://oeil.secure.europarl.europa.eu/oeil/en/procedure-file?reference=" TITLE_PREFIXES = ["Resolution on", "Motion"] diff --git a/backend/tests/scrapers/data/votes/procedure_ficheprocedure-2022-2201-ini.html b/backend/tests/scrapers/data/votes/oeil-procedure-file_2022-2201-ini.html similarity index 100% rename from backend/tests/scrapers/data/votes/procedure_ficheprocedure-2022-2201-ini.html rename to backend/tests/scrapers/data/votes/oeil-procedure-file_2022-2201-ini.html diff --git a/backend/tests/scrapers/data/votes/procedure_ficheprocedure-2022-2852-rsp.html b/backend/tests/scrapers/data/votes/oeil-procedure-file_2022-2852-rsp.html similarity index 100% rename from backend/tests/scrapers/data/votes/procedure_ficheprocedure-2022-2852-rsp.html rename to backend/tests/scrapers/data/votes/oeil-procedure-file_2022-2852-rsp.html diff --git a/backend/tests/scrapers/data/votes/procedure_ficheprocedure-2023-2019-ini.html b/backend/tests/scrapers/data/votes/oeil-procedure-file_2023-2019-ini.html similarity index 100% rename from backend/tests/scrapers/data/votes/procedure_ficheprocedure-2023-2019-ini.html rename to backend/tests/scrapers/data/votes/oeil-procedure-file_2023-2019-ini.html diff --git a/backend/tests/scrapers/test_votes.py b/backend/tests/scrapers/test_votes.py index d3c953faf..d276c3013 100644 --- a/backend/tests/scrapers/test_votes.py +++ b/backend/tests/scrapers/test_votes.py @@ -224,8 +224,8 @@ def test_rcv_list_scraper_timestamp_from_text(responses): def test_procedure_scraper(responses): responses.get( - "https://oeil.secure.europarl.europa.eu/oeil/popups/ficheprocedure.do?lang=en&reference=2023/2019(INI)", - body=load_fixture("votes/procedure_ficheprocedure-2023-2019-ini.html"), + "https://oeil.secure.europarl.europa.eu/oeil/en/procedure-file?reference=2023/2019(INI)", + body=load_fixture("votes/oeil-procedure-file_2023-2019-ini.html"), ) scraper = ProcedureScraper(vote_id=162214, procedure_reference="2023/2019(INI)") @@ -236,7 +236,7 @@ def test_procedure_scraper(responses): source_name="ProcedureScraper", source_id=162214, group_key=162214, - source_url="https://oeil.secure.europarl.europa.eu/oeil/popups/ficheprocedure.do?lang=en&reference=2023/2019(INI)", + source_url="https://oeil.secure.europarl.europa.eu/oeil/en/procedure-file?reference=2023/2019(INI)", data={ "procedure_title": "Implementation of the 2018 Geoblocking Regulation in the Digital Single Market", "geo_areas": [], @@ -248,8 +248,8 @@ def test_procedure_scraper(responses): def test_procedure_scraper_geo_areas(responses): responses.get( - "https://oeil.secure.europarl.europa.eu/oeil/popups/ficheprocedure.do?lang=en&reference=2022/2852(RSP)", - body=load_fixture("votes/procedure_ficheprocedure-2022-2852-rsp.html"), + "https://oeil.secure.europarl.europa.eu/oeil/en/procedure-file?reference=2022/2852(RSP)", + body=load_fixture("votes/oeil-procedure-file_2022-2852-rsp.html"), ) scraper = ProcedureScraper(vote_id=149218, procedure_reference="2022/2852(RSP)") @@ -259,8 +259,8 @@ def test_procedure_scraper_geo_areas(responses): def test_procedure_scraper_geo_areas_fuzzy(responses): responses.get( - "https://oeil.secure.europarl.europa.eu/oeil/popups/ficheprocedure.do?lang=en&reference=2022/2201(INI)", - body=load_fixture("votes/procedure_ficheprocedure-2022-2201-ini.html"), + "https://oeil.secure.europarl.europa.eu/oeil/en/procedure-file?reference=2022/2201(INI)", + body=load_fixture("votes/oeil-procedure-file_2022-2201-ini.html"), ) scraper = ProcedureScraper(vote_id=155056, procedure_reference="2022/2201(INI)") From b10164e0446c4f642050884af6a500373e61d2cc Mon Sep 17 00:00:00 2001 From: Till Prochaska <1512805+tillprochaska@users.noreply.github.com> Date: Sun, 17 Nov 2024 17:04:50 +0100 Subject: [PATCH 4/8] Fix extraction of geographic areas after OEIL markup change --- backend/howtheyvote/scrapers/votes.py | 16 +- .../oeil-procedure-file_2022-2201-ini.html | 4216 ++++++++--------- .../oeil-procedure-file_2022-2852-rsp.html | 3361 ++++++------- backend/tests/scrapers/test_votes.py | 2 +- 4 files changed, 3318 insertions(+), 4277 deletions(-) diff --git a/backend/howtheyvote/scrapers/votes.py b/backend/howtheyvote/scrapers/votes.py index 8fdb7dbd3..eba7258e3 100644 --- a/backend/howtheyvote/scrapers/votes.py +++ b/backend/howtheyvote/scrapers/votes.py @@ -434,23 +434,23 @@ def _title(self, doc: BeautifulSoup) -> str | None: return normalized_title[:1].upper() + normalized_title[1:] def _geo_areas(self, doc: BeautifulSoup) -> list[str]: - start = doc.select_one( - '#basic-information-data strong:-soup-contains("Geographical area")' + # The website unfortunately doesn't use semantic markup, so we have + # to rely on visual properties + wrapper = doc.select_one( + '#section1 p.font-weight-bold:-soup-contains("Geographical area") + p' ) - if not start: + if not wrapper: return [] geo_areas = [] - for sibling in start.next_siblings: - if isinstance(sibling, Tag) and sibling.name == "strong": - break + for node in wrapper.children: + country_name = node.get_text(strip=True) - if not sibling.get_text(strip=True): + if not country_name: continue - country_name = sibling.get_text(strip=True) country = Country.from_label(country_name, fuzzy=True) if not country: diff --git a/backend/tests/scrapers/data/votes/oeil-procedure-file_2022-2201-ini.html b/backend/tests/scrapers/data/votes/oeil-procedure-file_2022-2201-ini.html index 23eccd29e..430ca2391 100644 --- a/backend/tests/scrapers/data/votes/oeil-procedure-file_2022-2201-ini.html +++ b/backend/tests/scrapers/data/votes/oeil-procedure-file_2022-2201-ini.html @@ -1,2376 +1,1924 @@ - - - - - Procedure File: 2022/2201(INI) | Legislative Observatory | European Parliament - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - -
- -
+ + +
+ + + + +
+ +
+ + +
+
+
+
+
+

2022/2201(INI)

+
+ +
+ +

2022 Commission Report on Kosovo

+
+ + +
+ + + +
+
+
+
+
+ + + + +
+
+
+
+

Basic information

+ +
+
+
+

2022/2201(INI)

+ +

INI - Own-initiative procedure

+ + + + - - + + +

Subject

+

+ + 8.20 Enlargement of the Union + + +

+ + + +

Geographical area

+

+ + Kosovo under UNSCR 1244/1999 + + +

+ + + +
+ +
+ +

Status

+

Procedure completed

+ + +
- - - +
+ + + +
+
+
+
+

Key players

+
+ +
+
+ +
+
+ +
+ + + + + + + +
+
+
+
+
+

Key events

+
+
-
- +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
DateEventReferenceSummary
19/01/2023Committee referral announced in Parliament + + + + + + + + + + + + + + + + +
26/04/2023Vote in committee + + + + + + + + + + + + + + + + +
03/05/2023Committee report tabled for plenary + + A9-0174/2023 + + + + + + + + + + + + + + +
09/05/2023Debate in Parliament + + + + + + + + + + + + + + + + + + + + +
10/05/2023Decision by Parliament + + T9-0193/2023 + + + + + + + + + + + + + + + + +
10/05/2023Results of vote in Parliament + + + + + + + + + + + + + + +
+
-   - - + + +
  • + +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    Document typeReferenceDateSummary
    + Commission response to text adopted in plenary + + SP(2023)618 + + + + + 31/01/2024 + +
    -
    -
    - -
    - - - - - - - +
    +
  • + + +
    +
    +
    + + + + + + + + + + + +
    +
    +
    +
    +

    Transparency

    +
    +
    + - - - - + +

    Meetings with interest representatives published in line with the Rules of Procedure

    +
    + +
      +
    • + +
      +
      +
      + + + + + + + + + + + + + + + + + + + +
      NameDateInterest representatives
      + STRIK Tineke + 28/02/2023 + + representative of the government of Kosovo + + +
      +
      +
      - +
    • +
    +
    +
    +
    + + + + + + + + + + + + + + + + + + + + +
    - - + + + +
    +
    + + +
    + + +
    + + +
    - -
    - - - -
    -
    - - -
    -
    -
    -
    -
    -
    -
    -
    -
    - -
    - -
    -
    -
    -
    -
    - -
    -
    -
    -
    -
    - - -
    - -
    - - - - - -
    -
    -
    -
    - - - - - - - -
    - - -

    Document reference

    -
    -
    -
    -
    -
    -
    -
    - - 2022/2201(INI) - -   -
    -
    -
    -
    -
    -
    -
    - - - -
    -
    - -
    -
    -
    -
    -
    - - 2022 Commission Report on Kosovo - -   -
    -
    -
    -
    -
    -
    -
    - - - - -
    -
    - -
    -
    -
    -
    -
    -
    -
    -
    - - Basic information - -   -
    -
    -
    -
    -
    - -
    -
    -
    -
    -
    - -
    -
    - -
    -
    -
    -
    -
    -
    -

    2022/2201(INI)

    -

    - INI - Own-initiative procedure -

    - -

    - -
    Subject
    - 8.20 Enlargement of the Union
    -
    Geographical area
    - Kosovo under UNSCR 1244/1999
    -

    -
    -
    -
    -
    -
    -
    -

    Status

    -

    - Procedure completed -

    -

     

    - - - -
    -
    -
    -
    -
    -
    - - -
    -
    - - - - -
    -
    - -
    -
    -
    -
    -
    -
    -
    -
    - - - Key players - -   -
    -
    -
    -
    -
    - -
    -
    -
    -
    - -
    - -
    -
    -
    - -
    - -
    - - - - - -
    -
    -
    - - - - - - - - -
    -
    -
    -
    - -
    -
    -
    -
    -
    -
    -
    -
    - - - Key events - -   -
    -
    -
    -
    -
    -
    - -
    -
    -
    -
    -
    - -
    - - -
    -
    -
    - - -
    -
    -
    - 19/01/2023 -
    -
    -
    -
    - Committee referral announced in Parliament -
    -
    -
    -
    - - - -
    -
    -
    -
    -
    - -
    -
    -
    - 26/04/2023 -
    -
    -
    -
    - Vote in committee -
    -
    -
    -
    - - - -
    -
    -
    -
    -
    - -
    -
    -
    - 03/05/2023 -
    -
    -
    -
    - Committee report tabled for plenary -
    -
    -
    -
    - - A9-0174/2023 - -
    -
    -
    -
    -
    - -
    -
    -
    - 09/05/2023 -
    -
    -
    -
    - Debate in Parliament -
    -
    -
    -
    - - - -
    -
    -
    -
    -
    - -
    -
    -
    - 10/05/2023 -
    -
    -
    -
    - Results of vote in Parliament -
    -
    -
    -
    - - -
    -
    -
    -
    -
    - -
    -
    -
    - 10/05/2023 -
    -
    -
    -
    - Decision by Parliament -
    -
    -
    -
    - - T9-0193/2023 - -
    -
    -
    -
    -
    - -
    -
    -
    -
    - - - -
    -
    -
    - - - - - - - - - - - -
    -
    -
    -
    - -
    -
    -
    -
    -
    -
    -
    -
    - - - Technical information - -   -
    -
    -
    -
    -
    - -
    -
    -
    -
    - -
    - -
    -
    -
    - -
    -
    -
    - - - Procedure reference - - -
    -
    -
    -
    - 2022/2201(INI) -
    -
    -
    - -
    -
    -
    - - Procedure type - -
    -
    -
    -
    - INI - Own-initiative procedure -
    -
    -
    - -
    -
    -
    - Procedure subtype -
    -
    -
    -
    - Annual report -
    -
    -
    - - - - -
    -
    -
    - Legal basis -
    -
    -
    -
    - Rules of Procedure EP 54 -
    -
    -
    - - - -
    -
    -
    - Stage reached in procedure -
    -
    -
    -
    - Procedure completed -
    -
    -
    - -
    -
    -
    - Committee dossier -
    -
    -
    -
    - AFET/9/10922 -
    -
    -
    - - - - -
    -
    -
    - - - - -
    -
    - - -
    -
    - -
    -
    -
    -
    -
    -
    -
    -
    - - - Documentation gateway - -   -
    -
    -
    -
    -
    - -
    -
    -
    -
    - -
    - - - - - -
    -
    -
    -
    -
      - -
    • -
      - European Parliament  -
      -
      -
      - -
      - - -
      -
      - - Committee draft report - -
      -
      - - -
      -
      - -   - -   -
      -
      - - - - - -
      -
      - 25/01/2023 -
      -
      - - -
      -
      - - EP - -
      -
      - - -
      -
      -
      -
      -   -
      -
      -
      -
      -
      - -
      - - -
      -
      - - Amendments tabled in committee - -
      -
      - - -
      -
      - -   - -   -
      -
      - - - - - -
      -
      - 16/02/2023 -
      -
      - - -
      -
      - - EP - -
      -
      - - -
      -
      -
      -
      -   -
      -
      -
      -
      -
      - -
      - - -
      -
      - - Committee report tabled for plenary, single reading - -
      -
      - - -
      -
      - -   - -   -
      -
      - - - - - -
      -
      - 03/05/2023 -
      -
      - - -
      -
      - - EP - -
      -
      - - -
      -
      -
      -
      -   -
      -
      -
      -
      -
      - -
      - - -
      -
      - - Text adopted by Parliament, single reading - -
      -
      - - -
      -
      - -   - -   -
      -
      - - - - - -
      -
      - 10/05/2023 -
      -
      - - -
      -
      - - EP - -
      -
      - - -
      -
      -
      - -
      -
      -
      -
      -
      -
      -
    • - - - - - - - - - - - -
    • -
      - All  -
      -
      -
      - -
      - - -
      -
      - - Committee draft report - -
      -
      - - -
      -
      - -   - -   -
      -
      - - - - - -
      -
      - 25/01/2023 -
      -
      - - -
      -
      - - EP - -
      -
      - - -
      -
      -
      -
      -   -
      -
      -
      -
      -
      - -
      - - -
      -
      - - Amendments tabled in committee - -
      -
      - - -
      -
      - -   - -   -
      -
      - - - - - -
      -
      - 16/02/2023 -
      -
      - - -
      -
      - - EP - -
      -
      - - -
      -
      -
      -
      -   -
      -
      -
      -
      -
      - -
      - - -
      -
      - - Committee report tabled for plenary, single reading - -
      -
      - - -
      -
      - -   - -   -
      -
      - - - - - -
      -
      - 03/05/2023 -
      -
      - - -
      -
      - - EP - -
      -
      - - -
      -
      -
      -
      -   -
      -
      -
      -
      -
      - -
      - - -
      -
      - - Text adopted by Parliament, single reading - -
      -
      - - -
      -
      - -   - -   -
      -
      - - - - - -
      -
      - 10/05/2023 -
      -
      - - -
      -
      - - EP - -
      -
      - - -
      -
      -
      - -
      -
      -
      -
      -
      -
      -
    • - -
    -
    -
    -
    -
    - - - - - - - - - -
    -
    - -
    -
    - -
    -
    -
    -
    -
    -
    -
    -
    - - - Transparency - -   -
    -
    -
    -
    -
    - -
    -
    - - Meetings with interest representatives published in line with the Rules of Procedure - -
    -
    -
    -
    - -
    - - -
    -
    -
    -
    - -
      -
    • -
      - Rapporteurs, Shadow Rapporteurs and Committee Chairs  -
      -
      -
      -
      - - - - - -
      -
      - - Rapporteur - -
      -
      - - -
      -
      - - -   -
      -
      - - -
      -
      - 20/03/2023 -
      -
      - - -
      -
      - - Platforma Civikos
      -
      -
      -
      - -
      -
      - - -
      - -
      - - -
      -
      - - Shadow rapporteur - -
      -
      - - -
      -
      - - -   -
      -
      - - -
      -
      - 18/01/2023 -
      -
      - - -
      -
      - - Cabinet to the First Deputy Prime Minister of the Republic of Kosovo
      -
      -
      -
      - -
      -
      -
      -
    • -
    • -
      - Other Members  -
      -
      -
      -
      - - -
      -
      - - STRIK Tineke - -
      -
      - - -
      -
      - 28/02/2023 -
      -
      - - -
      -
      - - representative of the government of Kosovo
      -
      -
      -
      -
      -
      -
      -
    • -
    -
    -
    -
    -
    - - - -
    - - -
    - -
    - -
    - - - - - - - + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/backend/tests/scrapers/data/votes/oeil-procedure-file_2022-2852-rsp.html b/backend/tests/scrapers/data/votes/oeil-procedure-file_2022-2852-rsp.html index 315597140..d91a95771 100644 --- a/backend/tests/scrapers/data/votes/oeil-procedure-file_2022-2852-rsp.html +++ b/backend/tests/scrapers/data/votes/oeil-procedure-file_2022-2852-rsp.html @@ -1,1973 +1,1466 @@ - - - - - Procedure File: 2022/2852(RSP) | Legislative Observatory | European Parliament - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    - - - -
    - - +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - + - - - - + + +
    + + +
    + + + +
    + + + + + +