Skip to content

Commit

Permalink
Run scrapers against live data sources (#1059)
Browse files Browse the repository at this point in the history
  • Loading branch information
tillprochaska authored Dec 15, 2024
2 parents 90c72bb + 4d27c62 commit 1ef6c24
Show file tree
Hide file tree
Showing 17 changed files with 6,164 additions and 6,177 deletions.
47 changes: 47 additions & 0 deletions .github/workflows/scheduled.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Runs the backend test suite against the live data sources (request mocks
# disabled) so that changes to the scraped websites are noticed early.
name: Scheduled scraper tests

on:
  # Allow triggering the workflow manually in addition to the schedule
  workflow_dispatch: {}
  schedule:
    - cron: "0 0 * * 2" # every Tuesday at 00:00 (UTC)

# The job only needs to check out the repository, so restrict the
# GITHUB_TOKEN to read-only access instead of relying on repository defaults.
permissions:
  contents: read

jobs:
  build:
    runs-on: ubuntu-latest

    defaults:
      run:
        # All `run` steps are executed from the backend package
        working-directory: ./backend

    services:
      meilisearch:
        image: "getmeili/meilisearch:v1.3.1"
        ports: ["7700:7700"]
        env:
          MEILI_MASTER_KEY: "1234567890"

    steps:
      - name: Checkout repo
        uses: actions/checkout@v4

      # Poetry is installed before Python is set up so that the `poetry`
      # cache option of `actions/setup-python` can find it.
      - name: Install poetry
        run: pipx install poetry

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"
          cache: "poetry"
          cache-dependency-path: "./backend/poetry.lock"

      - name: Install dependencies
        run: poetry install

      - name: Run tests against live data sources
        run: make test
        env:
          # Disable request mocks so tests hit the real websites
          HTV_TEST_MOCK_REQUESTS: "false"
          HTV_BACKEND_DATABASE_URI: "sqlite:///${{ github.workspace }}/storage/database/database.sqlite3"
          HTV_BACKEND_USERS_DATABASE_URI: "sqlite:///${{ github.workspace }}/storage/database/users.sqlite3"
          # Same key and port as configured for the meilisearch service above
          MEILI_MASTER_KEY: "1234567890"
          MEILI_URL: "http://localhost:7700"
21 changes: 9 additions & 12 deletions backend/howtheyvote/scrapers/votes.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,10 +380,7 @@ def _procedure_reference(self, doc: BeautifulSoup) -> str | None:

class ProcedureScraper(BeautifulSoupScraper):
BS_PARSER = "lxml"
BASE_URL = (
"https://oeil.secure.europarl.europa.eu/"
"oeil/popups/ficheprocedure.do?lang=en&reference="
)
BASE_URL = "https://oeil.secure.europarl.europa.eu/oeil/en/procedure-file?reference="

TITLE_PREFIXES = ["Resolution on", "Motion"]

Expand Down Expand Up @@ -437,23 +434,23 @@ def _title(self, doc: BeautifulSoup) -> str | None:
return normalized_title[:1].upper() + normalized_title[1:]

def _geo_areas(self, doc: BeautifulSoup) -> list[str]:
start = doc.select_one(
'#basic-information-data strong:-soup-contains("Geographical area")'
# The website unfortunately doesn't use semantic markup, so we have
# to rely on visual properties
wrapper = doc.select_one(
'#section1 p.font-weight-bold:-soup-contains("Geographical area") + p'
)

if not start:
if not wrapper:
return []

geo_areas = []

for sibling in start.next_siblings:
if isinstance(sibling, Tag) and sibling.name == "strong":
break
for node in wrapper.children:
country_name = node.get_text(strip=True)

if not sibling.get_text(strip=True):
if not country_name:
continue

country_name = sibling.get_text(strip=True)
country = Country.from_label(country_name, fuzzy=True)

if not country:
Expand Down
3 changes: 3 additions & 0 deletions backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ env = [
"HTV_BACKEND_PUBLIC_URL=https://example.org/api",
"HTV_SEARCH_INDEX_PREFIX=test",
]
markers = [
"always_mock_requests: Always mock HTTP requests, even when request mocks are disabled globally"
]
addopts = [
"--import-mode=importlib",
]
39 changes: 35 additions & 4 deletions backend/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@
executed and rollback any changes after execution. In order to test API routes, we make use
of Flask’s built-in test client."""

import os

import pytest
import responses as responses_lib
from responses import FirstMatchRegistry, RequestsMock

from howtheyvote.db import Session, engine, migrate, session_factory
from howtheyvote.meili import configure_indexes, delete_indexes
Expand Down Expand Up @@ -55,8 +57,37 @@ def api(app):
yield app.test_client()


class DummyRegistry(FirstMatchRegistry):
    """A registry that ignores any mock responses that are added.

    Used to disable request mocking: since `add` never actually registers
    the mock it is given, mocks set up by a test have no effect.
    """

    def add(self, response):
        # Hand the response back to the caller without storing it.
        return response


@pytest.fixture
def responses(request):
    """Allows mocking HTTP requests made with `requests`.

    Request mocking can be disabled globally using the `HTV_TEST_MOCK_REQUESTS=false`
    env variable to run tests against the live sources rather than test fixtures.
    Individual tests can be marked using `always_mock_requests` to mock them even if
    request mocks are disabled globally. However, it’s preferred to write tests that
    can be run against the live data sources.

    Yields the active `RequestsMock` instance that tests use to register mocks.
    """
    # Mocking stays enabled unless the env variable is set to something other
    # than "true" or "1" (case-insensitive).
    mock_requests = os.environ.get("HTV_TEST_MOCK_REQUESTS", "true").lower() in ["true", "1"]
    always_mock_requests = request.node.get_closest_marker("always_mock_requests") is not None

    if always_mock_requests or mock_requests:
        # Yield a "normal" requests mock that fails any request that isn’t explicitly mocked.
        with RequestsMock() as r:
            yield r
        return

    # When calling `responses.get("http://...", body="Lorem ipsum")` in a test to register
    # a mock response, the mock is stored in a registry. When the tested code then tries to
    # send a matching request, `responses` tries to find a matching mock in the registry. To
    # disable all mocks, we simply pass a dummy registry that never actually registers any
    # mocks and allow all unmatched requests to pass to the original source.
    with RequestsMock(registry=DummyRegistry) as r:
        # Passthru entries are matched as URL prefixes, so "http" is meant to cover
        # both `http://` and `https://` URLs.
        r.add_passthru("http")
        yield r
1 change: 1 addition & 0 deletions backend/tests/pipelines/test_rcv_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from howtheyvote.pipelines import DataUnavailableError, RCVListPipeline


@pytest.mark.always_mock_requests
def test_run_source_not_available(responses, db_session):
with pytest.raises(DataUnavailableError):
pipe = RCVListPipeline(term=9, date=datetime.date(2024, 4, 10))
Expand Down
Loading

0 comments on commit 1ef6c24

Please sign in to comment.