Skip to content

Commit

Permalink
okfn-brasil#1207 Cria spider para Rio das Ostras/RJ
Browse files Browse the repository at this point in the history
  • Loading branch information
AllexLima10 committed Sep 2, 2024
1 parent d588f77 commit 5394171
Showing 1 changed file with 49 additions and 0 deletions.
49 changes: 49 additions & 0 deletions data_collection/gazette/spiders/rj/rj_rio_das_ostras.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import re
from datetime import date, datetime as dt

import scrapy

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class RjRioDasOstrasSpider(BaseGazetteSpider):
    """Spider collecting the official gazettes of Rio das Ostras/RJ.

    One request is issued per calendar year between ``start_date`` and
    ``end_date``; every record of the JSON response that falls inside that
    date window is emitted as a ``Gazette`` item.
    """

    name = "rj_rio_das_ostras"
    TERRITORY_ID = "3304524"
    allowed_domains = ["riodasostras.rj.gov.br"]
    start_date = date(2001, 7, 13)

    def start_requests(self):
        """Yield one search request for each year in the configured range."""
        first_year = self.start_date.year
        last_year = self.end_date.year
        for year in range(first_year, last_year + 1):
            yield scrapy.Request(
                f"https://appro.riodasostras.rj.gov.br/riodasostrasapp_server/api/jornais/search/site?&limit=600&offset=0&orderBy=data&orderDir=desc&ano={year}"
            )

    def parse(self, response):
        """Turn the JSON records of one yearly response into ``Gazette`` items.

        Records dated after ``end_date`` are skipped; the first record dated
        before ``start_date`` ends processing of the whole response.
        """
        for entry in response.json():
            # Only the first 10 characters ("YYYY-MM-DD") of the field are parsed.
            published_on = dt.strptime(entry["data"][:10], "%Y-%m-%d").date()

            if published_on > self.end_date:
                continue

            if published_on < self.start_date:
                # The request asks for orderBy=data&orderDir=desc, so
                # presumably every remaining record is older as well.
                return

            edition_label = entry["edicao"]

            number_match = re.search(r"Edição.*?(\d+)", edition_label)
            if number_match is None:
                edition_number = ""
            else:
                edition_number = number_match.group(1)

            # The edition is flagged as extra when its label contains any of
            # these fragments, regardless of case.
            extra_match = re.search(
                r"anex|encar|loa|ppa|ldo|conc",
                edition_label,
                re.IGNORECASE,
            )
            is_extra = extra_match is not None

            yield Gazette(
                date=published_on,
                edition_number=edition_number,
                is_extra_edition=is_extra,
                file_urls=[entry["link"]],
                power="executive_legislative",
            )

0 comments on commit 5394171

Please sign in to comment.