
Add Uberaba/MG spider (#132)
* Add mg_uberaba spider

* Update CITIES.md

* Fix domain and date extraction
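
To exercise the new spider locally, a minimal sketch along these lines should work (it assumes the Scrapy project settings are importable; the usual route is running "scrapy crawl mg_uberaba" from the Scrapy project directory):

from scrapy.crawler import CrawlerProcess

from gazette.spiders.mg_uberaba import MgUberaba

# Project settings are omitted here for brevity; a real run would load
# them via scrapy.utils.project.get_project_settings().
process = CrawlerProcess()
process.crawl(MgUberaba)
process.start()  # blocks until the crawl finishes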
giovanisleite authored and Irio committed Oct 29, 2019
1 parent 928212e commit 9c1dfa0
Showing 2 changed files with 64 additions and 1 deletion.
CITIES.md: 2 changes (1 addition, 1 deletion)
@@ -90,7 +90,7 @@ The municipality id (IBGE code) can be found on [Wikipedia](https://pt.wikipedia
| 80 | Boa Vista | :soon: | | | [PR](https://github.com/okfn-brasil/diario-oficial/pull/101) |
| 81 | Ribeirão das Neves | | | | |
| 82 | Paulista | | | | |
-| 83 | Uberaba | | | | |
+| 83 | Uberaba | :white_check_mark: | | | [PR](https://github.com/okfn-brasil/diario-oficial/pull/132) |
| 84 | Cascavel | :white_check_mark: | | | [PR](https://github.com/okfn-brasil/diario-oficial/pull/92) |
| 85 | Guarujá | :white_check_mark: | | | [PR](https://github.com/okfn-brasil/diario-oficial/pull/96) |
| 86 | Praia Grande | | | | |
processing/data_collection/gazette/spiders/mg_uberaba.py: 63 additions (new file)
@@ -0,0 +1,63 @@
import re
import datetime as dt

from dateparser import parse
from scrapy.http import FormRequest

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class MgUberaba(BaseGazetteSpider):
    TERRITORY_ID = "3170107"
    name = "mg_uberaba"
    allowed_domains = ["uberaba.mg.gov.br"]

    LIST_GAZETTES_URL = "http://www.uberaba.mg.gov.br/portal/listImagesHtml"
    DOWNLOAD_URL_TEMPLATE = (
        "http://www.uberaba.mg.gov.br:8080/portal/acervo/portavoz/arquivos/{}/{}"
    )

    def start_requests(self):
        # Gazettes are archived in per-year folders; request one listing
        # per year, from 2015 through the current year.
        next_year = dt.datetime.today().year + 1

        for year in range(2015, next_year):
            yield FormRequest(
                url=self.LIST_GAZETTES_URL,
                method="POST",
                formdata={
                    "desc": "1",
                    "type": "1",
                    "folder": f"portavoz/arquivos/{year}",
                    "limit": "5000",
                    "page": "1",
                    "types": "pdf",
                    "listAll": "1",
                },
                meta={"year": year},
            )

    def parse(self, response):
        # The listing renders one gallery box per PDF; the box text is
        # the gazette's filename.
        filenames = [
            filename.strip()
            for filename in response.xpath(
                '//div[@class="claGaleriaBoxFileTable"]/text()'
            ).extract()
        ]
        for filename in filenames:
            date = self.extract_date(filename)
            yield Gazette(
                date=date,
                file_urls=[self.mount_url(filename, date.year)],
                is_extra_edition=False,
                territory_id=self.TERRITORY_ID,
                power="executive_legislature",
                scraped_at=dt.datetime.utcnow(),
            )

    def extract_date(self, filename):
        # Filenames embed the edition date as dd-mm-yyyy.
        date_str = re.search(r"(\d{2}-\d{2}-\d{4})", filename).group(1)
        return parse(date_str, languages=["pt"]).date()

    def mount_url(self, filename, year):
        return self.DOWNLOAD_URL_TEMPLATE.format(year, filename)
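
For reference, here is how the date extraction and URL mounting behave on a hypothetical filename (real names come from the portal listing, so the exact pattern shown is an assumption):

import re
from dateparser import parse

filename = "portavoz_14-06-2019.pdf"  # hypothetical example filename

# The spider pulls the first dd-mm-yyyy group out of the filename...
date_str = re.search(r"(\d{2}-\d{2}-\d{4})", filename).group(1)  # "14-06-2019"

# ...and dateparser reads it day-first for Portuguese.
date = parse(date_str, languages=["pt"]).date()  # datetime.date(2019, 6, 14)

# The download URL is then mounted from the year folder and the filename.
url = "http://www.uberaba.mg.gov.br:8080/portal/acervo/portavoz/arquivos/{}/{}".format(
    date.year, filename
)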
