Add Uberaba/MG spider (#132)

* Add mg_uberaba spider * Update CITIES.md * Fix domain and date extraction
okfn-brasil · Oct 29, 2019 · 9c1dfa0 · 9c1dfa0
1 parent 928212e
commit 9c1dfa0
Showing 2 changed files with 64 additions and 1 deletion.
diff --git a/CITIES.md b/CITIES.md
@@ -90,7 +90,7 @@ The municipality id (IBGE code) can be found on [Wikipedia](https://pt.wikipedia
 | 80 | Boa Vista | :soon: | | | [PR](https://github.com/okfn-brasil/diario-oficial/pull/101) |
 | 81 | Ribeirão das Neves | | | | |
 | 82 | Paulista | | | | |
-| 83 | Uberaba | | | | |
+| 83 | Uberaba | :white_check_mark: | | | [PR](https://github.com/okfn-brasil/diario-oficial/pull/132) |
 | 84 | Cascavel | :white_check_mark: | | | [PR](https://github.com/okfn-brasil/diario-oficial/pull/92) |
 | 85 | Guarujá | :white_check_mark: | | | [PR](https://github.com/okfn-brasil/diario-oficial/pull/96) |
 | 86 | Praia Grande | | | | |

diff --git a/processing/data_collection/gazette/spiders/mg_uberaba.py b/processing/data_collection/gazette/spiders/mg_uberaba.py
@@ -0,0 +1,63 @@
+import re
+import datetime as dt
+
+from dateparser import parse
+from scrapy.http import FormRequest
+
+from gazette.items import Gazette
+from gazette.spiders.base import BaseGazetteSpider
+
+
+class MgUberaba(BaseGazetteSpider):
+    TERRITORY_ID = "3170107"
+    name = "mg_uberaba"
+    allowed_domains = ["uberaba.mg.gov.br"]
+
+    LIST_GAZETTES_URL = "http://www.uberaba.mg.gov.br/portal/listImagesHtml"
+    DOWNLOAD_URL_TEMPLATE = (
+        "http://www.uberaba.mg.gov.br:8080/portal/acervo/portavoz/arquivos/{}/{}"
+    )
+
+    def start_requests(self):
+        next_year = dt.datetime.today().year + 1
+
+        for year in range(2015, next_year):
+            yield FormRequest(
+                url=self.LIST_GAZETTES_URL,
+                method="POST",
+                formdata={
+                    "desc": "1",
+                    "type": "1",
+                    "folder": f"portavoz/arquivos/{year}",
+                    "limit": "5000",
+                    "page": "1",
+                    "types": "pdf",
+                    "listAll": "1",
+                },
+                meta={"year": year},
+            )
+
+    def parse(self, response):
+        filenames = [
+            filename.strip()
+            for filename in response.xpath(
+                '//div[@class="claGaleriaBoxFileTable"]/text()'
+            ).extract()
+        ]
+        for filename in filenames:
+            date = self.extract_date(filename)
+            yield Gazette(
+                date=date,
+                file_urls=[self.mount_url(filename, date.year)],
+                is_extra_edition=False,
+                territory_id=self.TERRITORY_ID,
+                power="executive_legislature",
+                scraped_at=dt.datetime.utcnow(),
+            )
+
+    def extract_date(self, filename):
+        date_str = re.search(r"(\d{2}-\d{2}-\d{4})", filename).group(1)
+        return parse(date_str, languages=["pt"]).date()
+
+    def mount_url(self, filename, year):
+        return self.DOWNLOAD_URL_TEMPLATE.format(year, filename)