okfn-brasil · cuducos · Jul 10, 2018 · Apr 29, 2018 · Apr 30, 2018 · May 2, 2018
diff --git a/CITIES.md b/CITIES.md
@@ -15,7 +15,7 @@ The municipality id (IBGE code) can be found on [Wikipedia](https://pt.wikipedia
 | 5 | Fortaleza | :soon: | | | [PR](https://github.com/okfn-brasil/diario-oficial/pull/52) |
 | 6 | Belo Horizonte | :soon: | | | [PR](https://github.com/okfn-brasil/diario-oficial/pull/33) |
 | 7 | Manaus | :soon:  | | | [PR](https://github.com/okfn-brasil/diario-oficial/pull/51) |
-| 8 | Curitiba | :soon: | | | [PR](https://github.com/okfn-brasil/diario-oficial/pull/42) |
+| 8 | Curitiba | :white_check_mark: | | | [PR](https://github.com/okfn-brasil/diario-oficial/pull/42) |
 | 9 | Recife | :soon: | | | |
 | 10 | Porto Alegre | :white_check_mark: | :white_check_mark: | | |
 | 11 | Goiânia | :white_check_mark: | :white_check_mark: | | [PR](https://github.com/okfn-brasil/diario-oficial/pull/6) |

diff --git a/processing/data_collection/gazette/spiders/pr_curitiba.py b/processing/data_collection/gazette/spiders/pr_curitiba.py
@@ -0,0 +1,117 @@
+from dateparser import parse
+import datetime as dt
+import re
+
+from gazette.spiders.base import BaseGazetteSpider
+
+import scrapy
+
+from gazette.items import Gazette
+
+
+class PrCuritibaSpider(BaseGazetteSpider):
+    """Spider that collects gazettes from the Curitiba (PR) search site."""
+
+    TERRITORY_ID = '4106902'
+    name = 'pr_curitiba'
+    allowed_domains = ['legisladocexterno.curitiba.pr.gov.br']
+    custom_settings = {'DEFAULT_REQUEST_HEADERS': {'user-agent': 'Mozilla/5.0'}}
+
+    def start_requests(self):
+        """
+        The Curitiba website is a stateful page, so we can't just build the
+        request from zero, we have to resend the viewstate with every request.
+        @url http://legisladocexterno.curitiba.pr.gov.br/DiarioConsultaExterna_Pesquisa.aspx
+        @returns requests 1
+        """
+        # One search request per year, from the current year back to 2007.
+        for year in range(dt.date.today().year, 2006, -1):
+            yield scrapy.FormRequest(
+                'http://legisladocexterno.curitiba.pr.gov.br/DiarioConsultaExterna_Pesquisa.aspx',
+                formdata={'ctl00$cphMasterPrincipal$ddlGrAno': str(year)},
+                callback=self.parse_year,
+            )
+
+    def parse_year(self, response):
+        """Request each of the twelve month tabs (indexes 0 to 11) of a year page."""
+        for month in range(12):
+            yield self.scrape_month(response, month)
+
+    @staticmethod
+    def _month_tab_formdata(month):
+        """Return the postback fields that activate the tab with index ``month``."""
+        return {
+            '__EVENTTARGET': 'ctl00$cphMasterPrincipal$TabContainer1',
+            '__EVENTARGUMENT': 'activeTabChanged:{}'.format(month),
+            # Client-state field of the same TabContainer1 control targeted above.
+            'ctl00_cphMasterPrincipal_TabContainer1_ClientState': '{{"ActiveTabIndex":{},"TabState":[true,true,true,true,true,true,true,true,true,true,true,true]}}'.format(month),
+        }
+
+    def scrape_month(self, response, month):
+        """Build the request that switches the page in ``response`` to tab ``month``."""
+        return scrapy.FormRequest.from_response(
+            response,
+            formdata=self._month_tab_formdata(month),
+            meta={"month": month},
+            callback=self.parse_month,
+        )
+
+    def parse_month(self, response):
+        """Yield one ``parse_page`` request for every result page of the month."""
+        page_count = len(response.css(".grid_Pager:nth-child(1) table td").extract())
+        # The first page of pagination cannot be accessed by page number
+        yield scrapy.FormRequest.from_response(
+            response,
+            formdata=self._month_tab_formdata(response.meta["month"]),
+            callback=self.parse_page,
+        )
+        for page_number in range(2, page_count + 1):
+            yield scrapy.FormRequest.from_response(
+                response,
+                formdata={'__EVENTTARGET': 'ctl00$cphMasterPrincipal$gdvGrid2', '__EVENTARGUMENT': 'Page${}'.format(page_number)},
+                callback=self.parse_page,
+            )
+
+    def parse_page(self, response):
+        """Yield a Gazette per extra edition row and a lookup request per other row."""
+        starting_offset = 3  # row ``idx`` posts back as grid control ctl{idx + 3:02d}
+        for idx, row in enumerate(response.css(".grid_Row")):
+            pdf_date = row.css("td:nth-child(2) span ::text").extract_first()
+            gazette_id = row.css("td:nth-child(3) a ::attr(data-teste)").extract_first()
+            parsed = parse(pdf_date, languages=['pt']) if pdf_date else None
+            if parsed is None:
+                # parse() gave no date for this row; skip it instead of failing on .date()
+                self.logger.warning('Skipping row %d: unparseable date %r', idx, pdf_date)
+                continue
+            parsed_date = parsed.date()
+            if gazette_id == '0':
+                yield scrapy.FormRequest.from_response(
+                    response,
+                    formdata={
+                        '__EVENTTARGET': 'ctl00$cphMasterPrincipal$gdvGrid2$ctl{num:02d}$lnkVisualizar'.format(num=idx + starting_offset),
+                        '__LASTFOCUS': '', '__EVENTARGUMENT': '', '__ASYNCPOST': 'true',
+                    },
+                    callback=self.scrap_not_extra_edition,
+                    meta={"parsed_date": parsed_date},
+                )
+            else:
+                yield Gazette(
+                    date=parsed_date,
+                    file_urls=["http://legisladocexterno.curitiba.pr.gov.br/DiarioSuplementoConsultaExterna_Download.aspx?id={}".format(gazette_id)],
+                    is_extra_edition=True,
+                    territory_id=self.TERRITORY_ID,
+                    power='executive_legislature',
+                    scraped_at=dt.datetime.utcnow(),
+                )
+
+    def scrap_not_extra_edition(self, response):
+        """Build the non-extra edition Gazette from the ``Id=<digits>`` found in ``response``."""
+        gazette_id = response.selector.re_first(r'Id=(\d+)')
+        return Gazette(
+            date=response.meta['parsed_date'],
+            file_urls=["http://legisladocexterno.curitiba.pr.gov.br/DiarioConsultaExterna_Download.aspx?id={}".format(gazette_id)],
+            is_extra_edition=False,
+            territory_id=self.TERRITORY_ID,
+            power='executive_legislature',
+            scraped_at=dt.datetime.utcnow(),
+        )