Skip to content

Commit

Permalink
Finished scraping Ponta Grossa
Browse files Browse the repository at this point in the history
  • Loading branch information
antoniovendramin committed May 19, 2018
1 parent 4f0c4f2 commit 33a78d1
Showing 1 changed file with 21 additions and 15 deletions.
36 changes: 21 additions & 15 deletions processing/data_collection/gazette/spiders/pr_ponta_grossa.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from gazette.items import Gazette


class PrPontaGrossaSpider(scrapy.Spider):
MUNICIPALITY_ID = '4119905'
name = 'pr_ponta_grossa'
Expand All @@ -16,19 +17,17 @@ class PrPontaGrossaSpider(scrapy.Spider):
def parse(self, response):
    """Entry point: delegate to scrape_page, which handles pagination.

    @url http://www.pontagrossa.pr.gov.br/diario-oficial/
    @returns requests 1
    """
    # scrape_page is a generator (yields Gazette items and follow-up
    # Requests); returning it hands the iterable straight to Scrapy.
    return self.scrape_page(response)

def scrape_page(self, response):
    """Scrape one listing page of Ponta Grossa's official gazette.

    Yields one Gazette item per PDF link found on the page and, while the
    oldest year seen on the page is still >= self.ano_minimo, a Request
    for the next listing page (handled by this same method).
    """
    pdf_links = response.css(".view-content .field a")

    pdf_infos = []
    for pdf_link in pdf_links:
        pdf_file_name = pdf_link.css("::attr(href)").extract_first()
        pdf_link_text = pdf_link.css("::text").extract_first()
        # Placeholder files for days without official acts ("sem atos")
        # carry no gazette content; skip them.
        if "sem_atos" in pdf_file_name:
            continue
        # File names embed the publication date: .../diario-oficial/_YYYY-MM-DD*.pdf
        pdf_link_info = re.search('.*/diario-oficial/_?(\d{4})-(\d{2})-(\d{2}).*.pdf', pdf_file_name)
        # NOTE(review): the lines between the search and `mes = ...` are in a
        # collapsed diff region; presumably they skip non-matching links and
        # bind ano = pdf_link_info.group(1) -- confirm against the full file.
        if not pdf_link_info:
            continue
        ano = pdf_link_info.group(1)
        mes = pdf_link_info.group(2)
        dia = pdf_link_info.group(3)
        # "complementar" in the link text marks a supplementary (extra) edition.
        is_extra = "complementar" in pdf_link_text
        pdf_infos.append({"ano": ano, "mes": mes, "dia": dia,
                          "url": pdf_file_name, "is_extra_edition": is_extra})

    if pdf_infos:
        # Paginate only while the oldest year on this page is still recent
        # enough to be of interest (>= self.ano_minimo).
        menor_ano_da_pagina = min(map(lambda p: p["ano"], pdf_infos))
        if menor_ano_da_pagina >= self.ano_minimo:
            # Pager hrefs are site-relative; prepend the host.
            next_page_url = "{0}{1}".format("http://www.pontagrossa.pr.gov.br",
                                            response.css(".pager-next a::attr(href)").extract_first())
            yield scrapy.Request(next_page_url, self.scrape_page)
        for pdf_info in pdf_infos:
            date = "{0}-{1}-{2}".format(pdf_info["ano"], pdf_info["mes"], pdf_info["dia"])
            yield Gazette(
                date=date,
                file_urls=[pdf_info["url"]],
                is_extra_edition=pdf_info["is_extra_edition"],
                municipality_id=self.MUNICIPALITY_ID,
                power="executive_legislature",
                scraped_at=dt.datetime.utcnow()
            )

0 comments on commit 33a78d1

Please sign in to comment.