Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Curitiba #42

Merged
merged 18 commits into from
Jul 10, 2018
Merged
Changes from 1 commit
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Create a gazette using the ID that returns from the server
antoniovendramin committed Jun 6, 2018
commit a7cee8d1ff71a672da5b25ddd5742d424ee751ce
52 changes: 29 additions & 23 deletions processing/data_collection/gazette/spiders/pr_curitiba.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from dateparser import parse
import datetime as dt
import re

import scrapy


from gazette.items import Gazette

class PrCuritibaSpider(scrapy.Spider):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@@ -73,10 +75,21 @@ def parse_page(self, response):
id = ids[i]
parsed_date = parse(f'{pdf_date}', languages=['pt']).date()
if id == '0':
print("Nao suplemento")
print("Number is {0} date is {1} is is {2}".format(number, parsed_date, id))
self.scrap_not_extra_edition(response, i)
gazettes.append(self.scrap_not_extra_edition(response, i))
yield scrapy.FormRequest.from_response(response,
headers = {
'user-agent': 'Mozilla/5.0',
},
formdata = {
'__LASTFOCUS': '',
'__EVENTTARGET': 'ctl00$cphMasterPrincipal$gdvGrid2$ctl{num:02d}$lnkVisualizar'.format(num=(i+3)),
'__EVENTARGUMENT': '',
'__ASYNCPOST': 'true'
},
callback=self.scrap_not_extra_edition,
meta={"gazettes": gazettes, "parsed_date": parsed_date}
)

#gazettes.append(yield self.scrap_not_extra_edition(response, i))
else:
gazettes.append(Gazette(
date = parsed_date,
@@ -86,25 +99,18 @@ def parse_page(self, response):
power='executive_legislature',
scraped_at=dt.datetime.utcnow()
))


return []#gazettes

def scrap_not_extra_edition(self, response, index):
#window.open('DiarioConsultaExterna_Download.aspx?Id=2508'
print('ctl00$cphMasterPrincipal$gdvGrid2$ctl{num:02d}$lnkVisualizar'.format(num=(index+3)))
print("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA ==== {num:02d}".format(num=(index+3)))
yield scrapy.FormRequest.from_response(
response,
formdata={
'__EVENTTARGET' : 'ctl00$cphMasterPrincipal$gdvGrid2$ctl{num:02d}$lnkVisualizar'.format(num=(index+3)),
'ctl00$smrAjax' : 'ctl00$cphMasterPrincipal$upPesquisaExternaDO|ctl{num:02d}$cphMasterPrincipal$gdvGrid2$ctl05$lnkVisualizar'.format(num=(index+3)),
'__ASYNCPOST': 'true',
'__VIEWSTATEGENERATOR': 'B3FCDD96'
},
callback=self.parse_gazette_popup,
)

def parse_gazette_popup(self, response):
print("SDFFFFFFFFFFFFFFAFDEHJAERDJREJAJHGEARJ")
print(response.text)
def scrap_not_extra_edition(self, response):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`parse_regular_edition` would be a better name for this method (and the verb is "scrape", not "scrap", anyway).

gazettes = response.meta['gazettes']
parsed_date = response.meta['parsed_date']
id = re.findall(r'Id=(\d+)', response.text )
gazettes.append(Gazette(
date = parsed_date,
file_urls=["http://legisladocexterno.curitiba.pr.gov.br/DiarioConsultaExterna_Download.aspx?id={}".format(id)],
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`id` is a built-in function in Python (https://docs.python.org/3.6/library/functions.html#id), so don't shadow it with a variable name. Use a more descriptive name such as `pdf_id` to avoid problems.

is_extra_edition= False,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I noticed that we have gazettes like `92 Supl 1` and `92`, but some days don't have the "Supl" version.
Shouldn't the "Supl" version be considered an extra edition? @Irio @cuducos any idea?

municipality_id=self.MUNICIPALITY_ID,
power='executive_legislature',
scraped_at=dt.datetime.utcnow()
))