-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscraper.py
98 lines (84 loc) · 3.37 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# morph.io (https://morph.io) scraper: submits a last-name search to the OSCN
# docket search form and saves each result row to the ScraperWiki SQLite datastore.
import scraperwiki
import lxml.html
import mechanize
def scrape_table(root):
    """Save one datastore record per result row of the case table.

    Selects every ``<tr>`` inside ``table.caseCourtTable`` under *root*,
    reads the first four ``<td>`` cells of each row and stores them with
    ``scraperwiki.sqlite.save``, using ``"Case Number"`` as the unique key.

    Args:
        root: parsed document (or element) exposing ``cssselect``, e.g. the
            result of ``lxml.html.fromstring``.

    Returns:
        None. Each stored record is also printed.

    Rows with fewer than four ``<td>`` cells are skipped. When the first
    cell holds no ``<a>`` element, ``"URL"`` is stored as ``None``.
    """
    for row in root.cssselect("table.caseCourtTable tr"):
        table_cells = row.cssselect("td")
        # Rows with no <td> at all (such as rows holding only <th> cells) and
        # rows with fewer than the four columns read below cannot be mapped
        # onto a record; skip them rather than fail with IndexError.
        if len(table_cells) < 4:
            continue

        # The case-number cell is expected to wrap its text in a link.
        links = table_cells[0].cssselect("a")

        # A fresh dict per row, so no value can carry over from an earlier row.
        record = {
            'Case Number': table_cells[0].text_content(),
            'Date Filed': table_cells[1].text_content(),
            'Caption': table_cells[2].text_content(),
            'Found Party': table_cells[3].text_content(),
            # href of the first <a> in the case-number cell, if there is one
            'URL': links[0].attrib.get('href') if links else None,
        }

        # Single-argument print: same output under Python 2 and Python 3.
        print("%s %s" % (record, '------------'))
        # 'Case Number' is the unique key: saving the same case again
        # overwrites the earlier row instead of duplicating it.
        scraperwiki.sqlite.save(["Case Number"], record)
# --- Script body: query the OSCN docket search and store the result rows ---

# Browser set-up.
br = mechanize.Browser()
br.set_handle_robots(False)   # ignore robots
br.set_handle_refresh(False)  # can sometimes hang without this
br.addheaders = [('User-agent', 'Firefox')]
br.open("http://www.oscn.net/dockets/Search.aspx")

# List the page's forms once: the same list feeds the debug dump below and
# the form selection afterwards.
forms = list(br.forms())
for form in forms:
    # Single-argument print: same output under Python 2 and Python 3.
    print("Form name: %s" % (form.name,))
    print(form)

if not forms:
    # Fail with a readable message instead of a bare IndexError.
    raise RuntimeError("no forms found on the OSCN search page")

# The search form is taken to be the first form on the page. (An earlier,
# abandoned attempt looked the form up by its "search-form" class.)
br.form = forms[0]

# Search criteria. NOTE(review): 'db' presumably selects the court database
# and 'dcct' a case-type code -- confirm against the form's option lists.
# The form also appears to offer 'mname' and 'fname' fields, unused here.
br['db'] = ['garfield']
br['dcct'] = ['32']
br['lname'] = 'JONES'

print(br)
response = br.submit()
html = response.read()
print(html)

# Parse the results page and save every case row it contains.
root = lxml.html.fromstring(html)
scrape_table(root)
# # Read in a page
# html = scraperwiki.scrape("http://foo.com")
#
# # Find something on the page using css selectors
# root = lxml.html.fromstring(html)
# root.cssselect("div[align='left']")
#
# # Write out to the sqlite database using scraperwiki library
# scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"})
#
# # An arbitrary query against the database
# scraperwiki.sql.select("* from data where name='peter'")
# You don't have to do things with the ScraperWiki and lxml libraries.
# You can use whatever libraries you want: https://morph.io/documentation/python
# All that matters is that your final data is written to an SQLite database
# called "data.sqlite" in the current working directory which has at least a table
# called "data".