ralcham.py (forked from jctanner/scrapers)
#!/usr/bin/env python

import sys
import requests
import requests_cache
from bs4 import BeautifulSoup
from pprint import pprint

from lib.csvtools import dict_to_csv


def main():
    # cache HTTP responses locally so repeated runs don't re-fetch every page
    requests_cache.install_cache('scraper_cache')

    domain = "http://web.raleighchamber.org"
    caturls = []
    compurls = []
    comps = dict()

    r = requests.get(domain + "/Industrial-Manufacturing")
    soup = BeautifulSoup(r.text, 'html.parser')

    ## get all category urls from the top level site
    for li in soup.findAll('li', {'class': 'ListingCategories_AllCategories_CATEGORY'}):
        link = li.findAll('a')
        caturls.append(link[0].get('href'))

    ## get all company urls from the category urls
    for caturl in caturls:
        r = requests.get(domain + caturl)
        soup = BeautifulSoup(r.text, 'html.parser')

        # some category urls redirect straight to a company page; if this is
        # a company page, record it as a company url and move to the next category
        if soup.find('span', {'class': 'ListingDetails_Level5_MAINCONTACT'}):
            compurls.append(caturl)
            continue

        for div in soup.findAll('div', {'class': 'ListingResults_All_ENTRYTITLELEFTBOX'}):
            link = div.findAll('a')
            compurls.append(link[0].get('href'))

    #import pdb; pdb.set_trace()
    #TEMPcompurls = ["/TelecommunicationsEquipment-Service/Link-US,-LLC-20451", "/Asphalt-Crushing/Old-School-Crushing-Company-21400"]

    # output field -> (tag, attribute, attribute value) used to locate each
    # company attribute on the detail page
    itemkeys = {'street': ('span', 'itemprop', 'street-address'),
                'city': ('span', 'itemprop', 'locality'),
                'state': ('span', 'itemprop', 'region'),
                'zipcode': ('span', 'itemprop', 'postal-code'),
                'contact': ('span', 'class', 'ListingDetails_Level5_MAINCONTACT')}

    ## visit all the company urls and get company attributes
    for idx, compurl in enumerate(compurls):
        print(idx, len(compurls), compurl)
        r = requests.get(domain + compurl)
        soup = BeautifulSoup(r.text, 'html.parser')

        name = soup.title.string.strip()
        comps[name] = {}
        comps[name]['name'] = name

        for k, v in itemkeys.items():
            try:
                comps[name][k] = soup.find(v[0], {v[1]: v[2]}).text.strip()
            except Exception as e:
                # field not present on this page; leave it blank
                comps[name][k] = ""

    dict_to_csv(comps, 'ralcham.csv')


if __name__ == "__main__":
    main()
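
Note: dict_to_csv is imported from the repo's local lib/csvtools module, which is not shown in this file. A minimal sketch of what such a helper might look like, assuming it takes a dict of row-dicts keyed by company name and an output filename; the body below is a guess for illustration, not the repo's actual implementation:

    import csv

    def dict_to_csv(data, filename):
        # hypothetical helper: collect the union of all row keys so every
        # column is represented, then write one CSV row per company
        fieldnames = sorted({key for row in data.values() for key in row})
        with open(filename, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            for row in data.values():
                writer.writerow(row)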