-
Notifications
You must be signed in to change notification settings - Fork 12
/
crawler.py
64 lines (51 loc) · 1.67 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import requests
import pandas as pd
from bs4 import BeautifulSoup
import lxml
import csv
import sys
import threading
class myThread(threading.Thread):
    """Worker thread that crawls dividend data for one ticker-list file.

    Each thread reads the CSV of ticker symbols at *ticker_list_address*
    and writes its results to a per-thread output file (see
    multi_threads_crawl_and_save).
    """

    def __init__(self, thread_num, ticker_list_address):
        threading.Thread.__init__(self)
        # Index used to name this thread's output file.
        self.thread_num = thread_num
        # Path to the CSV file listing the tickers this thread handles.
        self.ticker_list_address = ticker_list_address

    def run(self):
        # print(single_string) is valid in both Python 2 and Python 3,
        # replacing the Python-2-only `print "..."` statement form.
        print("start: thread " + str(self.thread_num))
        multi_threads_crawl_and_save(self.thread_num, self.ticker_list_address)
        # Fixed: original said "end: thread" with no trailing space,
        # inconsistent with the "start: thread " message above.
        print("end: thread " + str(self.thread_num))
def crawl_and_save(symbol, out):
    """Scrape the NASDAQ dividend-history table for *symbol*.

    Appends one CSV line per dividend entry to the open file object *out*,
    with the cell values comma-joined and the ticker symbol appended as a
    trailing column.  Writes nothing if the page has no dividend grid.

    :param symbol: ticker symbol, e.g. ``'aapl'``
    :param out: writable text file object shared across calls
    """
    url = 'http://www.nasdaq.com/symbol/%s/dividend-history' % symbol
    # Timeout so one hung request cannot stall the crawling thread forever.
    res = requests.get(url, timeout=30)
    soup = BeautifulSoup(res.text, "html.parser")
    # Hoisted: the original called soup.find(...) twice for the same id.
    grid = soup.find(id='quotes_content_left_dividendhistoryGrid')
    if grid is None:
        return
    # The first <tr> is the header row and contains no <td> cells, so it
    # would produce a blank output line; skip the symbol/newline for it
    # (index 0) exactly as the original hand-rolled counter did.
    for index, entry in enumerate(grid.find_all('tr')):
        for item in entry.find_all('td'):
            out.write(item.get_text().strip() + ',')
        if index != 0:
            out.write(',' + symbol)
            out.write('\n')
def multi_threads_crawl_and_save(thread_num, ticker_list_address):
    """Crawl every ticker listed in *ticker_list_address*.

    Results go to ``dividendData/dividend<thread_num>.csv``.  Each ticker
    symbol is echoed to stdout as a progress indicator.

    :param thread_num: index used to name this thread's output file
    :param ticker_list_address: path to a CSV whose first column is tickers
    """
    # `with` guarantees both files are closed even if a crawl raises.
    # Fixes a leak in the original: `output` was opened outside the
    # try/finally, so it stayed open if opening the ticker list failed.
    with open('dividendData/dividend' + str(thread_num) + '.csv', 'w') as output:
        with open(ticker_list_address) as f:
            for row in csv.reader(f):
                # Guard: a blank line in the CSV yields an empty row and
                # row[0] would raise IndexError in the original.
                if not row:
                    continue
                crawl_and_save(row[0], output)
                # print(single_arg) works in both Python 2 and 3.
                print(row[0])
def main(num_threads=11):
    """Launch crawler threads, one per ticker-list file.

    Thread *i* reads ``<i>.csv`` (1-based), matching the original
    hard-coded ``range(1, 12)``; *num_threads* generalizes the count
    while keeping the default behavior identical.

    :param num_threads: how many worker threads / ticker files to use
    """
    thread_list = []
    for i in range(1, num_threads + 1):
        thread_list.append(myThread(i, str(i) + '.csv'))
    for thread in thread_list:
        thread.start()
    # Join so main() does not return until every crawl has finished
    # (the original relied on interpreter exit to wait on the threads).
    for thread in thread_list:
        thread.join()
# Run the crawler only when executed as a script, not on import.
if __name__ == '__main__':
    main()