extract_wsj.py
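"""Scrape Wall Street Journal daily archive pages and save each article's
title and body text into one folder per day."""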
from datetime import datetime
import multiprocessing as mp
import os
import time
from urllib.request import urlopen

from bs4 import BeautifulSoup


def extraction_wsj(folder, year, nb_days, nb_articles_per_day, days_list):
    """
    Scrape Wall Street Journal articles.

    :param folder: output directory for the scraped articles
    :param year: the year being scraped
    :param nb_days: maximum number of days to process
    :param nb_articles_per_day: maximum number of articles per day
    :param days_list: list of daily archive-page URLs
    :return: summary string with the total number of articles scraped
    """
    total = 0
    article_pattern = 'http://www.wsj.com/articles/'
    article_pattern2 = 'https://www.wsj.com/articles/'
    journal = 'wsj/'
    day_count = 0
    for link_d in days_list:
        if day_count < nb_days:
            # link_d looks like ".../archive-2017-1-1.html": the slice picks up
            # to ten characters of the date, and the replaces strip whatever is
            # left over from the ".html" suffix.
            date = link_d[39:49]
            date = date.replace(".", "").replace("h", "").replace("m", "")
            # normalise to zero-padded YYYY-MM-DD
            date = datetime.strptime(date, '%Y-%m-%d').strftime('%Y-%m-%d')
print("-"*25)
print('\n scrapping WSJ for date ; {}\n'.format(date))
print('total articles scrapped : {} \n'.format(total))
            date_dir = date + '/'
            day_path = os.path.join(folder, date_dir)
            journal_dir = os.path.join(day_path, journal)
            # exist_ok avoids a race when several processes create the same tree
            os.makedirs(journal_dir, exist_ok=True)
            # fetch the day's archive page and collect the article links
            articles_count = 0
            html_d = urlopen(link_d)
            bs2 = BeautifulSoup(html_d.read(), 'html.parser')
            links = [x.get('href') for x in bs2.find_all("a")][1:]
            article_list = [x for x in links
                            if x is not None and (x.startswith(article_pattern)
                                                  or x.startswith(article_pattern2))]
            day_count += 1
            for link_a in article_list:
                if articles_count < nb_articles_per_day:
                    # keep only the parts of the article page we need;
                    # this assumes the page exposes an <h1> title
                    html_a = urlopen(link_a)
                    bs3 = BeautifulSoup(html_a.read(), 'html.parser')
                    title = bs3('h1')[0].text + ' '
                    title = title.replace("/", " ")  # '/' would break the filename
                    for tag in bs3():
                        del tag['class']
                    article = bs3('p')
                    del article[0]  # skip the first <p>, which is not body text
                    # keep only substantial paragraphs (over 150 characters)
                    article = [tex.text for tex in article if len(tex.text) > 150]
                    text = title + ''.join(article)
                    filename = title + '.txt'
                    file_path = os.path.join(journal_dir, filename)
                    with open(file_path, 'w') as f:
                        f.write(text)
                    articles_count += 1
                    total += 1
                else:
                    break
        else:
            break
    print('Total articles scraped: {}'.format(total))
    return 'Total articles scraped: {}'.format(total)
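
# A minimal usage sketch (hypothetical paths and URLs, assuming the archive
# pages still exist): scrape the first two days of 2017 into ./text_data,
# keeping at most 5 articles per day.
#
#   days = ['http://www.wsj.com/public/page/archive-2017-1-1.html',
#           'http://www.wsj.com/public/page/archive-2017-1-2.html']
#   extraction_wsj('./text_data/', 2017, 2, 5, days)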


def main():
    """
    Launch the scraping: configuration and worker-process setup.
    """
    start_time = time.time()
    # directory where the articles are stored
    folder = '/home/zeninvest/text_data/'
    # number of days to fetch per year
    nb_days = 1000
    # maximum number of articles per day
    nb_articles_per_day = 10000
    # years to scrape
    year_list = [2017, 2016, 2015, 2014]
    nb_process = 15
    day_per_process = int(365 / nb_process)
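    # With nb_process = 15, each worker handles int(365 / 15) = 24 archive
    # days; the few days left over go to an extra catch-all process below.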
    for year in year_list:
        year_str = str(year)
        html_year = urlopen("http://www.wsj.com/public/page/archive-" + year_str + "-1-1.html")
        bs1 = BeautifulSoup(html_year.read(), 'html.parser')
        year_pattern = "http://www.wsj.com/public/page/archive-" + year_str
        links = [x.get('href') for x in bs1.find_all("a")][1:]
        days_list = [x for x in links if x is not None and x.startswith(year_pattern)]
        # split the year's days into one chunk per process, plus the remainder
        day_repartition = []
        for i in range(1, nb_process + 1):
            day_repartition.append(days_list[(i - 1) * day_per_process:i * day_per_process])
        day_repartition.append(days_list[nb_process * day_per_process:])
        arg_list = [(folder, year, nb_days, nb_articles_per_day, x) for x in day_repartition]
        process_list = [mp.Process(target=extraction_wsj, args=arg) for arg in arg_list]
        for t in process_list:
            t.start()
        for t in process_list:
            t.join()
    print('Elapsed time: {:.1f} s'.format(time.time() - start_time))


if __name__ == '__main__':
    main()
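
# Resulting on-disk layout (illustrative, given the defaults above):
#   /home/zeninvest/text_data/<YYYY-MM-DD>/wsj/<article title>.txt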