forked from prince381/tweet_analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtweets_scraper.py
264 lines (227 loc) · 11 KB
/
tweets_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
from textblob import TextBlob
from time import sleep
import json
import re
from stopwords import final_stopwords
# this function uses requests and BeautifulSoup to scrape the html of the twitter
# web page, extract all the html tags that contain the tweet information and
# put them all in a list.
def twitter_page_html(account_name, max_pages=5, timeout=10):
    """Collect the ``<li>`` tweet elements from an account's timeline pages.

    The profile page is fetched with ``requests`` and parsed with
    BeautifulSoup; further pages are then requested from the JSON timeline
    endpoint, using the ``min_position`` cursor returned by each response.

    NOTE(review): both URLs target Twitter's legacy web front end - confirm
    they are still served before relying on this function.

    Parameters
    ----------
    account_name : str
        Account handle, with or without the leading "@".
    max_pages : int, optional
        Maximum number of additional timeline pages to request after the
        profile page (default 5).
    timeout : float, optional
        Timeout in seconds applied to every HTTP request (default 10).

    Returns
    -------
    list
        The tweet ``<li>`` tags gathered so far.  Paging stops early, keeping
        what was already collected, when a follow-up request fails or its
        response is not the expected JSON document.

    Raises
    ------
    requests.RequestException
        If the initial profile page request fails.
    """
    tweet_filter = {'data-item-type': 'tweet'}
    # accept both "@name" and "name"; slicing with [1:] would mangle the latter
    handle = account_name.lstrip('@')
    site = 'https://twitter.com/{}'.format(handle)
    # get the first page
    page1 = requests.get(site, timeout=timeout)
    # parse the html page using BeautifulSoup
    soup = bs(page1.text, 'html.parser')
    # find all the html tags that contain the tweet information
    lis_html = soup.find_all(name='li', attrs=tweet_filter)
    # get the cursor pointing at the next batch of tweets; without it there
    # is nothing more to page through
    stream = soup.find("div", {"class": "stream-container"})
    cursor = stream.get("data-min-position") if stream is not None else None
    if cursor is None:
        return lis_html
    next_pointer = str(cursor)
    # loop to get the rest of the tweets
    for _ in range(max_pages):
        next_url = 'https://twitter.com/i/profiles/show/{}/timeline/tweets?include_available_features=1&include_entities=1&max_position={}&reset_error_state=false'.format(handle, next_pointer)
        try:
            next_response = requests.get(next_url, timeout=timeout)
            tweets_obj = json.loads(next_response.text)
            has_more_items = tweets_obj["has_more_items"]
            min_position = tweets_obj["min_position"]
        except (requests.RequestException, ValueError, KeyError, TypeError):
            # network failure, non-JSON body or unexpected payload:
            # keep whatever was collected up to this point
            return lis_html
        if not has_more_items and not min_position:
            # using two checks here because in one case has_more_items was
            # false but there were more items
            break
        next_pointer = min_position
        try:
            html = tweets_obj["items_html"]
        except KeyError:
            return lis_html
        new_soup = bs(html, 'html.parser')
        # add the new tweet tags to the previous ones
        lis_html += new_soup.find_all(name='li', attrs=tweet_filter)
    return lis_html
# this function filters a given text and removes the stopwords as well as the links and tags from the text.
def remove_stopwords(text, stopwords=final_stopwords):
    """Return ``text`` lower-cased with links, tags and stopwords removed.

    The text is tokenised with ``TextBlob(...).words``.  A word is dropped
    when it contains any of the marker substrings below (mentions, hashtags,
    URL fragments, a few punctuation marks and very common words) or when it
    is found in ``stopwords``.  The remaining words are joined with single
    spaces.

    Parameters
    ----------
    text : str
        The raw tweet text.
    stopwords : container of str, optional
        Words to drop; defaults to ``final_stopwords``.

    Returns
    -------
    str
        The filtered text.
    """
    symbs = ['@','#','https','http','www.','.com','=',',',"'",'the','and','\'s']
    words = TextBlob(text.lower()).words
    # Build a new list instead of calling list.remove() while iterating:
    # removing during iteration skips the element that follows each removed
    # one, so adjacent links/stopwords used to slip through.
    kept = [
        word for word in words
        if not any(symb in word for symb in symbs) and word not in stopwords
    ]
    return ' '.join(kept)
# this function takes a text as an argument and translates it if it's not already in english
# after that calculate the sentiment score of the text and classifies it as negative,neutral or positive
def sentiment_score(text):
    """Score the sentiment of ``text`` and label it.

    The text is first cleaned with ``remove_stopwords``.  A translation to
    English is then attempted; if it raises for any reason the untranslated
    text is scored instead.

    Returns
    -------
    tuple
        ``(score, label)`` where ``score`` is the TextBlob polarity rounded
        to one decimal place and ``label`` is 'negative', 'neutral' or
        'positive'.
    """
    # strip stopwords, links and tags before scoring
    cleaned = remove_stopwords(text, final_stopwords)
    blob = TextBlob(cleaned)
    # best effort: translate to english, otherwise keep the blob as it is
    try:
        blob = blob.translate(to='en')
    except Exception:
        pass
    # polarity rounded to one decimal place
    polarity = np.round(blob.sentiment.polarity, 1)
    # map the sign of the score to a label
    if polarity < 0:
        label = 'negative'
    elif polarity > 0:
        label = 'positive'
    else:
        label = 'neutral'
    return polarity, label
# function to parse/handle the HTML and extract data using BeautifulSoup and pandas.DataFrame()
# and returns a data frame.
def _parse_count(raw):
    """Convert a displayed counter ('', '87', '1.234', '1.2K', '3M') to an int."""
    raw = raw.strip()
    if not raw:
        return 0
    multiplier = {'K': 1000, 'M': 1000000}.get(raw[-1].upper())
    if multiplier is not None:
        # abbreviated value: the dot is a decimal point ("1.2K" -> 1200)
        return int(round(float(raw[:-1]) * multiplier))
    # plain value: dots/commas can only be thousands separators
    return int(raw.replace('.', '').replace(',', ''))


def _button_count(item, button_attrs):
    """Read the counter displayed on one action button of a tweet element."""
    button = item.find(name='button', attrs=button_attrs)
    counter = button.find(name='span', attrs={'class': 'ProfileTweet-actionCount'})
    return _parse_count(counter.find('span').text)


def _hidden_count(action_tag, css_class):
    """Read a counter from the visually hidden action-count list."""
    holder = action_tag.find(name='span', attrs={'class': css_class})
    counter = holder.find(name='span', attrs={'class': 'ProfileTweet-actionCount'})
    return int(counter['data-tweet-stat-count'])


def _tweet_date(item):
    """Build a 'day month year time' string from the tweet's timestamp link."""
    # NOTE(review): this relies on the exact attribute layout of the rendered
    # <a> tag (space-separated tokens 7..12) - confirm against real markup.
    anchor = str(item.find(name='small', attrs={'class': 'time'}).find('a'))
    parts = anchor.split(' ')[7:13]
    clock = parts[0][-4:-1] + ' ' + parts[1]
    day = parts[3]
    month = parts[4]
    year = parts[-1][0:4]
    return '{} {} {} {}'.format(day, month, year, clock)


def _tweet_record(item):
    """Extract one tweet's fields as a dict; raises if any lookup fails."""
    # get the text of the tweet, without stopwords, links and tags
    text_container = item.find(name='div', attrs={'class': 'js-tweet-text-container'})
    text = remove_stopwords(text_container.find('p').text, final_stopwords)
    # get the sentiment score and type of the tweet
    score, label = sentiment_score(text)
    # get the date and time the tweet was made
    fulldate = _tweet_date(item)
    # counters displayed on the action buttons
    reply = _button_count(item, {'data-modal': 'ProfileTweet-reply'})
    retweet = _button_count(item, {'data-modal': 'ProfileTweet-retweet'})
    likes = _button_count(
        item,
        {'class': 'ProfileTweet-actionButton js-actionButton js-actionFavorite'})
    # if the buttons gave nothing, query the hidden counter list instead
    if likes + reply + retweet == 0:
        action_tag = item.find(
            name='div',
            attrs={'class': 'ProfileTweet-actionCountList u-hiddenVisually'})
        reply = _hidden_count(action_tag, 'ProfileTweet-action--reply u-hiddenVisually')
        retweet = _hidden_count(action_tag, 'ProfileTweet-action--retweet u-hiddenVisually')
        likes = _hidden_count(action_tag, 'ProfileTweet-action--favorite u-hiddenVisually')
    return {'date': fulldate, 'tweet': text, 'sentiment_score': score,
            'sentiment': label, 'retweets': retweet,
            'replies': reply, 'likes': likes}


def scrape_data(html_list):
    """Turn a list of tweet HTML tags into a pandas DataFrame.

    Parameters
    ----------
    html_list : iterable
        BeautifulSoup tags, one per tweet (as returned by
        ``twitter_page_html``).

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed tweet with the columns ``date``
        (converted with ``pd.to_datetime``), ``tweet``, ``sentiment_score``,
        ``sentiment``, ``retweets``, ``replies`` and ``likes``.

    Notes
    -----
    A tweet whose markup cannot be parsed is skipped as a whole.  Each row is
    stored only after every field was extracted, so a failure halfway through
    a tweet can no longer leave the columns with different lengths.
    """
    columns = ['date', 'tweet', 'sentiment_score', 'sentiment',
               'retweets', 'replies', 'likes']
    rows = []
    # loop through all the tweet tags and extract the necessary information
    for item in html_list:
        try:
            rows.append(_tweet_record(item))
        except Exception:
            # best effort: unparseable tweets are dropped, not fatal
            continue
    # we now put the whole information in a data frame
    tweet_df = pd.DataFrame(rows, columns=columns)
    tweet_df['date'] = pd.to_datetime(tweet_df['date'])
    return tweet_df
########################################################################################
# #
# The two functions below were not put to use in this project but I thought it would #
# be helpful to create them. One is used inside the other and, together, they #
# return the hashtags used in the tweets and the number of times each appeared in the #
# tweets. #
########################################################################################
# this function extracts all the hashtags from the tweets and stores them in a list
def get_tags(df):
    """Return every hashtag found in ``df['tweet']``, lower-cased.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column holding the tweet texts.

    Returns
    -------
    list of str
        All hashtags (including the leading "#") in order of appearance,
        duplicates kept.

    Notes
    -----
    A trailing "pic" is stripped from a tag only when it is directly followed
    by ".twitter.com", i.e. when it is the start of a "pic.twitter.com/..."
    link glued to the hashtag (presumably how scraped tweet text renders
    attached media - verify against real data).  Tags that merely contain
    "pic", such as "#topic", are left intact.
    """
    # regular expression object for extracting the tags
    tag_regex = re.compile(r'#\w+')
    all_tags = []
    # loop over each tweet and get the hashtags
    for tweet in df['tweet'].values:
        for match in tag_regex.finditer(tweet):
            tag = match.group()
            if tag.endswith('pic') and tweet.startswith('.twitter.com', match.end()):
                tag = tag[:-len('pic')]
            all_tags.append(tag.lower())
    # return a list of all hashtags
    return all_tags
# this function returns a series of hashtags and the number of times each appears
def tags_data(df):
    """Count how often each hashtag occurs in the tweets of ``df``.

    The hashtags are obtained from ``get_tags(df)``.

    Returns
    -------
    pandas.Series
        Occurrence counts indexed by hashtag, sorted in descending order.
    """
    # tally every hashtag returned by get_tags
    frequencies = {}
    for hashtag in get_tags(df):
        frequencies[hashtag] = frequencies.get(hashtag, 0) + 1
    # wrap the tally in a series, most frequent hashtag first
    return pd.Series(frequencies).sort_values(ascending=False)