-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCreate_stats_quickfix.py
517 lines (459 loc) · 19.7 KB
/
Create_stats_quickfix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import json
import time
import datetime
import emoji as emoji_module
import re
import pdb
import seaborn
import matplotlib.pyplot as plt
import plotly
import plotly.plotly as pltly
import plotly.graph_objs as go
import warnings
import logging
# Send every log record (DEBUG and above) to the file logger.log
logging.basicConfig(filename='logger.log',level=logging.DEBUG)
# Sizes are hardcoded (for now); passed as width/height to the plotly layouts below
plot_width = 1000
plot_height = 500
def get_messages(filename):
    """Parse an exported chat HTML file into parallel per-message lists.

    Written to be easy to follow, not to be compact.

    Args:
        filename: Path of the HTML export; it is read as UTF-8.

    Returns:
        A tuple ``(all_texts, all_users, all_timestamps, total_call_time)``.
        The three lists are index-aligned (one entry per message element
        found): the cleaned text from ``text_cleaner``, the sender name and
        the timestamp string from ``timestamp_fixer``. ``total_call_time``
        is the sum of the call lengths found; values whose unit contains
        'sec'/'sek' are divided by 60, everything else is added unchanged.
    """
    logging.info(f'Opening {filename}')
    # Open file in utf-8, or this will not work
    with open(filename, 'r', encoding='utf-8') as file:
        contents = file.read()

    logging.info('Parsing html with bs4')
    # "Soupify" contents, so it can be used in parsing
    parsed_html = BeautifulSoup(contents, 'lxml')

    logging.info('Parsing messages')
    # Every message lives in a div with this class list (was previously 'message')
    all_messages = parsed_html.body.find_all(
        'div', attrs={'class': 'pam _3-95 _2pi0 _2lej uiBoxWhite noborder'})

    logging.info('Fixing corrupted unicode')
    # Replacement table for bad unicode symbols
    with open('conversion/bad_unicode_fix.txt', 'r', encoding='utf-8-sig') as file:
        char_dict = json.loads(file.read())

    # Table for changing ascii smileys to unicode emojis.
    # Backslashes are doubled first because the raw file is not valid JSON otherwise.
    with open('conversion/ascii_to_emoji.txt', 'r', encoding='utf-8-sig') as file:
        contents = file.read().replace("\\", "\\\\")
    ascii_to_emoji_dict = json.loads(contents)

    # Collapse keys that ended up with a doubled backslash. Iterate over a
    # snapshot of the keys: renaming entries while iterating the live dict
    # is undefined behaviour and may skip or revisit keys.
    for key in list(ascii_to_emoji_dict):
        if "\\\\" in key:
            ascii_to_emoji_dict[key.replace('\\\\', '\\')] = ascii_to_emoji_dict.pop(key)

    all_texts = []
    all_users = []
    all_timestamps = []
    total_call_time = 0

    logging.info('Extracting messages')
    total_time = 0
    total_messages = len(all_messages)
    for number, message in enumerate(all_messages):
        # Start of this cycle, used for the ETA
        time_cycle = time.time()

        # Texts
        text = find_text(message)
        text, _tags = text_cleaner(text, char_dict, ascii_to_emoji_dict)
        all_texts.append(text)

        # Users (tag was once 'span', class was once 'user')
        user = message.find('div', attrs={'class': '_3-96 _2pio _2lek _2lel'}).get_text()
        all_users.append(user)

        # Call time (easiest to do it here)
        call_time = 0
        call_span = message.find('span', attrs={'class': '_idm'})
        if call_span is not None:
            captured_call_string = call_span.get_text().split(' ')
            # captured_call_string[0] is a label, [1] the number, [2] the unit
            call_time = int(captured_call_string[1])
            if 'sec' in captured_call_string[2] or 'sek' in captured_call_string[2]:
                call_time = call_time / 60  # seconds -> minutes
        total_call_time += call_time

        # Timestamps (class was once 'meta')
        timestamp = message.find('div', attrs={'class': '_3-94 _2lem'}).get_text()
        all_timestamps.append(timestamp_fixer(timestamp))

        # Progress
        total_time += time.time() - time_cycle
        done = number + 1  # enumerate starts at 0
        mean_progressed_time = total_time / done
        # Only the messages that are still left count towards the ETA
        eta = round(mean_progressed_time * (total_messages - done))
        s = f'{done} out of {total_messages} messages done - ETA {eta} seconds'
        print(' ' * 100, end='\r')  # Blank out the previous progress line
        print(s, end='\r')
    print('')  # Clear \r
    return all_texts, all_users, all_timestamps, total_call_time
def find_text(message):
    """Return the text body of one message element.

    Args:
        message: Parsed element for a single message. The text is read from
            its descendant ``div`` with class ``_3-96 _2let``.

    Returns:
        The text of that div with any literal ``<div>`` / ``</div>`` markers
        removed, or an empty string when the message has no such div.
    """
    body = message.find('div', attrs={'class': '_3-96 _2let'})
    if body is None:
        # A message without a text container has no text; do not abort the parse
        return ''
    text = body.get_text()
    return text.replace('<div>', '').replace('</div>', '')
def text_cleaner(text, char_dict=None, ascii_to_emoji_dict=None):
    """Strip markup from a message text and normalise its symbols.

    Problem: messages are sometimes messy with ascii emojis, links,
    pictures, stickers and such.
    Solution: clean, replace and remove everything unwanted.

    Args:
        text: Raw message text.
        char_dict: Optional mapping of single bad characters to their
            replacement. Defaults to no replacements.
        ascii_to_emoji_dict: Optional mapping of ascii smileys to unicode
            emojis, applied in the mapping's iteration order. Defaults to
            no replacements.

    Returns:
        A tuple ``(text, tags)``: the cleaned text and the tags that were
        removed from it, or ``['']`` when the text contained no tag.
    """
    # None instead of {} as default: a mutable default is shared between calls
    char_dict = {} if char_dict is None else char_dict
    ascii_to_emoji_dict = {} if ascii_to_emoji_dict is None else ascii_to_emoji_dict

    text = text.replace('<p>', '').replace('</p>', '')

    # Check for images, videos, stickers etc. bs4 warns when the text looks
    # like a URL, so those warnings are silenced while parsing.
    warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
    try:
        tags = BeautifulSoup(text, "html.parser").find_all()
    finally:
        # Restart warnings even if parsing failed
        warnings.filterwarnings("always", category=UserWarning, module='bs4')

    if tags:
        for tag in tags:
            text = text.replace(str(tag), '')  # Needs to make tag into string
    else:
        tags = ['']

    # Fix bugged symbols (japan/samsung)
    for char in text:
        if char in char_dict:
            text = text.replace(char, char_dict[char])

    # Fix ascii smileys to unicode
    for ascii_emoji, unicode_emoji in ascii_to_emoji_dict.items():
        text = text.replace(ascii_emoji, unicode_emoji)
    return text, tags
_MONTH_CONVERTER_PATH = 'conversion/month_converter.txt'
_MONTH_CONVERTER_CACHE = {}
_WEEKDAY_NAMES = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                  'Friday', 'Saturday', 'Sunday')


def _load_month_converter():
    """Return the month-name -> month-number mapping, reading the file at most once."""
    if not _MONTH_CONVERTER_CACHE:
        with open(_MONTH_CONVERTER_PATH, 'r', encoding='utf-8') as file:
            _MONTH_CONVERTER_CACHE.update(json.loads(file.read()))
    return _MONTH_CONVERTER_CACHE


def timestamp_fixer(timestamp, month_converter=None):
    """Convert an exported timestamp into a sortable string plus weekday.

    Problem: timestamps are in a format that cannot be used to make
    datetime objects.
    Solution: reformat them, converting e.g.
        den 15 februari 2018 kl. 12:40 UTC+01
    or
        den 15 februari 2018 12:40 UTC+01
    to
        2018-02-15 12:40:00 Thursday

    Args:
        timestamp: Space-separated string whose tokens 1, 2 and 3 are the
            day, the month name and the year, followed by the ``HH:MM`` time.
        month_converter: Optional mapping from month name to month number
            (as a string). When omitted, the mapping is loaded once from
            conversion/month_converter.txt; that file is in swedish, change
            it if another language is used.

    Returns:
        ``'<year>-<month>-<day> <HH:MM>:00 <english weekday name>'``.

    Raises:
        KeyError: if the month name is not in the mapping.
        IndexError: if the timestamp has fewer than five tokens.
    """
    if month_converter is None:
        # Cached: this function runs once per message
        month_converter = _load_month_converter()

    parts = timestamp.split(' ')
    # parts[0] = den
    day = parts[1]
    month = month_converter[parts[2]]
    year = parts[3]
    # The time is the first token after the year that contains ':'; this skips
    # an optional 'kl.' marker. Falls back to token 4 as before. Seconds are
    # assumed to be 00.
    hour_minute = next((part for part in parts[4:] if ':' in part), parts[4])

    weekday = datetime.date(int(year), int(month), int(day)).weekday()
    return f'{year}-{month}-{day} {hour_minute}:00' + ' ' + _WEEKDAY_NAMES[weekday]
def check_old_emojis(filename):
    """Collect suspicious high code points from a file for manual fixing.

    Every character of ``filename`` (read as UTF-8) whose code point is
    above 1000000 is appended, in file order, to
    conversion/fix_emojis.html. The output file is only touched when at
    least one such character is found.

    Args:
        filename: Path of the file to scan.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        suspicious = [char for line in file for char in line if ord(char) > 1000000]
    if suspicious:
        # One append instead of reopening the file for every single character
        with open('conversion/fix_emojis.html', 'a', encoding='utf-8') as fix_file:
            fix_file.write(''.join(suspicious))
def fix_emoji_table(filename):
    """Build conversion/bad_unicode_fix.txt from a hand-made fix table.

    Lines of ``filename`` that start with ``'1; '`` hold broken characters
    and lines that start with ``'2; '`` hold their replacements. The
    characters of all '1' lines are paired, position by position, with the
    characters of all '2' lines; the first pairing seen for a broken
    character wins. Other lines are ignored. The resulting mapping is
    written as JSON, overwriting any previous file.

    Args:
        filename: Path of the fix table, read as UTF-8.
    """
    contents_error = []
    contents_fixed = []
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            # The line break is not part of the table
            line = line.rstrip('\n')
            # Slice the marker off instead of splitting on it, so a marker
            # that reappears later in the line does not truncate the payload
            if line.startswith('1; '):
                contents_error.extend(line[len('1; '):])
            elif line.startswith('2; '):
                contents_fixed.extend(line[len('2; '):])

    char_dict = {}
    for err, fix in zip(contents_error, contents_fixed):
        char_dict.setdefault(err, fix)

    with open('conversion/bad_unicode_fix.txt', 'w', encoding='utf-8') as outfile:
        json.dump(char_dict, outfile)
def emoji_stats(all_texts, all_users):
    """Count emoji usage per user and rank the most used emojis.

    A character counts as an emoji when it is in
    ``emoji_module.UNICODE_EMOJI`` or its code point is above 128512
    (int('1f600', 16)); the table lookup is needed because e.g. <3 seems
    to be under 128512.
    NOTE(review): relies on the installed emoji package exposing
    ``UNICODE_EMOJI`` as a container of emoji characters - confirm against
    the version in use.

    Args:
        all_texts: Message texts.
        all_users: Sender of each message, index-aligned with ``all_texts``.

    Returns:
        A tuple ``(emoji_dict, most_used_emojis)``. ``emoji_dict`` maps
        user -> emoji -> count and only contains users that sent at least
        one emoji. ``most_used_emojis`` is a tuple of at most 20 emojis,
        ordered by total count (ties by character), highest first; it is
        empty when no emoji was found.
    """
    emoji_dict = {}
    emoji_total_dict = {}
    for user, text in zip(all_users, all_texts):
        for char in text:
            if char in emoji_module.UNICODE_EMOJI or ord(char) > 128512:
                user_counts = emoji_dict.setdefault(user, {})
                user_counts[char] = user_counts.get(char, 0) + 1
                emoji_total_dict[char] = emoji_total_dict.get(char, 0) + 1

    # Sorting the keys directly (instead of unzipping sorted pairs) also
    # works when there are no emojis at all
    ranked = sorted(emoji_total_dict,
                    key=lambda char: (emoji_total_dict[char], char),
                    reverse=True)
    most_used_emojis = tuple(ranked[:20])  # 20 most used
    return emoji_dict, most_used_emojis
def count_words(all_texts):
    """Return the total number of words in all messages.

    Words are runs of non-whitespace characters, so an empty message
    counts as zero words and repeated spaces do not add words.

    Args:
        all_texts: Iterable of message strings.

    Returns:
        The summed word count as an int.
    """
    return sum(len(text.split()) for text in all_texts)
def unique(list_of_things):
    """Return the distinct items of an iterable, keeping first-seen order.

    Problem: the inbuilt set() does NOT keep order.
    Solution: de-duplicate through dict keys, which keep insertion order.

    Args:
        list_of_things: Iterable of items to de-duplicate.

    Returns:
        A new list with the first occurrence of every item.
    """
    items = list(list_of_things)
    try:
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable items (e.g. lists) cannot be dict keys: compare pairwise
        unique_list = []
        for item in items:
            if item not in unique_list:
                unique_list.append(item)
        return unique_list
def plot_emoji_stats(emoji_dict, most_used_emojis):
    """Write a stacked bar chart of emoji usage to results/emoji_stats.html.

    Args:
        emoji_dict: Mapping user -> emoji -> count.
        most_used_emojis: Emojis to show on the x axis, in display order.
    """
    traces = []
    for user, counts in emoji_dict.items():
        # A user that never sent one of the emojis gets a zero for it
        heights = [counts.get(symbol, 0) for symbol in most_used_emojis]
        traces.append(go.Bar(x=most_used_emojis, y=heights, name=user))

    layout = go.Layout(
        barmode='stack',
        width=plot_width,
        height=plot_height,
        title='Emoji stats',
        titlefont=dict(size=26),
    )
    fig = go.Figure(data=traces, layout=layout)
    plotly.offline.plot(fig, filename='results/emoji_stats.html', auto_open=False)
def plot_text_frequency_full(all_texts, all_timestamps, all_users):
    """Write a stacked bar chart of messages per calendar day and user.

    The chart is saved to results/texts_stats_full.html.

    Args:
        all_texts: Message texts (not used; kept for a uniform signature).
        all_timestamps: Timestamp strings that start with ``YYYY-MM-DD``
            followed by a space, index-aligned with ``all_users``.
        all_users: Sender of each message.
    """
    # Only the date part matters here; time and weekday are ignored
    days = [datetime.datetime.strptime(stamp.split(' ')[0], "%Y-%m-%d")
            for stamp in all_timestamps]

    # One pass over the messages; the sender is compared by equality, so a
    # user whose name is contained in another user's name is not mixed up
    counts = {}
    for user, day in zip(all_users, days):
        counts[(user, day)] = counts.get((user, day), 0) + 1

    # Days and users in order of first appearance (deterministic output)
    unique_days = list(dict.fromkeys(days))
    data = []
    for user in dict.fromkeys(all_users):
        bar_object = go.Bar(x=list(unique_days),
                            y=[counts.get((user, day), 0) for day in unique_days],
                            name=user)
        data.append(bar_object)

    layout = go.Layout(barmode='stack',
                       title='Text stats',
                       width=plot_width,
                       height=plot_height,
                       titlefont=dict(size=26),
                       yaxis=dict(title='Texts'))
    fig = go.Figure(data=data, layout=layout)
    plotly.offline.plot(fig, filename='results/texts_stats_full.html', auto_open=False)
def plot_text_frequency_day(all_texts, all_timestamps, all_users):
    """Write a stacked bar chart of messages per weekday and user.

    The chart is saved to results/texts_stats_day.html.

    Args:
        all_texts: Message texts (not used; kept for a uniform signature).
        all_timestamps: Timestamp strings whose third space-separated token
            is the english weekday name, index-aligned with ``all_users``.
        all_users: Sender of each message.

    Raises:
        ValueError: if a timestamp carries an unknown weekday name.
    """
    sort_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    weekdays = [stamp.split(' ')[2] for stamp in all_timestamps]
    for weekday in weekdays:
        if weekday not in sort_order:
            raise ValueError(f'{weekday!r} is not in list')

    # Count on the unsorted, still index-aligned lists. Sorting the weekdays
    # first and then indexing all_users with the sorted positions would
    # credit messages to the wrong users.
    counts = {}
    for user, weekday in zip(all_users, weekdays):
        counts[(user, weekday)] = counts.get((user, weekday), 0) + 1

    # Only weekdays that occur, Monday first
    present = set(weekdays)
    ordered_days = [weekday for weekday in sort_order if weekday in present]

    data = []
    for user in dict.fromkeys(all_users):  # first-appearance order
        bar_object = go.Bar(x=list(ordered_days),
                            y=[counts.get((user, weekday), 0) for weekday in ordered_days],
                            name=user)
        data.append(bar_object)

    layout = go.Layout(barmode='stack',
                       title='Text stats',
                       width=plot_width,
                       height=plot_height,
                       titlefont=dict(size=26),
                       yaxis=dict(title='Texts'))
    fig = go.Figure(data=data, layout=layout)
    plotly.offline.plot(fig, filename='results/texts_stats_day.html', auto_open=False)
def plot_text_frequency_hour(all_texts, all_timestamps, all_users):
    """Write a stacked bar chart of messages per half hour of the day and user.

    Every message time is rounded to the closest half hour
    (minutes 0-14 -> :00, 15-44 -> :30, 45-59 -> next hour :00, with 24
    wrapping to 00) and placed on the fixed dummy date 2012-12-12 so all
    bins share one date. The chart is saved to
    results/texts_stats_hour.html.

    Args:
        all_texts: Message texts (not used; kept for a uniform signature).
        all_timestamps: Timestamp strings whose second space-separated
            token is ``HH:MM:SS``, index-aligned with ``all_users``.
        all_users: Sender of each message.
    """
    bins = []
    for stamp in all_timestamps:
        clock = stamp.split(' ')[1].split(':')
        hour, minute = int(clock[0]), int(clock[1])
        if minute < 15:
            minute = 0
        elif minute < 45:
            minute = 30
        else:
            minute = 0
            hour = (hour + 1) % 24
        # Always zero-pad: '9:00' and '09:00' must land in the same bin
        bins.append(f'2012-12-12 {hour:02d}:{minute:02d}:00')

    # The sender is compared by equality, not by substring
    counts = {}
    for user, time_bin in zip(all_users, bins):
        counts[(user, time_bin)] = counts.get((user, time_bin), 0) + 1

    # Bins and users in order of first appearance
    unique_bins = list(dict.fromkeys(bins))
    data = []
    for user in dict.fromkeys(all_users):
        bar_object = go.Bar(x=list(unique_bins),
                            y=[counts.get((user, time_bin), 0) for time_bin in unique_bins],
                            name=user)
        data.append(bar_object)

    layout = go.Layout(barmode='stack',
                       title='Text stats',
                       width=plot_width,
                       height=plot_height,
                       titlefont=dict(size=26),
                       yaxis=dict(title='Texts'))
    fig = go.Figure(data=data, layout=layout)
    plotly.offline.plot(fig, filename='results/texts_stats_hour.html', auto_open=False)
def plot_pie_chart(all_texts, all_users):
    """Write two donut charts (messages and words per user) to results/piechart.html.

    Words are runs of non-whitespace characters, so empty messages add no
    words.

    Args:
        all_texts: Message texts.
        all_users: Sender of each message, index-aligned with ``all_texts``.
    """
    # Single pass; dicts keep the users in order of first appearance
    messages_per_user = {}
    words_per_user = {}
    for user, text in zip(all_users, all_texts):
        messages_per_user[user] = messages_per_user.get(user, 0) + 1
        words_per_user[user] = words_per_user.get(user, 0) + len(text.split())

    labels = list(messages_per_user)
    values_messages = [messages_per_user[user] for user in labels]
    values_word_count = [words_per_user[user] for user in labels]

    data = [{
        "values": values_messages,
        "labels": labels,
        "domain": {"x": [0, .48]},  # left half of the figure
        "name": "Messages",
        "hole": .4,
        "type": "pie"
    },
        {
        "values": values_word_count,
        "labels": labels,
        "domain": {"x": [.52, 1]},  # right half of the figure
        "name": "Word count",
        "hole": .4,
        "type": "pie"
    }]
    layout = go.Layout(title='Piecharts',
                       width=plot_width,
                       height=plot_height,
                       titlefont=dict(size=26),
                       yaxis=dict(title='Texts'),
                       # Captions placed inside the two holes
                       annotations=[{"showarrow": False,
                                     "text": "Messages",
                                     "x": 0.20,
                                     "y": 0.5},
                                    {"showarrow": False,
                                     "text": "Words",
                                     "x": 0.8,
                                     "y": 0.5}]
                       )
    fig = go.Figure(data=data, layout=layout)
    plotly.offline.plot(fig, filename='results/piechart.html', auto_open=False)
if __name__ == "__main__":
    # Script entry point: ask for an export file, print summary statistics
    # and write all plots to the results folder.
    import tkinter
    from tkinter.filedialog import askopenfilename

    # Hidden root window, only needed to show the file dialog
    root = tkinter.Tk()
    root.withdraw()
    print('Please choose a file')
    filename = askopenfilename(initialdir = "data",title = "Select file")
    root.destroy()
    # A cancelled dialog yields an empty value
    if not filename:
        print('Nothing choosen. Quitting program')
        raise SystemExit

    all_texts, all_users, all_timestamps, total_call_time = get_messages(filename)
    if not all_texts:
        # Nothing to analyse; every statistic below needs at least one message
        print('No messages found. Quitting program')
        raise SystemExit

    # Get data
    emoji_dict, most_used_emojis = emoji_stats(all_texts, all_users)

    # Use data
    amount_of_words = count_words(all_texts)
    # The last entry is treated as the start of the conversation
    first_contact = datetime.datetime.strptime(all_timestamps[-1].split(' ')[0], "%Y-%m-%d")
    delta_days = (datetime.datetime.now() - first_contact).days
    # A conversation that started today still spans one day for the average
    days_for_average = max(delta_days, 1)

    # Prints
    print("You have been in touch since:",
          all_timestamps[-1],
          " which is ",
          delta_days,
          " days from today")
    print("You have sent: {} messages".format(len(all_texts)))
    print("Average messages per day: {}".format(round(len(all_texts) / days_for_average, 3)))
    print("Total word count: {}".format(amount_of_words))
    print("Average word count per message: {}".format(round((amount_of_words/len(all_texts)), 2)))
    print("You have called for: {} minutes".format(round(total_call_time, 2)))
    print('')
    print('Plotting data')

    # Plotting
    plot_emoji_stats(emoji_dict, most_used_emojis)
    plot_text_frequency_full(all_texts, all_timestamps, all_users)
    plot_text_frequency_day(all_texts, all_timestamps, all_users)
    plot_text_frequency_hour(all_texts, all_timestamps, all_users)
    plot_pie_chart(all_texts, all_users)
    print('DONE!!')