-
Notifications
You must be signed in to change notification settings - Fork 1
/
WhatsAppConversation_Grapher.py
188 lines (158 loc) · 8 KB
/
WhatsAppConversation_Grapher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import datetime
import pandas as pd
import plotly.offline as pyo
import plotly.graph_objs as go
from plotly import subplots
from itertools import cycle
def collate_data(conversation, bin_size="W", start_date=pd.Timestamp("2010-01-01 00:00:00"), end_date=pd.Timestamp("2050-01-01 00:00:00")):
    """Count the messages each sender posted over time.

    Args:
        conversation: Object exposing a ``message_log`` DataFrame with a
            ``date`` column and a ``sender`` column. ``date`` has to be
            datetime-like because it is compared against Timestamps and
            then used as the index for resampling.
        bin_size: pandas resampling rule used for the per-bin counts
            (e.g. ``"D"``, ``"W"``, ``"2W"``).
        start_date: Earliest message timestamp to include (inclusive).
        end_date: Latest message timestamp to include (inclusive).

    Returns:
        A ``(count_df, cum_df)`` tuple. Both frames are indexed by date and
        have one column per sender. ``count_df`` holds the number of messages
        per ``bin_size`` bin; ``cum_df`` holds the running total of messages,
        always sampled per day.

    Raises:
        ValueError: If ``start_date`` is after ``end_date``.
    """
    if start_date > end_date:
        # Raise instead of print + exit(): a helper function should not
        # terminate the caller's process (and exit() reported success).
        raise ValueError("Error. start_date is after end_date")

    # Keep only the messages inside the requested (inclusive) time frame.
    log = conversation.message_log
    in_range = (log["date"] >= start_date) & (log["date"] <= end_date)
    # Non-inplace set_index: never mutates a slice of the caller's frame.
    df = log[in_range].set_index("date")

    def messages_per_bin(rule):
        """Return a date-indexed frame of message counts, one column per sender."""
        return (
            df.groupby(["sender"])[["sender"]]  # any column works, we only count rows
            .resample(rule)                     # bucket each sender's messages by time
            .count()                            # messages per bucket
            .unstack(0, fill_value=0)           # senders become columns, gaps become 0
            .droplevel(0, axis=1)               # drop the redundant "sender" column level
        )

    count_df = messages_per_bin(bin_size)
    # Cumulative totals are always built from daily counts; finer sampling
    # creates too much data.
    cum_df = messages_per_bin("D").cumsum()
    return count_df, cum_df
def colour_selection(rainbow_colours, num_participants):
    """Pick well-separated colours from a gradually shifting palette.

    With a rainbow-like palette and only a few participants, evenly spaced
    colours (e.g. red and blue) contrast better than adjacent ones (e.g. red
    and orange).

    Args:
        rainbow_colours: Sequence of colours ordered as a gradient.
        num_participants: Number of chat participants needing a colour.

    Returns:
        A list of ``num_participants`` evenly spaced colours. When the
        participants outnumber half the palette, spacing cannot help and the
        original ``rainbow_colours`` object is returned unchanged. An empty
        list is returned when ``num_participants`` is zero or negative.
    """
    # Nobody to colour; this also avoids dividing by zero below.
    if num_participants <= 0:
        return []

    num_colours = len(rainbow_colours)
    if num_participants > num_colours / 2:
        # The step would be 1, so the algorithm would change nothing.
        return rainbow_colours

    # Here 1 <= num_participants <= num_colours / 2, so step >= 2 and the
    # largest index, (num_participants - 1) * step, stays inside the palette.
    step = num_colours // num_participants
    return [rainbow_colours[i * step] for i in range(num_participants)]
def plot_messages_time(conversation, chart_types=None,
                       bin_size="W", barmode="stack",
                       start_date=pd.Timestamp("2010-01-01 00:00:00"), end_date=pd.Timestamp("2050-01-01 00:00:00")):
    """Plot messages sent per participant over time and write the figure to HTML.

    Args:
        conversation: WhatsApp conversation log object. Its ``title`` is used
            for the figure title and the output file name; the object is also
            handed to ``collate_data`` to obtain the message counts.
        chart_types: Dict with the boolean keys ``"bar chart"`` and
            ``"stack plot"`` selecting which charts to draw. A missing key is
            treated as ``False``. Defaults to drawing both.
        bin_size: pandas resampling rule for the bar chart bins
            (e.g. ``"D"``, ``"W"``).
        barmode: Plotly bar mode for the bar chart (e.g. ``"stack"``).
        start_date: Start of the time frame to plot.
        end_date: End of the time frame to plot.
    """
    # Build the default here rather than in the signature so that no dict
    # object is shared between calls.
    if chart_types is None:
        chart_types = {"stack plot": True, "bar chart": True}
    show_bar = bool(chart_types.get("bar chart", False))
    show_stack = bool(chart_types.get("stack plot", False))

    # Get chart data
    print("Collating data")
    cnt_df, cum_df = collate_data(conversation, bin_size=bin_size,
                                  start_date=start_date, end_date=end_date)
    # The time frame may have been narrowed, so take the participants from
    # the collated data rather than from conversation.participants.
    participants = list(cnt_df.columns)
    num_participants = len(participants)
    print("Data successfully collated")

    # Rainbow colours for graph lines
    colours = [
        "#e96841",
        "#ed9a4a",
        "#f0c054",
        "#f4ef5f",
        "#c5d966",
        "#92cb6a",
        "#44bb6b",
        "#2fc0b9",
        "#1cc5ec",
        "#4398d1",
        "#5577c1",
        "#6153a8",
        "#9158a7",
        "#b75fab",
        "#e566ab",
        "#e16378",
    ]
    # Run colour list through an algorithm to provide better contrast
    colours = colour_selection(colours, num_participants)  # comment this line out if you change the colours above
    # Cycle so colours repeat if there are more participants than colours.
    colour_map = dict(zip(participants, cycle(colours)))

    print("Plotting Data")
    # Bar Chart: messages per bin
    bar_traces = []
    if show_bar:
        for participant in participants:
            colour = colour_map[participant]
            bar_traces.append(go.Bar(
                x=cnt_df.index, y=cnt_df[participant],
                name=participant,
                marker={"color": colour, "line_color": colour},
                legendgroup=participant,  # Ensures one legend for multiple plots
            ))

    # Stack Plot: cumulative messages
    stack_traces = []
    if show_stack:
        for participant in participants:
            colour = colour_map[participant]
            stack_traces.append(go.Scatter(
                x=cum_df.index, y=cum_df[participant],
                name=participant,
                marker={"color": colour, "line_color": colour},
                fillcolor=colour,
                hoveron='points+fills',  # I think this is a known bug, doesn't work without fill=toself which messes up chart
                stackgroup="one",
                legendgroup=participant,
                # The bar chart already supplies the legend entries; only
                # show them here when the bar chart is not drawn.
                showlegend=not show_bar,
            ))

    print("Formatting charts")
    subplot_count = 2 if show_bar and show_stack else 1

    # Plot Multiple subplots
    fig = subplots.make_subplots(
        rows=subplot_count, cols=1,
        shared_yaxes=True,
        vertical_spacing=0.10,  # 10%
    )
    for trace in bar_traces:
        fig.add_trace(trace, row=1, col=1)
    for trace in stack_traces:
        # The stack plot takes the last row; if you add more plots, this needs editing
        fig.add_trace(trace, row=subplot_count, col=1)

    # TODO: Add minor Tick Marks
    fig.update_layout(
        title=f"{conversation.title}",
        xaxis={"title": "Date", "gridcolor": "#aaaaaa",
               "ticks": "outside"},
        xaxis2={"title": "Date", "gridcolor": "#aaaaaa",
                "ticks": "outside"},
        yaxis={"title": "Messages sent", "gridcolor": "#aaaaaa"},
        yaxis2={"title": "Messages sent", "gridcolor": "#aaaaaa"},
        plot_bgcolor="#242424",
        barmode=barmode,
        hovermode='x',  # compare all traces on hover
        margin=dict(
            b=30,
            t=40
        ),
    )

    # Range Slider
    # TODO: Tick marks not changing
    # fig.update_xaxes(
    #     rangeslider_visible=True,
    #     tickformatstops = [
    #         dict(dtickrange=["D1", "W2"], value="%d %b"),
    #         dict(dtickrange=["W2", "M3"], value="%b '%y"),
    #         dict(dtickrange=["M6", None], value="%Y Y")
    #     ]
    # )

    pyo.plot(fig, filename=f"{conversation.title}.html")