This repository has been archived by the owner on Oct 16, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild_db.py
94 lines (79 loc) · 3.93 KB
/
build_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python3
import click
import json
def get_conversations(data):
    """Return a dict mapping each conversation ID to its participants' names.

    Args:
        data: Parsed export. ``data["conversation_state"]`` is iterated, and
            each entry is read at ``["conversation_id"]["id"]`` and
            ``["conversation_state"]["conversation"]["participant_data"]``.

    Returns:
        dict: Conversation ID -> list of ``fallback_name`` values, in the
        order the participants are listed. Participants that have no
        ``fallback_name`` key are left out. If the same ID occurs more than
        once, the last occurrence wins.

    Raises:
        KeyError: If an entry lacks one of the keys named under ``data``.
    """
    convos = {}
    for entry in data["conversation_state"]:
        conversation_id = entry["conversation_id"]["id"]
        participants = entry["conversation_state"]["conversation"]["participant_data"]
        # Filter explicitly instead of catching KeyError: a participant
        # without a name is simply not listed.
        convos[conversation_id] = [
            person["fallback_name"]
            for person in participants
            if "fallback_name" in person
        ]
    return convos
def _normalise_message(text):
    """Return *text* stripped, capitalised and punctuated, or None if blank."""
    text = text.strip()
    if not text:
        return None
    # Append a period if the sentence is not otherwise punctuated.
    if not text.endswith((".", "!", "?")):
        text += "."
    # Upper-case only the first character; the rest is left untouched.
    return text[0].upper() + text[1:]


def generate_corpus(data, convo_id):
    """Return the cleaned-up message texts of one conversation.

    Walks every event of every entry in ``data["conversation_state"]`` and
    keeps the events whose ``["conversation_id"]["id"]`` equals *convo_id*.
    Each segment of such an event that carries a ``"text"`` key yields one
    corpus entry, so a message made of several text segments contributes
    several entries.

    Args:
        data: Parsed export, read at
            ``["conversation_state"][i]["conversation_state"]["event"]``.
        convo_id: ID of the conversation to extract.

    Returns:
        list[str]: Segment texts in encounter order, each stripped, given a
        trailing period when it does not already end in ``.``, ``!`` or
        ``?``, and with its first character upper-cased. Segments whose text
        is empty after stripping are skipped.
    """
    corpus = []
    for state in data["conversation_state"]:
        # Entries without an "event" list contribute nothing.
        for event in state["conversation_state"].get("event", ()):
            if "chat_message" not in event:
                continue
            message_content = event["chat_message"]["message_content"]
            if "segment" not in message_content:
                continue
            segments = message_content["segment"]
            if not segments:
                continue
            # The conversation ID belongs to the event, not the segment, so
            # compare it once per event rather than once per segment.
            if event["conversation_id"]["id"] != convo_id:
                continue
            for segment in segments:
                if "text" not in segment:
                    continue
                message = _normalise_message(segment["text"])
                if message is not None:
                    corpus.append(message)
    return corpus
# NOTE: click uses the docstring of ``main`` as the command's --help text, so
# it is kept to a single user-facing sentence; implementation notes live in
# comments instead.
@click.command()
@click.argument("jsonfile", type=click.Path(exists=True))
def main(jsonfile):
    """Extracts a specific conversation from Hangouts data exported from Google Takeout, and readies it for use by Wynbot."""
    # json.load accepts a binary file object and detects the encoding itself.
    with open(jsonfile, "rb") as file:
        data = json.load(file)

    # List every conversation so the user can choose which one to extract.
    convos = get_conversations(data)
    if not convos:
        # Without this guard the prompt below could never be answered.
        raise click.ClickException(
            "No conversations found in {0}.".format(jsonfile)
        )

    # Menu numbers are 1-based positions in this list.
    convo_ids = list(convos)
    print("{:<3} {:<35} {:<50}".format("No.", "Convo ID", "Participants"))
    for index, convo_id in enumerate(convo_ids, start=1):
        print(
            "{num:<3} {convo_id:<35} {participants}".format(
                num=index,
                convo_id=convo_id,
                participants=", ".join(convos[convo_id]),
            )
        )

    # click.prompt re-asks on non-numeric or out-of-range input instead of
    # letting a ValueError/KeyError traceback reach the user.
    selection = click.prompt(
        "Enter no. of conversation to use",
        type=click.IntRange(1, len(convo_ids)),
    )
    selected_convo_id = convo_ids[selection - 1]

    # Generate corpus of message text using the chosen conversation.
    corpus = generate_corpus(data, selected_convo_id)

    # Output text file with each message on a new line.
    with open("corpus.txt", "w", encoding="utf-8") as file:
        file.writelines("{0}\n".format(line) for line in corpus)
    print('Corpus extracted to "corpus.txt".')


if __name__ == "__main__":
    main()