Skip to content

Commit

Permalink
Simplify parsers to only preserve user messages with text-based body (#…
Browse files: browse the repository at this point in the history
  • Loading branch information
joweich authored Dec 5, 2023
1 parent da8da3a commit 66969c0
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 82 deletions.
103 changes: 36 additions & 67 deletions chatminer/chatparsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,13 +196,12 @@ def _parse_message(self, mess: str):

if ": " in author_and_body:
author, body = [x.strip() for x in author_and_body.split(": ", 1)]
return ParsedMessage(time, author, body)
elif ":." in author_and_body:
author = [x.strip() for x in author_and_body.split(":.", 1)][0]
body = "<Disappearing Message>"
self._logger.info(f"Ignoring self-destroying message on {time}.")
else:
author = "System"
body = author_and_body.strip()
return ParsedMessage(time, author, body)
self._logger.info(f"Ignoring sytem message on {time}: {body}.")


class FacebookMessengerParser(Parser):
Expand All @@ -211,17 +210,11 @@ def _read_raw_messages_from_file(self):
self._raw_messages: List[Dict[str, Any]] = json.load(f)["messages"]

def _parse_message(self, mess: Dict[str, Any]):
body: str
if "type" in mess and mess["type"] == "Share":
body = mess["share"]["link"]
elif "sticker" in mess:
body = mess["sticker"]["uri"]
elif "content" in mess:
body = mess["content"]
else:
self._logger.warning("Skipped message with unknown format: %s", mess)
if "content" not in mess:
return None

body = mess["content"]

time = dt.datetime.utcfromtimestamp(mess["timestamp_ms"] / 1000)
author = mess["sender_name"].encode("latin-1").decode("utf-8")
body = body.encode("latin-1").decode("utf-8")
Expand All @@ -234,36 +227,23 @@ def _read_raw_messages_from_file(self):
self._raw_messages: List[Dict[str, Any]] = json.load(f)["messages"]

def _parse_message(self, mess: Dict[str, Any]):
if "share" in mess:
body = "sentshare"
elif "photos" in mess:
body = "sentphoto"
elif "videos" in mess:
body = "sentvideo"
elif "audio_files" in mess:
body = "sentaudio"
elif "content" in mess:
if any(
flag in mess["content"]
for flag in (
" to your message",
" in the poll.",
" created a poll: ",
" liked a message",
"This poll is no longer available.",
"'s poll has multiple updates.",
)
):
return None
body = mess["content"]
elif all(key in ("sender_name", "timestamp_ms", "reactions") for key in mess):
body = "disappearingmessage"
elif any(key == "is_unsent" for key in mess):
if "content" not in mess:
return None
else:
self._logger.warning("Skipped message with unknown format: %s", mess)

system_messages = [
"to your message",
"in the poll.",
"created a poll: ",
"liked a message",
"This poll is no longer available.",
"'s poll has multiple updates.",
]

if any(flag in mess["content"] for flag in system_messages):
return None

body = mess["content"]

time = dt.datetime.utcfromtimestamp(mess["timestamp_ms"] / 1000)
author = mess["sender_name"].encode("latin-1").decode("utf-8")
body = body.encode("latin-1").decode("utf-8")
Expand All @@ -280,44 +260,33 @@ def _read_raw_messages_from_file(self):
json_objects = json.load(f)

if "messages" in json_objects:
self._logger.info("Detected single chat export.")
self._raw_messages = json_objects["messages"]
else:
self._logger.info("Detected batch export.")
if self.chat_name:
self._logger.info("Searching for chat %s...", self.chat_name)
for chat in json_objects["chats"]["list"]:
if "name" in chat and chat["name"] == self.chat_name:
self._raw_messages = chat["messages"]
break
else:
self._logger.info(
'No chat name was specified, searching for chat "Saved Messages"...'
)
for chat in json_objects["chats"]["list"]:
if chat["type"] == "saved_messages":
self._raw_messages = chat["messages"]
break
if not self._raw_messages:
self._logger.error(
"Chat %s was not found.",
self.chat_name if self.chat_name else "Saved Messages",
)
raise ValueError(f"{self.chat_name} not found in {self._file}")

def _parse_message(self, mess: Dict[str, Any]):
if "from" in mess and "text" in mess:
if isinstance(mess["text"], str):
body = mess["text"]
elif isinstance(mess["text"], list):
text_elements = [
m["text"] if isinstance(m, dict) else m for m in mess["text"]
]
body = " ".join(text_elements)
else:
raise ValueError(f"Unable to parse type {type(mess['text'])} in {mess}")
if "from" not in mess or "text" not in mess:
return None

time = dt.datetime.utcfromtimestamp(int(mess["date_unixtime"]))
author = mess["from"]
return ParsedMessage(time, author, body)
return None
if isinstance(mess["text"], str):
body = mess["text"]
elif isinstance(mess["text"], list):
text_elements = [
m["text"] if isinstance(m, dict) else m for m in mess["text"]
]
body = " ".join(text_elements)

time = dt.datetime.utcfromtimestamp(int(mess["date_unixtime"]))
author = mess["from"]
return ParsedMessage(time, author, body)


class WhatsAppDateFormat:
Expand Down
15 changes: 0 additions & 15 deletions test/whatsapp/target.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,6 @@
"author": "John Doe 🤓",
"message": "Lorem ipsum 🤓"
},
{
"timestamp": "2020-06-30T09:10:00",
"author": "System",
"message": "You were added"
},
{
"timestamp": "2020-06-20T00:08:00",
"author": "System",
"message": "+12 345 578 created group \"Groupname\""
},
{
"timestamp": "2020-06-10T15:55:00",
"author": "John-John Doe",
Expand All @@ -34,11 +24,6 @@
"author": "Jahn Doe",
"message": "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Duis libero."
},
{
"timestamp": "2019-01-20T11:23:00",
"author": "John Doe",
"message": "<Disappearing Message>"
},
{
"timestamp": "2019-01-01T11:25:00",
"author": "John Doe",
Expand Down

0 comments on commit 66969c0

Please sign in to comment.