-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcleaner.py
26 lines (20 loc) · 934 Bytes
/
cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
#cleaner.py
import re
def clean_corpus(chat_export_file):
    """Return the cleaned messages of a chat export file as a tuple.

    The file is first passed through ``remove_chat_metadata`` and the
    resulting lines are then filtered by ``remove_non_message_text``.
    """
    return remove_non_message_text(remove_chat_metadata(chat_export_file))
def remove_chat_metadata(chat_export_file):
    """Strip the "date, time - username: " prefix from a chat export.

    Args:
        chat_export_file: Path of the exported chat text file to read.

    Returns:
        A tuple of strings: the file content with every metadata prefix
        removed, split on newline characters. If the file ends with a
        newline, the last element is an empty string.
    """
    date_time = r"(\d+\/\d+\/\d+,\s\d+:\d+)"  # e.g. "9/16/22, 06:34"
    dash_whitespace = r"\s-\s"  # " - "
    username = r"([\w\s]+)"  # e.g. "Pratima"
    metadata_end = r":\s"  # ": "
    pattern = date_time + dash_whitespace + username + metadata_end

    # Decode explicitly instead of relying on the platform's locale encoding,
    # which can fail on (or garble) emoji and other non-ASCII message text.
    # NOTE(review): assumes the export is UTF-8 encoded - confirm for your source.
    with open(chat_export_file, "r", encoding="utf-8") as corpus_file:
        content = corpus_file.read()

    cleaned_corpus = re.sub(pattern, "", content)
    return tuple(cleaned_corpus.split("\n"))
def remove_non_message_text(export_text_lines):
    """Drop the first and last entries and any placeholder lines.

    Args:
        export_text_lines: Sliceable sequence of chat lines.

    Returns:
        A tuple of the remaining lines, in their original order, excluding
        any line that is exactly "<Media omitted>" or
        "You deleted this message".
    """
    placeholders = ("<Media omitted>", "You deleted this message")
    kept = []
    # The first and last entries are always discarded.
    # NOTE(review): presumably the export's introductory line and the empty
    # string left by a trailing newline - confirm against the export format.
    for line in export_text_lines[1:-1]:
        if line in placeholders:
            continue
        kept.append(line)
    return tuple(kept)