-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtext_message_analyzer_nb.py
264 lines (178 loc) · 8.66 KB
/
text_message_analyzer_nb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
# coding: utf-8
# In[29]:
import sqlite3
import pandas as pd
import nltk
import datetime
import re
# Widen pandas' console output so wide DataFrames print on one line.
pd.set_option('display.width', 1000)
# plot within the notebook
get_ipython().magic(u'pylab inline')
#/Library/Application Support/MobileSync/Backup/d68a00b1faac8ed4cab3d6bec36ea7b05d284ddd
# Read about getting this file here: http://osxdaily.com/2010/07/08/read-iphone-sms-backup/
# connect to the text messages db file.
# NOTE(review): the connection and cursor are module-level globals and are
# never closed -- acceptable for a one-shot notebook, but worth confirming
# before reusing this code elsewhere.
conn = sqlite3.connect('3d0d7e5fb2ce288813306e4d4636395e047a3d28')
c = conn.cursor()
def getMessageDF(cursor=None):
    '''Fetch every row of the iMessage "message" table as a DataFrame.

    Parameters
    ----------
    cursor : sqlite3.Cursor, optional
        Cursor to run the query on.  Defaults to the module-level cursor
        ``c``, so existing callers are unaffected.

    Returns
    -------
    pandas.DataFrame with one column per entry in message_table_cols.
    '''
    if cursor is None:
        cursor = c
    message_table_cols = ['guid', 'service',
                          'text', 'date',
                          'date_delivered',
                          'handle_id', 'type',
                          'is_read', 'is_sent',
                          'is_delivered', 'item_type',
                          'group_title']
    # The column names are a fixed list of identifiers (not user input),
    # so interpolating them into the SELECT is safe here.
    res = cursor.execute("SELECT %s FROM message" % ','.join(message_table_cols))
    # Passing columns= to the constructor also works when the table is
    # empty, unlike assigning .columns to a zero-column frame afterwards.
    return pd.DataFrame(res.fetchall(), columns=message_table_cols)
message_df = getMessageDF()
# In[30]:
## WRANGLE THE DATA AND CATEGORIZE IT ##
# Flatten every message into a list, then into one big lowercase,
# comma-joined corpus string.  None entries (messages without text,
# e.g. pure attachments) are skipped.
ALL_MESSAGES_LIST = message_df.text.tolist()
ALL_MESSAGES_STR = ','.join(msg.lower() for msg in ALL_MESSAGES_LIST if msg is not None)
def getTokensAndPOS():
    '''Tokenize the whole message corpus and tag each token.

    Returns a (tokens, pos_tags) pair, where pos_tags is a list of
    (token, Penn-Treebank-tag) tuples produced by nltk.pos_tag.
    '''
    corpus_tokens = nltk.word_tokenize(ALL_MESSAGES_STR)
    #TODO: fix count of tokens.
    tagged_tokens = nltk.pos_tag(corpus_tokens)
    return corpus_tokens, tagged_tokens
tokens, pos_tags = getTokensAndPOS()
def getMasterWords():
    '''Build the (word, frequency) list the rest of the analysis runs on.

    Filters "boring" parts of speech and words of <= 2 characters, then
    counts frequencies.

    Side effect: filters the module-level pos_tags list in place, so
    later code (len(pos_tags), dict(pos_tags)) sees the filtered list.

    Returns a list of (word, count) tuples sorted most-common-first.
    '''
    # parts_of_speech that we don't care about because they're kind of boring.
    dont_care_about = {'IN', 'CC', 'DT', 'RB'}
    # BUG FIX: the original popped items while iterating the same list,
    # which skips the element following every removal and therefore left
    # many boring tags behind.  Rebuilding the list with a slice
    # assignment removes them all while keeping the in-place side effect.
    pos_tags[:] = [tag for tag in pos_tags if tag[1] not in dont_care_about]
    # assemble word list from POS list. take words where word > 2 characters
    word_list = [tag[0] for tag in pos_tags if len(tag[0]) > 2]
    master_words_tuple = nltk.FreqDist(word_list).most_common()
    return master_words_tuple
master_words_tuple = getMasterWords()
## CREATE MASTER WORDS DATAFRAME. ##
# create a dataframe with the master_words_tuple. the tuple included the frequency it was used.
master_words_set_df = pd.DataFrame(master_words_tuple, columns=['word', 'count_times'])
# BUG FIX: the original divided by the number of *unique* words
# (len(master_words_set_df)), so the column neither summed to 100 nor was a
# share of anything meaningful.  Divide by the total number of occurrences
# instead; float() guards against Python 2 integer division.
master_words_set_df['pct_of_total'] = (master_words_set_df.count_times / float(master_words_set_df.count_times.sum())) * 100
# In[31]:
### GENERAL DATASET INFO ###
print "Total text messages in dataset: {}".format(len(message_df.text))
print "Total words before filtering: {}".format(len(pos_tags))
print "Total words after filtering: {}".format(len(master_words_tuple))
# In[13]:
### BAD WORDS ANALYSIS ###
# Load the profanity word list, one word per line.
with open('bad_words.txt', 'r') as f:
    bad_word_list = [word.strip('\n') for word in f.readlines()]
# A set gives O(1) membership tests; the original scanned the whole list
# for every word in the corpus.
bad_word_set = set(bad_word_list)
used_bad_words = [entry for entry in master_words_tuple if entry[0] in bad_word_set]
# bad words dataframe
bad_words_df = pd.DataFrame(used_bad_words, columns=['word', 'count_times'])
bad_words_df['pct_of_total'] = (bad_words_df.count_times / len(pos_tags)) * 100
# Precompute the membership set once; the original rebuilt
# bad_words_df.word.tolist() inside the lambda for every single row.
used_bad_word_set = set(bad_words_df.word.tolist())
master_words_set_df['bad_word'] = master_words_set_df.word.apply(lambda x: 1 if x in used_bad_word_set else 0)
# In[14]:
# plot top 20 bad words. uncomment this !
#bad_words_df[:20].plot(kind='barh', x='word',figsize=(10,5))
# In[15]:
# slice the master dataframe by bad words. uncomment this!
#master_words_set_df[master_words_set_df.bad_word==1].head()
# In[16]:
## PARTS OF SPEECH ANALYSIS ##
# word -> tag lookup; when a word was tagged several ways, the last
# (word, tag) pair in pos_tags wins (plain dict() semantics).
pos_dict = dict(pos_tags)
master_words_set_df['pos'] = master_words_set_df.word.apply(lambda word: pos_dict[word])
# create a mapping dictionary so we know what we are reading!
# Penn Treebank tag -> human-readable description (the tagset nltk.pos_tag
# emits).  NOTE(review): every description carries a leading space, which
# flows into the pos_map column downstream -- presumably cosmetic only.
POS_MAP_DICT = {'CC': ' conjunction, coordinating',
                'CD': ' numeral, cardinal',
                'DT': ' determiner',
                'EX': ' existential there',
                'FW': ' foreign word',
                'IN': ' preposition or conjunction, subordinating',
                'JJ': ' adjective or numeral, ordinal',
                'JJR': ' adjective, comparative',
                'JJS': ' adjective, superlative',
                'LS': ' list item marker',
                'MD': ' modal auxiliary',
                'NN': ' noun, common, singular or mass',
                'NNP': ' noun, proper, singular',
                'NNPS': ' noun, proper, plural',
                'NNS': ' noun, common, plural',
                'PDT': ' pre-determiner',
                'POS': ' genitive marker',
                'PRP': ' pronoun, personal',
                'PRP$': ' pronoun, possessive',
                'RB': ' adverb',
                'RBR': ' adverb, comparative',
                'RBS': ' adverb, superlative',
                'RP': ' particle',
                'SYM': ' symbol',
                'TO': ' "to" as preposition or infinitive marker',
                'UH': ' interjection',
                'VB': ' verb, base form',
                'VBD': ' verb, past tense',
                'VBG': ' verb, present participle or gerund',
                'VBN': ' verb, past participle',
                'VBP': ' verb, present tense, not 3rd person singular',
                'VBZ': ' verb, present tense, 3rd person singular',
                'WDT': ' WH-determiner',
                'WP': ' WH-pronoun',
                'WP$': ' WH-pronoun, possessive',
                'WRB': ' Wh-adverb'}
# In[17]:
# Map each tag to its description; dict.get returns None for unknown tags,
# exactly like the original "x if x in POS_MAP_DICT else None" lambda.
master_words_set_df['pos_map'] = master_words_set_df.pos.apply(POS_MAP_DICT.get)
# In[18]:
# get a dataframe with the counts of each part of speech
# NOTE: DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20;
# sort_values() is the supported replacement and behaves identically here.
pos_count_df = (master_words_set_df.groupby(['pos_map'])
                .agg({'pos_map': len})
                .rename(columns={'pos_map': 'pos_count'})
                .sort_values('pos_count', ascending=False))
pos_count_df.head()
# In[19]:
## MESSAGE-BASED ANALYSIS. ##
# information about the messages, their length, is_read, etc.
# create conversion functions for the unix timestamps, which is the format stored in the SQLlite file.
def getHumanReadableDateTime(unix_timestamp):
    '''Render a unix timestamp as a local-time "YYYY-MM-DD HH:MM:SS" string.

    Accepts anything int() can convert (int or numeric string).
    '''
    as_datetime = datetime.datetime.fromtimestamp(int(unix_timestamp))
    return as_datetime.strftime('%Y-%m-%d %H:%M:%S')
def getHumanReadableDate(unix_timestamp):
    '''Render a unix timestamp as a local-time "YYYY-MM-DD" date string.

    Accepts anything int() can convert (int or numeric string).
    '''
    as_date = datetime.datetime.fromtimestamp(int(unix_timestamp))
    return as_date.strftime('%Y-%m-%d')
# In[20]:
# iMessage stores dates as seconds since the Apple (Core Data) epoch of
# 2001-01-01 00:00:00 UTC.  BUG FIX: the original prepended the character
# '1' to the number, which only lands in roughly the right era and the
# author noted it "doesn't seem to yield correct results always".  The
# correct conversion adds the fixed offset between the 1970 unix epoch and
# Apple's 2001 epoch.
APPLE_EPOCH_OFFSET = 978307200  # seconds from 1970-01-01 to 2001-01-01 UTC
message_df['date'] = message_df.date.apply(lambda x: int(x) + APPLE_EPOCH_OFFSET)
# In[21]:
message_df['datetime_readable'] = message_df.date.apply(getHumanReadableDateTime).astype('datetime64[ns]')
message_df['date_readable'] = message_df.date.apply(getHumanReadableDate).astype('datetime64')
# In[22]:
message_df.head()
# In[23]:
# create a dataframe that counts the number of texts sent per day
texts_per_day_df = (message_df.groupby(['date_readable'])
                    .agg({'text': len})
                    .rename(columns={'text': 'count_of_texts'}))
# In[24]:
texts_per_day_df.plot(kind='line', figsize=(15,10))
# In[25]:
# Sent Versus Unsent. you sent is is_sent=1, you received is is_sent=0
pd.DataFrame(message_df.is_sent.value_counts(), columns=['count_of_messages'])
# In[26]:
# Get the Number of Words in Each Text Message
# Hoist the compiled regex out of the lambda so it is parsed once rather
# than looked up in re's cache for every row.
WORD_RE = re.compile(r"[\w']+")
message_df['message_length'] = message_df.text.apply(
    lambda x: len(WORD_RE.findall(x)) if x is not None else 0)
# In[27]:
# AVERAGE MESSAGE LENGTHS
sent_msg_length_mean = message_df[message_df.is_sent==1].message_length.mean()
received_msg_length_mean = message_df[message_df.is_sent==0].message_length.mean()
print "The average length of messages I sent was {}".format(sent_msg_length_mean)
print "The average length of messages I received was {}".format(received_msg_length_mean)
# In[28]:
## Fields I found were useless: ##
# country. message_df.country.unique(). array([None], dtype=object)
# is_emote. message_df.is_emote.unique(). everything was array([0])
# message_source. message_df.message_source.unique(). everything was array([0])
# share_status: message_df.share_status.value_counts(). {0:46890,1:4}
## Fields that were useful ##
# text. the message content from the text message.
# group_title. if you use groups, this is the assigned title of the group.
# In[ ]: