import messengerScraper
import json, csv
import re
from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 \
    import Features, EmotionOptions, SentimentOptions, KeywordsOptions
# path to the Watson credentials file
watsonCreds = 'ibm-key.json'
# window length, in minutes, for grouping messages into clumps
clumpMins = 20
# sentence-ending punctuation used when concatenating messages
endPunctuation = set(['!', ',', '.', ';', '?'])
# common chat abbreviations expanded during preprocessing
regexAbb = {
    'aka': 'also known as',
    'btw': 'by the way',
    'bc': 'because',
    'fyi': 'for your information',
    'idk': "I don't know",
    'imo': 'in my opinion',
    'omg': 'oh my gosh',
    'omfg': 'oh my gosh',
    'tba': 'to be announced',
    'tbd': 'to be decided',
    'thx': 'thanks',
    'wtf': 'what the heck',
    'lol': 'haha',
    'ppl': 'people',
    'brb': 'be right back' }
# load stopwords from a file (one word per line) into a set
def loadStopWords(file):
    stopwords = set()
    with open(file) as f:
        for line in f:
            stopwords.add(line.lower().rstrip())
    return stopwords
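# ibm-key.json is expected to hold the Watson service credentials as JSON,
# e.g. {"username": "<service-username>", "password": "<service-password>"}
# (placeholder values; initWatson reads exactly these two keys)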
# get an instance of Watson NaturalLanguageUnderstanding
def initWatson():
    with open(watsonCreds, 'r') as f:
        creds = json.load(f)
    return NaturalLanguageUnderstandingV1(
        username = creds['username'],
        password = creds['password'],
        version = '2017-02-27')
# combine consecutive messages into time-windowed clumps for Watson processing
def makeClumps(convos):
    clumped = []
    for convo in convos:
        allMsgs = []
        userMsgs = []
        # a clump covers clumpMins minutes starting from its first message
        clumpEndTime = convo['msg_list'][0]['date'] + clumpMins * 60
        for msg in convo['msg_list']:
            if msg['date'] > clumpEndTime:
                # current window is over; emit the clump and start a new one
                if len(userMsgs) > 0:
                    clumped.append((
                        concatToClump(convo['person'], allMsgs),
                        concatToClump(convo['person'], userMsgs)))
                allMsgs = []
                userMsgs = []
                clumpEndTime = msg['date'] + clumpMins * 60
            allMsgs.append(msg)
            if msg['user_speaking']:
                userMsgs.append(msg)
        # flush the final clump of the conversation
        if len(userMsgs) > 0:
            clumped.append((
                concatToClump(convo['person'], allMsgs),
                concatToClump(convo['person'], userMsgs)))
    return clumped
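# Each element of the returned list is a pair (all messages, user's messages
# only) over the same window; an illustrative (made-up) example:
# ({'time': 1490000000, 'user': 'Alex', 'text': 'hey. you around? yes! '},
#  {'time': 1490000060, 'user': 'Alex', 'text': 'yes! '})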
# concatenate messages into a single clump with correct user attribution
def concatToClump(user, msgs):
    clump = {'time': msgs[0]['date'], 'user': user, 'text': ''}
    for msg in msgs:
        clump['text'] += msg['body']
        # ensure each message ends with sentence punctuation before the next
        if msg['body'][-1:] not in endPunctuation:
            clump['text'] += '. '
        else:
            clump['text'] += ' '
    return clump
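# e.g. bodies ['hey', 'you free?'] concatenate to 'hey. you free? '
# (a period is appended only when a message doesn't already end in punctuation)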
# analyzes each clump with Watson and returns score/keyword records for the stats package
def analyzeClumps(clumps, stopWords):
    natural_language_understanding = initWatson()
    data = []
    for allClump, userClump in clumps:
        # sentiment/emotion are scored on the user's messages only
        userResp = natural_language_understanding.analyze(
            text = userClump['text'],
            features = Features(
                sentiment = SentimentOptions(),
                emotion = EmotionOptions()),
            language = 'en')
        # keywords are extracted from the whole conversation window
        allResp = natural_language_understanding.analyze(
            text = allClump['text'],
            features = Features(
                keywords = KeywordsOptions()),
            language = 'en')
        score = riskScore(userClump, userResp)
        keywords = getKeywords(allResp, stopWords)
        data.append({
            'time': allClump['time'],
            'user': allClump['user'],
            'score': score,
            'keywords': keywords})
    return data
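# For reference, the fields this module reads from an NLU response (version
# 2017-02-27) look roughly like:
# {'sentiment': {'document': {'label': 'positive', 'score': 0.8}},
#  'emotion': {'document': {'emotion': {'anger': 0.1, 'joy': 0.7,
#   'sadness': 0.1, 'fear': 0.05, 'disgust': 0.05}}}}
# (numbers illustrative); riskScore and getKeywords read these fields.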
# calculates a risk score for a clump from sentiment polarity and emotion
# intensities; negative clumps with strong anger/sadness score lowest
def riskScore(clump, response):
    emotions = response['emotion']['document']['emotion']
    sentiment = response['sentiment']['document']
    if sentiment['label'] == 'positive':
        return 3.5 + emotions['anger'] - emotions['sadness'] + 1.5 * emotions['joy']
    elif sentiment['label'] == 'negative':
        return 5.0 - 4.0 * (emotions['anger'] + emotions['sadness'])
    else:
        return 4.0
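# Worked example (illustrative numbers): a negative clump with anger = 0.6
# and sadness = 0.5 scores 5.0 - 4.0 * (0.6 + 0.5) = 0.6, well below the
# neutral baseline of 4.0.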
# extracts relevant keywords from the text, dropping stopwords
def getKeywords(response, stopWords):
    keywords = []
    for term in response['keywords']:
        text = term['text'].lower()
        if text not in stopWords:
            keywords.append({'term': text, 'relevance': term['relevance']})
    return keywords
# preprocess conversation to normalize text by expanding chat abbreviations
def preprocess(convo):
    for msg in convo['msg_list']:
        # match whole words (rather than \S+) so trailing punctuation such as
        # 'idk,' doesn't block a lookup, and compare case-insensitively
        msg['body'] = re.sub(r'[A-Za-z]+',
            lambda g: regexAbb.get(g.group(0).lower(), g.group(0)),
            msg['body'])
    return convo
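# e.g. preprocess({'person': 'Alex', 'msg_list': [{'body': 'idk, brb'}]})
# (hypothetical message) rewrites the body to "I don't know, be right back"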
# calculates risk scores and writes them to a csv file along with associated metadata
def writeDataToFile(convos, file):
    stopWords = loadStopWords('stopwords.txt')
    convoList = [preprocess(convo) for convo in json.loads(convos)]
    data = sorted(analyzeClumps(makeClumps(convoList), stopWords), key = lambda x: x['time'])
    with open(file, 'w', newline = '') as f:  # newline='' avoids blank rows on Windows
        writer = csv.writer(f)
        for i, curr in enumerate(data):
            # first column buckets every ten consecutive rows into one group
            writer.writerow([i // 10, curr['time'], curr['score'], curr['user'], curr['keywords']])
if __name__ == "__main__":
    convos = messengerScraper.scrapeAll('data')
    writeDataToFile(convos, 'data.csv')