-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse.py
181 lines (159 loc) · 6.79 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import sys
import re
import string
import codes
import os
def remove_non_verbals(txt):  # (* ... *)
    """Strip non-verbal annotations of the form ``(* ... *)`` from *txt*.

    An annotation is ``(*``, followed by one or more characters that are
    neither ``*`` nor ``)``, followed by ``*)``.

    Returns the ``(new_text, removal_count)`` tuple produced by ``re.subn``;
    callers index ``[0]`` to get the cleaned text.
    """
    # Raw string: "\(" and "\*" are regex escapes, not string escapes.
    nv_pattern = re.compile(r"\(\*[^\*\)]+\*\)")
    return nv_pattern.subn('', txt)
def remove_timestamps(txt):
    """Remove ``[hh:mm:ss.cc] `` timestamps and whatever precedes them.

    Each match starts at the beginning of a line, consumes every character
    up to the first ``[`` and then a bracketed ``dd:dd:dd.dd`` timestamp plus
    one trailing space.

    NOTE: the leading ``[^\\[]*`` also matches newlines, so a line without a
    timestamp that sits directly before a stamped line is removed together
    with that line's prefix.

    Returns the ``(new_text, removal_count)`` tuple produced by ``re.subn``.
    """
    # re.M makes "^" anchor at the start of every line, not just the text.
    ts_pattern = re.compile(r"^[^\[]*\[\d\d:\d\d:\d\d\.\d\d\] ", re.M)
    return ts_pattern.subn('', txt)
def remove_speakers(txt, speakers=("Male", "Female", "Participant A", "Participant B")):
    """Remove ``<speaker>; `` labels from *txt*.

    txt      -- transcript text.
    speakers -- iterable of speaker labels to strip. Defaults to the four
                labels this function has always removed. Labels are matched
                literally (they are regex-escaped).

    Returns the ``(new_text, removal_count)`` tuple produced by ``re.subn``.
    """
    labels = [re.escape(label) for label in speakers]
    if not labels:
        # An empty alternation would match every bare "; " in the text.
        return (txt, 0)
    speaker_pattern = re.compile("(?:" + "|".join(labels) + "); ")
    return speaker_pattern.subn('', txt)
def remove_codes(txt):
    """Delete every annotation code listed in ``codes.codes`` from *txt*.

    All code strings from every entry of ``codes.codes`` are joined with
    ``|`` into one regular expression (they are not escaped, so each one is
    interpreted as a regex fragment) and every match is replaced by ''.

    Returns the ``(new_text, removal_count)`` tuple produced by ``re.subn``.
    """
    flattened = [
        code
        for code_type in codes.codes
        for code in codes.codes[code_type]
    ]
    return re.compile("|".join(flattened)).subn('', txt)
def strip_lines(txt):
    """Remove leading whitespace from every line of *txt*.

    Because ``\\s`` also matches newlines, runs of blank lines are removed
    as part of the same match.

    Returns the ``(new_text, removal_count)`` tuple produced by ``re.subn``.
    """
    # Raw string so "\s" reaches the regex engine untouched; re.M anchors
    # "^" at the start of each line.
    whitespace_pattern = re.compile(r"^\s+", re.M)
    return whitespace_pattern.subn('', txt)
def remove_empty_brackets(txt):
    """Remove empty bracket groups ``[]``, ``[[]]`` and ``[[[]]]`` from *txt*.

    Only these three nesting depths are recognised.

    Returns the ``(new_text, removal_count)`` tuple produced by ``re.subn``.
    """
    # Raw string: every bracket here is a regex escape, not a string escape.
    empty_pattern = re.compile(r"\[\]|\[\[\]\]|\[\[\[\]\]\]")
    return empty_pattern.subn('', txt)
def get_participant_words(participant, txt):
    """Return the word count that *participant* contributes to *txt*.

    Thin wrapper that picks the ``'words'`` entry out of the result of
    ``get_participant_words_and_utterances``.
    """
    stats = get_participant_words_and_utterances(participant, txt)
    return stats[participant]['words']
def get_participant_words_and_utterances(participant, txt):
    """Count the words and utterances spoken by *participant* in *txt*.

    The transcript is cleaned first (non-verbals, timestamps, codes,
    story-teller markers, empty brackets and leading whitespace are removed,
    in that order). Every remaining line that starts with *participant* is
    attributed to that speaker; the speaker label itself is not counted.

    Returns ``{participant: {'words': int, 'utterances': int}}`` where an
    utterance is a line of that speaker with at least one word after the
    label.
    """
    cleaned = remove_non_verbals(txt)[0]
    cleaned = remove_timestamps(cleaned)[0]
    cleaned = remove_codes(cleaned)[0]
    cleaned = remove_story_teller_markers(cleaned)[0]
    cleaned = remove_empty_brackets(cleaned)[0]
    cleaned = strip_lines(cleaned)[0].strip()

    # The label may span several whitespace-separated tokens
    # ("Participant A;" is two), so skip as many tokens as the label has
    # instead of always skipping exactly one.
    label_tokens = max(1, len(participant.split()))

    word_total = 0
    utterance_total = 0
    for line in cleaned.split('\n'):
        if not line.startswith(participant):
            continue
        spoken = line.split()[label_tokens:]
        word_total += len(spoken)
        if spoken:
            utterance_total += 1

    return {
        participant: {
            'words': word_total,
            'utterances': utterance_total,
        }
    }
def remove_story_teller_markers(txt):
    """Delete every marker listed in ``codes.story_teller_markers`` from *txt*.

    The markers are joined with ``|`` into a single regular expression
    (unescaped) and each match is replaced by the empty string.

    Returns the ``(new_text, removal_count)`` tuple produced by ``re.subn``.
    """
    marker_regex = "|".join(codes.story_teller_markers)
    return re.compile(marker_regex).subn('', txt)
def determine_story_teller(txt):
    """Return the speaker label that follows a story-teller marker in *txt*.

    Searches for any marker from ``codes.story_teller_markers`` followed, on
    the same line, by ``<speaker>;`` where <speaker> is one of the labels
    found by ``get_all_speakers``. The first non-empty capture is returned.

    Prints a diagnostic and returns ``None`` when nothing matches.
    """
    speakers = get_all_speakers(txt)
    marker_alternatives = "|".join(codes.story_teller_markers)
    # Speaker labels come from the transcript itself, so match them literally.
    speaker_alternatives = "|".join(re.escape(name) for name in speakers)
    # Group the markers: without the (?:...) the "|" would split the whole
    # expression and only the last marker would be tied to the speaker
    # capture.
    story_teller_pattern = re.compile(
        "(?:" + marker_alternatives + ").+(" + speaker_alternatives + ");"
    )
    # Drop empty captures (possible when no speakers were found).
    results = [found for found in story_teller_pattern.findall(txt) if found]
    if not results:
        print("NO STORYTELLER MARKERS (" + ",".join(codes.story_teller_markers) + ") FOUND!")
        return None
    return results[0]
def get_all_speakers(txt):
    """Return the set of distinct speaker labels found in *txt*.

    A label is the text between ``dd] `` (the tail of a timestamp) and the
    next ``;`` on the same line.
    """
    # Raw string: "\d" and "\]" are regex escapes, not string escapes.
    speaker_pattern = re.compile(r"\d\d\] ([^;\n]+);")
    return set(speaker_pattern.findall(txt))
def filter_out_laughs(li):
    """Return the items of *li* that do not contain the substring 'laugh'.

    The check is case-sensitive; the input list is not modified.
    """
    return [elem for elem in li if 'laugh' not in elem]
def get_listener_nv_count(txt):
    """Count the listener's non-verbal ``(* ... *)`` annotations, minus laughter.

    The listener is a speaker other than the one returned by
    ``determine_story_teller``; with more than two speakers ``set.pop()``
    picks one of the remaining ones arbitrarily. Raises ``KeyError`` when
    there is no such speaker.

    Two kinds of annotation are counted, line by line, after codes,
    timestamps and leading whitespace have been removed:

    * on any line, annotations whose text starts with the listener's label
      (a non-verbal may not belong to the speaker of the line, in which case
      it starts with the participant's identifier);
    * on lines that start with the listener's label and have more than one
      token, every annotation.

    Annotations containing 'laugh' are ignored in both cases.
    """
    parsed = strip_lines(remove_timestamps(remove_codes(txt)[0])[0])[0].strip()
    all_speakers = get_all_speakers(txt)
    speaker = determine_story_teller(txt)
    listener = all_speakers.difference([speaker]).pop()

    simple_nv_pattern = re.compile(r"\(\*[^\*\)]+\*\)")
    # The label is transcript text, so escape it before embedding in a regex.
    listener_nv_pattern = re.compile(
        r"\(\* ?" + re.escape(listener) + r"[^\*\)]+\*\)"
    )

    listener_nvs = 0
    for line in parsed.split('\n'):
        # Annotations explicitly attributed to the listener, on any line.
        listener_nvs += len(filter_out_laughs(listener_nv_pattern.findall(line)))
        # All annotations on the listener's own lines.
        # NOTE(review): an annotation on the listener's own line that also
        # starts with the listener's label is counted by both patterns;
        # confirm that this is intended.
        if len(line.split()) > 1 and line.startswith(listener):
            listener_nvs += len(filter_out_laughs(simple_nv_pattern.findall(line)))
    return listener_nvs
def get_listener_lul_count(txt, listener=None):
    """Count ``LUL`` occurrences on the lines spoken by the listener.

    txt      -- raw transcript text.
    listener -- speaker label of the listener. When ``None`` it is derived
                as a speaker other than the one returned by
                ``determine_story_teller`` (``set.pop()`` on the remaining
                speakers; raises ``KeyError`` if there are none).

    Timestamps and leading whitespace are removed first; codes are left in
    place. A line belongs to the listener when it starts with the listener's
    label.
    """
    if listener is None:
        all_speakers = get_all_speakers(txt)
        speaker = determine_story_teller(txt)
        listener = all_speakers.difference([speaker]).pop()

    parsed = strip_lines(remove_timestamps(txt)[0])[0].strip()

    lul_count = 0
    for line in parsed.split('\n'):
        # Compare against the whole line, not just its first token: a label
        # such as "Participant A" spans two tokens and could never match a
        # single token. This also avoids indexing into an empty token list
        # on empty text.
        if line.startswith(listener):
            lul_count += line.count("LUL")
    return lul_count
def get_texts(filename):
    """Read the transcript *filename* and print its statistics to stdout.

    Printed, in order: the file name, total word count, total utterance
    count (number of lines left after cleaning), the story teller, one
    words/utterances dict per speaker, the listener's non-verbal count and
    the listener's LUL count. When no story teller can be determined the
    function returns after the totals.

    Returns ``None``.
    """
    with open(filename, 'r') as f:
        raw_text = f.read()

    cleaned = remove_non_verbals(raw_text)[0]
    cleaned = remove_timestamps(cleaned)[0]
    cleaned = remove_speakers(cleaned)[0]
    cleaned = remove_codes(cleaned)[0]
    cleaned = remove_empty_brackets(cleaned)[0]
    cleaned = strip_lines(cleaned)[0].strip()

    # Total Word Count (full word and disfluency instances)
    total_word_count = len(cleaned.split())
    total_utterances = len(cleaned.split('\n'))
    all_speakers = get_all_speakers(raw_text)

    # Each print takes a single pre-formatted string so the output is the
    # same under Python 2 and the statement also parses under Python 3.
    print(filename)
    print('%s %s' % ('Total Word Count (full word and disfluency instances): ', total_word_count))
    print('%s %s' % ('Total Utterance Count (full stop, continuer, try-marker, and truncated ending instances): ', total_utterances))

    the_speaker = determine_story_teller(raw_text)
    if the_speaker is None:
        return
    print('Speaker: ' + the_speaker)

    for speaker in all_speakers:
        print(get_participant_words_and_utterances(speaker, raw_text))

    print('%s %s' % ('Listener Non-verbal Count ( "(* *)"" instances, not including laughter)', get_listener_nv_count(raw_text)))
    print('%s %s' % ('Listener Unilateral Laughter Count (LUL instances)', get_listener_lul_count(raw_text)))
if __name__ == '__main__':
    # Usage: python parse.py input_dir
    # Prints the statistics of every transcript file directly inside
    # input_dir, separated by a 60-dash rule.
    if len(sys.argv) < 2:
        print('usage: python parse.py input_dir')
    else:
        input_dir = sys.argv[1]
        # Sorted so that the report order is stable between runs.
        for fname in sorted(os.listdir(input_dir)):
            path = os.path.join(input_dir, fname)
            # Skip Finder metadata and anything that cannot be opened as a
            # file (sub-directories would make open() fail).
            if '.DS_Store' in fname or not os.path.isfile(path):
                continue
            print("-" * 60)
            get_texts(path)