recording_splitter.py

# TODO: Session XML header cfg.
from collections import deque
import os
import wave
import logging

from fnnvad import FFNNVAD

LOGGING_FORMAT = '%(asctime)s [%(levelname)s] %(name)s: %(message)s'

default_vad_cfg = {
    'framesize': 512,
    'frameshift': 160,
    'sample_rate': 8000,
    'usehamming': True,
    'preemcoef': 0.97,
    'numchans': 26,
    'ceplifter': 22,
    'numceps': 12,
    'enormalise': True,
    'zmeansource': True,
    'usepower': True,
    'usec0': False,
    'usecmn': False,
    'usedelta': False,
    'useacc': False,
    'n_last_frames': 30,  # 15,
    'n_prev_frames': 15,
    'mel_banks_only': True,
    'lofreq': 125,
    'hifreq': 3800,
    'model': 'vad_nnt_1196_hu512_hl1_hla3_pf30_nf15_acf_4.0_mfr32000000_mfl1000000_mfps0_ts0_usec00_usedelta0_useacc0_mbo1_bs1000.tffnn',
    'filter_length': 2,
}
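
# A note on the analysis timing implied by the config above (derived here for
# illustration, assuming the usual HTK-style meaning of these fields): at
# sample_rate 8000, a framesize of 512 samples is a 64 ms analysis window and
# a frameshift of 160 samples is a 20 ms hop, i.e. the VAD produces roughly
# 50 raw decisions per second.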


class RecordingSplitter(object):
    CHANGE_TO_NON_SPEECH = 2
    CHANGE_TO_SPEECH = 1

    speech_thresh = 0.7
    non_speech_thresh = 0.1

    read_buffer_size = 128

    def __init__(self, vad_cfg, speech_thresh=0.7, non_speech_thresh=0.1):
        self.vad_cfg = vad_cfg
        self.speech_thresh = speech_thresh
        self.non_speech_thresh = non_speech_thresh

        logging.info('Loading VAD model.')
        self.vad = FFNNVAD(**vad_cfg)

    def split_single_channel_wav(self, file_name, out_dir, out_prefix):
        logging.info('Splitting %s' % file_name)
        wave_in = wave.open(file_name)

        sample_rate = wave_in.getframerate()
        sample_width = wave_in.getsampwidth()
        bytes_per_second = sample_rate * sample_width
        frames_per_second = bytes_per_second // self.read_buffer_size

        (detection_window_sil,
         detection_window_speech,
         pre_detection_buffer) = self._initialize_buffers(frames_per_second)

        res_files = []
        res_file_cntr = 0

        frames = []
        is_speech = False
        n_read = 0
        n_read_beg = None

        while True:
            audio_data = wave_in.readframes(self.read_buffer_size)
            # Track the position in bytes so that n_read / bytes_per_second
            # yields seconds (readframes returns sample_width bytes per frame).
            n_read += len(audio_data)

            if len(audio_data) == 0:
                break

            raw_vad_decision = self.vad.decide(audio_data)
            is_speech, change = self._smooth_decision(
                raw_vad_decision, is_speech,
                detection_window_speech, detection_window_sil)

            if not is_speech:
                pre_detection_buffer.append(audio_data)

            if change == self.CHANGE_TO_SPEECH:
                n_read_beg = n_read - len(audio_data)
                frames = []
            elif change == self.CHANGE_TO_NON_SPEECH:
                self._save_part(res_file_cntr,
                                list(pre_detection_buffer) + frames,
                                out_dir, res_files, wave_in, out_prefix,
                                n_read_beg, n_read, bytes_per_second)
                res_file_cntr += 1
                pre_detection_buffer.extend(
                    frames[-pre_detection_buffer.maxlen:])

            if is_speech:
                frames.append(audio_data)

        # Flush a segment that is still open at the end of the file. A
        # segment already saved on the change to non-speech must not be saved
        # again, and the explicit None check matters because a segment that
        # starts at the very beginning of the file has n_read_beg == 0,
        # which is falsy.
        if is_speech and n_read_beg is not None:
            self._save_part(res_file_cntr, frames, out_dir, res_files,
                            wave_in, out_prefix, n_read_beg, n_read,
                            bytes_per_second)

        return res_files

    def _initialize_buffers(self, frames_per_second):
        # Keep roughly 0.5 s of audio from just before each detected speech
        # onset, and smooth the raw VAD decisions over ~0.2 s windows.
        pre_detection_buffer_frames = int(frames_per_second * 0.5)
        smooth_decision_window_sil = int(frames_per_second * 0.2)
        smooth_decision_window_speech = int(frames_per_second * 0.2)

        detection_window_speech = deque(maxlen=smooth_decision_window_speech)
        detection_window_sil = deque(maxlen=smooth_decision_window_sil)
        pre_detection_buffer = deque(maxlen=pre_detection_buffer_frames)

        return (detection_window_sil, detection_window_speech,
                pre_detection_buffer)

    def _smooth_decision(self, decision, last_vad,
                         detection_window_speech, detection_window_sil):
        detection_window_speech.append(decision)
        detection_window_sil.append(decision)

        speech = float(sum(detection_window_speech)) / (len(detection_window_speech) + 1.0)
        sil = float(sum(detection_window_sil)) / (len(detection_window_sil) + 1.0)

        vad = last_vad
        change = None

        if last_vad:
            # The last smoothed decision was speech; switch to non-speech
            # only once the fraction of speech frames in the window drops
            # below non_speech_thresh.
            if sil < self.non_speech_thresh:
                vad = False
                change = self.CHANGE_TO_NON_SPEECH
        else:
            # Switch to speech only once the fraction rises above
            # speech_thresh.
            if speech > self.speech_thresh:
                vad = True
                change = self.CHANGE_TO_SPEECH

        return vad, change
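
    # Illustration of the hysteresis above (example numbers, not from the
    # code): with 12-slot windows, speech_thresh=0.7 and
    # non_speech_thresh=0.1, the state flips to speech once at least 10 of
    # the last 12 raw decisions are speech (10 / 13 > 0.7), and back to
    # non-speech once at most 1 of them is (1 / 13 < 0.1). The +1.0 in the
    # denominator biases windows that are still filling toward no change.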

    def _save_part(self, cntr, frames, out_dir, res_files, wave_in,
                   out_prefix, n_read_beg, n_read_end, bytes_per_second):
        content = b''.join(frames)
        logging.info('Saving part %d (%.1f s).'
                     % (cntr, len(content) * 1.0 / bytes_per_second))

        res_file = os.path.join(out_dir,
                                'part.%s.%.3d.wav' % (out_prefix, cntr, ))

        wf = wave.open(res_file, 'wb')
        wf.setnchannels(wave_in.getnchannels())
        wf.setsampwidth(wave_in.getsampwidth())
        wf.setframerate(wave_in.getframerate())
        wf.writeframes(content)
        wf.close()

        res_files.append(((n_read_beg * 1.0 / bytes_per_second,
                           n_read_end * 1.0 / bytes_per_second), res_file))
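

# A minimal usage sketch for the class above (illustrative; assumes
# `mono.wav` is an 8 kHz, 16-bit mono file matching default_vad_cfg and that
# the configured model file is present in the working directory):
#
#   rs = RecordingSplitter(vad_cfg=default_vad_cfg)
#   parts = rs.split_single_channel_wav('mono.wav', 'out_dir', 'a')
#   for (t_begin, t_end), fname in parts:
#       logging.info('%.2f-%.2f s -> %s', t_begin, t_end, fname)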


def main(input_dir, pcm_sample_rate, output_dir, v):
    if v:
        logging.basicConfig(level=logging.DEBUG, format=LOGGING_FORMAT)
    else:
        logging.basicConfig(format=LOGGING_FORMAT)

    _mkdir_if_not_exists(output_dir)

    logging.info('Starting.')
    to_process = _find_files_to_split(input_dir)

    vad_cfg = default_vad_cfg
    _download_vad_model_if_not_exists(vad_cfg)

    rs = RecordingSplitter(vad_cfg=vad_cfg)
    _split_files(rs, output_dir, to_process, pcm_sample_rate)


def _mkdir_if_not_exists(output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)


def _download_vad_model_if_not_exists(vad_cfg):
    if not os.path.exists(vad_cfg['model']):
        os.system('wget "%s"' % (
            'https://vystadial.ms.mff.cuni.cz/download/alex/resources/vad'
            '/voip/%s' % vad_cfg['model'], ))


def _find_files_to_split(input_dir):
    to_process = []
    logging.info('Searching for files.')
    for root, dirs, files in os.walk(input_dir):
        for file_name in files:
            if file_name.endswith('.pcm'):
                to_process.append(
                    (file_name, os.path.relpath(root, start=input_dir), root))
    return to_process


def _split_files(rs, output_dir, to_process, pcm_sample_rate):
    logging.info('Processing files.')
    for file_name, root, abs_root in to_process:
        file_out_dir = os.path.join(output_dir, root, file_name)
        logging.debug('Splitting %s (root: %s) into %s',
                      file_name, root, file_out_dir)
        files = _split_2chan_pcm(rs, abs_root, file_name, file_out_dir,
                                 pcm_sample_rate, root)
        _create_session_xml(file_out_dir, files)


def _split_2chan_pcm(rs, abs_root, file_name, out_dir, sample_rate, root):
    file_path = os.path.join(abs_root, file_name)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    wav_path_a = os.path.join(out_dir, 'all.a.wav')
    wav_path_b = os.path.join(out_dir, 'all.b.wav')

    _convert_to_wav(file_path, sample_rate, wav_path_a, 1)
    _convert_to_wav(file_path, sample_rate, wav_path_b, 2)

    res_files1 = rs.split_single_channel_wav(wav_path_a, out_dir, "a")
    res_files2 = rs.split_single_channel_wav(wav_path_b, out_dir, "b")

    res = res_files1 + res_files2
    # Interleave the parts of both channels by segment begin time; each
    # element is ((t_begin, t_end), file_name).
    res.sort(key=lambda item: item[0][0])
    return res


def _convert_to_wav(in_file, sample_rate, out_file, chan):
    # Decode the raw, 2-channel, 16-bit signed PCM and keep only the channel
    # given by `chan` (sox `remix`).
    os.system('sox -e signed-integer -b 16 -r %d -c 2 -t raw "%s" "%s" '
              'remix %d' % (sample_rate, in_file, out_file, chan, ))
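
# With sample_rate=8000 and chan=1, the command above expands to (paths
# illustrative):
#
#   sox -e signed-integer -b 16 -r 8000 -c 2 -t raw "call1.pcm" "all.a.wav" remix 1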


def _create_session_xml(output_dir, files):
    res = """<?xml version="1.0" encoding="utf-8"?>
<dialogue>
<config>
</config>
<header>
<host>{host}</host>
<date>{date}</date>
<system>{system}</system>
<version>{version}</version>
<input_source type="voip"/>
</header>
{turns}
</dialogue>
"""
    turn_tpl = """<turn speaker="user" time="{turn_time}" turn_number="{turn_num}">
<rec starttime="{rec_starttime}" endtime="{rec_endtime}" fname="{rec_filename}" />
</turn>"""

    res_turns = []
    for i, ((ts, te), fn) in enumerate(files):
        turn = turn_tpl.format(turn_time=ts,
                               turn_num=i + 1,
                               rec_starttime=ts,
                               rec_endtime=te,
                               rec_filename=os.path.basename(fn))
        res_turns.append(turn)

    session_fn = os.path.join(output_dir, 'session.xml')
    with open(session_fn, 'w') as f_out:
        f_out.write(res.format(turns="\n".join(res_turns),
                               host="", date="", system="", version=""))
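

# A rendered turn in session.xml looks like (values illustrative):
#
#   <turn speaker="user" time="1.28" turn_number="1">
#   <rec starttime="1.28" endtime="3.84" fname="part.a.000.wav" />
#   </turn>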


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Recording Splitter takes an input directory with PCM '
                    'recordings and splits them, using a voice activity '
                    'detection mechanism, into the output directory. It '
                    'expects the PCM recordings to be 2-channel, where each '
                    'channel contains one side of a dialog. The input '
                    'directory structure is preserved in the output '
                    'directory, where each input file corresponds to an '
                    'output folder of the same name.')
    parser.add_argument('input_dir')
    parser.add_argument('output_dir')
    parser.add_argument('--pcm_sample_rate', type=int, default=8000)
    parser.add_argument('-v', action='store_true',
                        help='enable verbose (DEBUG) logging')

    args = parser.parse_args()
    main(**vars(args))
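
# Example invocation (paths illustrative):
#
#   python recording_splitter.py ./pcm_recordings ./split --pcm_sample_rate 8000 -v
#
# For an input file ./pcm_recordings/calls/call1.pcm this creates
# ./split/calls/call1.pcm/ containing all.a.wav and all.b.wav (the two
# decoded channels), the per-segment part.a.NNN.wav / part.b.NNN.wav files,
# and a session.xml that lists every detected segment as a user turn.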