-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutils.py
130 lines (103 loc) · 4.62 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import numpy as np
import os
import librosa
from PIL import Image
from noise_filtering_algorithm import noise_filtering
# Global configuration constants
# Seed value; not referenced by the code shown in this file -- presumably
# consumed by modules that import it (TODO confirm).
RANDOM_SEED = 1337
# Sampling rate passed as `sr` to librosa.load and to the mel spectrogram.
SAMPLE_RATE = 32000
# Length of each audio chunk; multiplied by SAMPLE_RATE to get samples per chunk.
SIGNAL_LENGTH = 5 # seconds
# SPEC_SHAPE[0] is used as the number of mel bands; SPEC_SHAPE[1] is used to
# derive the spectrogram hop length.
SPEC_SHAPE = (224, 224) # height x width
# Passed as `fmin` / `fmax` to librosa.feature.melspectrogram.
FMIN = 20
FMAX = 16000
# Audio pre-emphasis: boost the high-frequency part of the signal
def audio_augment(signal, pre_emphasis=0.97):
    """Apply a first-order pre-emphasis filter to an audio signal.

    Pre-emphasis is applied to the raw audio before feature extraction to
    boost its high-frequency content. librosa's log-mel spectrogram
    features have no pre-emphasis step, so it is added manually here.

    Args:
        signal: 1-D array-like of audio samples (a NumPy array or a plain
            sequence of numbers).
        pre_emphasis: Filter coefficient ``a`` in
            ``y[t] = x[t] - a * x[t - 1]``. Defaults to 0.97.

    Returns:
        A new ``numpy.ndarray`` with the same number of samples as the
        input. The first sample is passed through unchanged. An empty
        input yields an empty array.
    """
    samples = np.asarray(signal)
    if samples.size == 0:
        # Nothing to filter; indexing samples[0] below would raise IndexError.
        return samples.copy()
    return np.append(samples[0], samples[1:] - pre_emphasis * samples[:-1])
# Split an audio file into fixed-length chunks, extract a mel spectrogram
# from each chunk and save the spectrograms as images in a working directory
def get_spectrograms(filepath, primary_label, output_dir, duration=55):
    """Convert an audio file into per-chunk mel-spectrogram PNG images.

    The file is loaded at SAMPLE_RATE and cut into consecutive chunks of
    SIGNAL_LENGTH seconds. Each complete chunk is passed through
    noise_filtering and audio_augment, converted to a dB-scaled mel
    spectrogram, min-max normalized and written as an 8-bit grayscale PNG
    to ``<output_dir>/<primary_label>/<file stem>_<chunk index>.png``.

    Args:
        filepath: Path of the audio file to read.
        primary_label: Label name; used as the sub-directory of
            ``output_dir`` the images are written to.
        output_dir: Root directory for the generated images.
        duration: Maximum number of seconds read from the start of the
            file. Defaults to 55.

    Returns:
        List of the saved image paths, in chunk order. Empty when the
        audio is shorter than one chunk (no directory is created then).
    """
    chunk_len = int(SIGNAL_LENGTH * SAMPLE_RATE)

    # librosa documents `offset` as a float number of seconds, so pass 0.0
    # (start of file) rather than None.
    sig, _ = librosa.load(filepath, sr=SAMPLE_RATE, offset=0.0, duration=duration)

    # Split the signal into chunks; a shorter trailing remainder is dropped.
    sig_splits = []
    for start in range(0, len(sig) - chunk_len + 1, chunk_len):
        split = sig[start:start + chunk_len]
        split = noise_filtering(split)  # noise filtering
        split = audio_augment(split)
        sig_splits.append(split)

    if not sig_splits:
        return []

    # Everything below is identical for every chunk, so compute it once.
    save_dir = os.path.join(output_dir, primary_label)
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(save_dir, exist_ok=True)
    # basename/splitext handle both path separators, unlike rsplit(os.sep).
    stem = os.path.splitext(os.path.basename(filepath))[0]
    # Hop length is derived from the target spectrogram width SPEC_SHAPE[1].
    hop_length = int(SIGNAL_LENGTH * SAMPLE_RATE / (SPEC_SHAPE[1] - 1))

    # Extract and save a mel spectrogram for each audio chunk
    saved_samples = []
    for s_cnt, chunk in enumerate(sig_splits):
        mel_spec = librosa.feature.melspectrogram(y=chunk,
                                                  sr=SAMPLE_RATE,
                                                  n_fft=2048,
                                                  hop_length=hop_length,
                                                  n_mels=SPEC_SHAPE[0],
                                                  fmin=FMIN,
                                                  fmax=FMAX)
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Normalize to [0, 1]. A constant spectrogram (e.g. a silent chunk)
        # has a zero range; skip the division so it stays all zeros instead
        # of turning into NaNs.
        mel_spec -= mel_spec.min()
        peak = mel_spec.max()
        if peak > 0:
            mel_spec /= peak

        # Save as an 8-bit grayscale image file
        save_path = os.path.join(save_dir, "{}_{}.png".format(stem, s_cnt))
        im = Image.fromarray(mel_spec * 255.0).convert("L")
        im.save(save_path)
        saved_samples.append(save_path)

    return saved_samples
def get_spectrograms_evaluation(filepath, primary_label, output_dir, duration=20):
    """Convert an evaluation audio file into mel-spectrogram PNG images.

    Follows the same chunk -> spectrogram -> PNG pipeline as
    get_spectrograms, but by default reads only the first 20 seconds of
    the file. The audio is loaded at SAMPLE_RATE and cut into consecutive
    chunks of SIGNAL_LENGTH seconds. Each complete chunk is passed through
    noise_filtering and audio_augment, converted to a dB-scaled mel
    spectrogram, min-max normalized and written as an 8-bit grayscale PNG
    to ``<output_dir>/<primary_label>/<file stem>_<chunk index>.png``.

    Args:
        filepath: Path of the audio file to read.
        primary_label: Label name; used as the sub-directory of
            ``output_dir`` the images are written to.
        output_dir: Root directory for the generated images.
        duration: Maximum number of seconds read from the start of the
            file. Defaults to 20.

    Returns:
        List of the saved image paths, in chunk order. Empty when the
        audio is shorter than one chunk (no directory is created then).
    """
    chunk_len = int(SIGNAL_LENGTH * SAMPLE_RATE)

    # librosa documents `offset` as a float number of seconds, so pass 0.0
    # (start of file) rather than None.
    sig, _ = librosa.load(filepath, sr=SAMPLE_RATE, offset=0.0, duration=duration)

    # Split the signal into chunks; a shorter trailing remainder is dropped.
    sig_splits = []
    for start in range(0, len(sig) - chunk_len + 1, chunk_len):
        split = sig[start:start + chunk_len]
        split = noise_filtering(split)
        split = audio_augment(split)
        sig_splits.append(split)

    if not sig_splits:
        return []

    # Everything below is identical for every chunk, so compute it once.
    save_dir = os.path.join(output_dir, primary_label)
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(save_dir, exist_ok=True)
    # basename/splitext handle both path separators, unlike rsplit(os.sep).
    stem = os.path.splitext(os.path.basename(filepath))[0]
    # Hop length is derived from the target spectrogram width SPEC_SHAPE[1].
    hop_length = int(SIGNAL_LENGTH * SAMPLE_RATE / (SPEC_SHAPE[1] - 1))

    # Extract and save a mel spectrogram for each audio chunk
    saved_samples = []
    for s_cnt, chunk in enumerate(sig_splits):
        mel_spec = librosa.feature.melspectrogram(y=chunk,
                                                  sr=SAMPLE_RATE,
                                                  n_fft=2048,
                                                  hop_length=hop_length,
                                                  n_mels=SPEC_SHAPE[0],
                                                  fmin=FMIN,
                                                  fmax=FMAX)
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        # Normalize to [0, 1]. A constant spectrogram (e.g. a silent chunk)
        # has a zero range; skip the division so it stays all zeros instead
        # of turning into NaNs.
        mel_spec -= mel_spec.min()
        peak = mel_spec.max()
        if peak > 0:
            mel_spec /= peak

        # Save as an 8-bit grayscale image file
        save_path = os.path.join(save_dir, "{}_{}.png".format(stem, s_cnt))
        im = Image.fromarray(mel_spec * 255.0).convert("L")
        im.save(save_path)
        saved_samples.append(save_path)

    # The collected paths are returned so callers can locate the images.
    return saved_samples