-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathllsm.h
202 lines (175 loc) · 7.28 KB
/
llsm.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
/*
libllsm - Low Level Speech Model
===
Copyright (c) 2015-2017 Kanru Hua.
libllsm is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
libllsm is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with libllsm. If not, see <http://www.gnu.org/licenses/>.
*/
/*
Naming Convention
x input time domain signal
y output time domain signal
h FIR filter coefficient
nx/ny/... length of x/y/...
f0 fundamental frequency
fs sampling frequency
*/
#ifndef LLSM
#define LLSM
/*
llsm_sinframe: model parameters for a single frame of sinusoidal signal
ampl and phse are per-harmonic arrays (indexed as ampl[k], phse[k]).
NOTE(review): FP_TYPE is not defined in the visible part of this header; it presumably has to be
defined (e.g. as float or double) before this header is included -- confirm against the build.
*/
typedef struct {
FP_TYPE* ampl; // amplitude of each harmonic, linear scale
FP_TYPE* phse; // phase of each harmonic, in radians
int nhar; // number of harmonics (presumably the length of ampl and phse -- TODO confirm)
} llsm_sinframe;
/*
llsm_nosframe: model parameters for a single frame of noise signal
eenv and emin hold one entry per noise channel; spec holds the noise spectral envelope.
*/
typedef struct {
llsm_sinframe** eenv; // noise energy envelope for each channel (array of llsm_sinframe pointers)
FP_TYPE* emin; // minimum noise energy contour for each channel
FP_TYPE* spec; // noise spectral envelope
int nchannel; // number of channels (presumably the length of eenv and emin -- TODO confirm)
int nnos; // size of noise spectrum (presumably the length of spec -- TODO confirm)
} llsm_nosframe;
/*
llsm_frame: model parameters for a single frame
Combines the harmonic (sinusoidal) part, the stochastic (noise) part and the fundamental
frequency of that frame. Both parts are held through pointers.
*/
typedef struct {
llsm_sinframe* sinu; // harmonic component
llsm_nosframe* noise; // stochastic component
FP_TYPE f0; // fundamental frequency of this frame, in Hz
} llsm_frame;
/*
llsm_conf: model configurations
Stored by value in llsm_layer0 (member conf).
NOTE(review): nhop and thop both describe the hop size (samples vs. seconds); presumably they
are kept consistent with the sampling frequency by whoever fills this struct -- confirm.
*/
typedef struct {
int nfrm; // number of frames
int nhop; // hop size in samples
int nhar; // number of harmonics
int nhare; // number of harmonics for noise energy
int nnos; // size of noise spectrum
FP_TYPE noswarp; // warping factor for noise spectrum
FP_TYPE mvf; // maximum voiced frequency
FP_TYPE nosf; // upper bound of noise frequency
FP_TYPE thop; // hop size in seconds
FP_TYPE* nosbandf; // upper frequency of each noise band, excluding nosf;
// example: [2000, 4000, 8000], size = nnosband - 1
int nnosband; // number of noise bands
} llsm_conf;
/*
llsm_layer0: signal-domain model parameters
llsm_layer1: pseudo-acoustic domain model parameters
Note: llsm_layer1 is not complete on its own as it does not include the noise parameters.
Only partial conversion from llsm_layer1 down to llsm_layer0 is possible unless noise parameters
are retained.
*/
typedef struct {
llsm_conf conf; // configuration
llsm_frame** frames; // array of frame pointers (presumably conf.nfrm entries -- TODO confirm)
// Layout, following the struct definitions in this header:
// frames[n] -> f0
// frames[n] -> sinu -> ampl[k] | phse[k] | nhar
// frames[n] -> noise -> eenv[c] -> ampl[k] | phse[k] | nhar
// frames[n] -> noise -> emin[c] | spec[k] | nchannel | nnos
} llsm_layer0;
/*
llsm_layer1: pseudo-acoustic domain model parameters (vocal tract, voice source and lip
radiation). It does not carry the noise parameters.
NOTE(review): the FP_TYPE** members are presumably indexed by frame first (nfrm rows) and the
response arrays presumably have nfft / 2 + 1 bins -- neither is stated here, confirm against
llsm_layer1_from_layer0.
*/
typedef struct {
FP_TYPE** vt_resp_magn; // vocal tract (log) magnitude response
FP_TYPE** vs_har_ampl; // harmonic amplitudes without vocal tract contribution
FP_TYPE** vs_har_phse; // harmonic phases without vocal tract contribution
FP_TYPE* vs_rd; // Rd parameterization of LF model
FP_TYPE* lip_resp_magn; // lip (linear) magnitude response
FP_TYPE* lip_resp_phse; // lip phase response
int nfrm; // number of frames
int nfft; // FFT size used for the responses (presumably the nfft given at conversion -- TODO confirm)
int fnyquist; // presumably the Nyquist frequency (fs / 2), stored as an integer -- TODO confirm
} llsm_layer1;
/*
llsm_parameters: configurations for analysis/synthesis processes
note: there might be a naming confusion between llsm and llsm_parameters. The former is the model
parameters (i.e. what we get after analyzing the speech); the latter is a set of configurations for
analysis/synthesis processes. We name it so to make it consistent with libpyin (see pyin_paramters).
Members prefixed with a_ configure analysis; members prefixed with s_ configure synthesis.
*/
typedef struct {
// params for analysis
int a_nhop; // hop size in samples
int a_nhar; // number of harmonics
int a_nhare; // number of harmonics for noise energy
int a_nnos; // size of noise spectrum
int a_f0refine; // turn on/off f0 refinement
FP_TYPE a_wsize; // window size relative to period length
FP_TYPE a_noswarp; // warping factor for noise spectrum
FP_TYPE a_tfft; // size of transformation in seconds
FP_TYPE a_mvf; // maximum voiced frequency
FP_TYPE a_nosf; // maximum noise frequency
FP_TYPE* a_nosbandf; // upper frequency of each noise band
int a_nnosband; // number of noise bands
// params for synthesis
int s_fs; // sampling frequency
int s_noiseonly; // synthesize noise component only
int s_n0; // range (starts from); NOTE(review): unit (frames or samples) is not stated here
int s_n1; // range (ends at); NOTE(review): unit and inclusiveness are not stated here
} llsm_parameters;
/*
llsm_output: synthesis results, as returned by llsm_layer0_synthesize and released with
llsm_delete_output.
Field names follow the naming convention of this header: y = output time domain signal,
ny = length of y, fs = sampling frequency.
*/
typedef struct {
// synthesis results
FP_TYPE* y; // output time domain signal
FP_TYPE* y_sin; // presumably the sinusoidal (harmonic) part of the output -- TODO confirm
FP_TYPE* y_nos; // presumably the noise part of the output -- TODO confirm
int ny; // length of y (presumably also of y_sin and y_nos -- TODO confirm)
FP_TYPE fs; // sampling frequency
} llsm_output;
// === LLSM Layer 0 functions ===
/*
llsm_create_frame: create a llsm parameter frame given number of harmonics,
number of noise energy harmonics, size of noise spectrum, number of noise energy channels
llsm_create_empty_layer0: create a layer 0 model from a configuration; NOTE(review): "empty"
presumably means the frames are not filled in yet -- confirm against the implementation
llsm_copy_frame / llsm_copy_sinframe / llsm_copy_nosframe: copy src into dst (destination is the
first argument); NOTE(review): whether dst must already be allocated with matching sizes is not
visible from this header -- confirm
llsm_delete_frame: free a llsm parameter frame
*/
llsm_frame* llsm_create_frame(int nhar, int nhare, int nnos, int nchannel);
llsm_layer0* llsm_create_empty_layer0(llsm_conf conf);
void llsm_copy_frame(llsm_frame* dst, llsm_frame* src);
void llsm_copy_sinframe(llsm_sinframe* dst, llsm_sinframe* src);
void llsm_copy_nosframe(llsm_nosframe* dst, llsm_nosframe* src);
void llsm_delete_frame(llsm_frame* dst);
/*
llsm_init: obtain a configuration set initialized with default values;
nnosband is the number of noise bands (see a_nnosband in llsm_parameters)
llsm_deinit: counterpart of llsm_init; NOTE(review): presumably releases what llsm_init allocated
inside the parameter set (e.g. a_nosbandf) -- confirm against the implementation. The struct is
passed by value, so the caller's copy is not modified.
*/
llsm_parameters llsm_init(int nnosband);
void llsm_deinit(llsm_parameters dst);
/*
llsm_layer0_analyze
llsm_layer0_synthesize their names faithfully reflect what they do
llsm_delete_layer0
llsm_delete_output
Arguments of llsm_layer0_analyze follow the naming convention of this header:
x input time domain signal, nx its length, fs sampling frequency,
f0 fundamental frequency values, nf0 their count.
NOTE(review): xap is not documented here; its FP_TYPE** type suggests an output pointer --
confirm its meaning and whether it may be NULL against the implementation.
llsm_layer0_synthesize returns a llsm_output that is released with llsm_delete_output;
a llsm_layer0 is released with llsm_delete_layer0.
*/
llsm_layer0* llsm_layer0_analyze(llsm_parameters param, FP_TYPE* x, int nx, int fs,
FP_TYPE* f0, int nf0, FP_TYPE** xap);
llsm_output* llsm_layer0_synthesize(llsm_parameters param, llsm_layer0* model);
void llsm_delete_layer0(llsm_layer0* dst);
void llsm_delete_output(llsm_output* dst);
// === LLSM Layer 1 functions ===
/*
llsm_uniform_faxis generate a (nfft / 2 + 1)-sized array of uniformly spaced frequency values
llsm_liprad generate the approximated frequency response of lip radiation with radius
in cm (the default radius is 1.5 cm); freq holds nf frequency values
llsm_harmonic_cheaptrick estimate spectral envelope from harmonics using modified Cheaptrick algorithm
llsm_harmonic_minphase compute the phases of harmonics from amplitudes, assuming minimum phase
NOTE(review): all four return FP_TYPE*; presumably the result is newly allocated and owned by the
caller -- confirm against the implementation. dst_phaseresp of llsm_liprad presumably receives
the phase response (the return value being the magnitude response) -- confirm.
*/
FP_TYPE* llsm_uniform_faxis(int nfft, int fs);
FP_TYPE* llsm_liprad(FP_TYPE* freq, int nf, FP_TYPE radius, FP_TYPE* dst_phaseresp);
FP_TYPE* llsm_harmonic_cheaptrick(FP_TYPE* ampl, int nhar, FP_TYPE f0, int nfft, FP_TYPE fs);
FP_TYPE* llsm_harmonic_minphase(FP_TYPE* ampl, int nhar);
/*
llsm_layer1_from_layer0 convert from layer 0 representation to layer 1 representation
llsm_delete_layer1 deallocate memory
llsm_layer0_phaseshift NOTE(review): not documented here; presumably applies the given phase
shift values to the model dst in place (dst is not const) -- confirm the
expected length and unit of phaseshift against the implementation
*/
llsm_layer1* llsm_layer1_from_layer0(llsm_parameters param, llsm_layer0* model, int nfft, int fs);
void llsm_delete_layer1(llsm_layer1* dst);
void llsm_layer0_phaseshift(llsm_layer0* dst, FP_TYPE* phaseshift);
#endif