import sys

import numpy
import numpy.fft
import scipy.fftpack
import scipy.io
import scipy.signal

import python_speech_features

import parameters


def getMFCC(audio_in):
    assert isinstance(audio_in, numpy.ndarray)
    assert (audio_in.size % parameters.WINDOW_STEP_SAMPLES) == 0
    # Keep a single channel if the input is dual channel
    if len(audio_in.shape) > 1:
        audio_in = audio_in[:, 0]
    mfcc_features = python_speech_features.mfcc(
        audio_in, samplerate=parameters.SAMPLE_RATE_HZ,
        winlen=parameters.WINDOW_LENGTH_MS * 1e-3, winstep=parameters.WINDOW_STEP_MS * 1e-3,
        numcep=parameters.MFCC_NUM_COEFF + 1, lowfreq=parameters.MFCC_MIN_FREQ,
        highfreq=parameters.MFCC_MAX_FREQ, nfilt=parameters.MFCC_NUM_COEFF * 2)
    assert isinstance(mfcc_features, numpy.ndarray)
    # Drop the zeroth cepstral coefficient; keep the remaining MFCC_NUM_COEFF
    return mfcc_features[:, 1:]


def divideSignalToWindows(signal):
    win_size = parameters.WINDOW_SIZE_SAMPLES
    win_step = parameters.WINDOW_STEP_SAMPLES
    num_of_windows = int((float(len(signal) - win_size) / float(win_step)) + 1)
    divided_signal = numpy.zeros([num_of_windows, win_size])
    for i in range(num_of_windows):
        start_ind = i * win_step
        # stop_ind is one past the last sample, because Python slicing excludes the end index
        stop_ind = start_ind + win_size
        divided_signal[i, :] = signal[start_ind:stop_ind]
    return divided_signal
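

# A minimal sketch (illustrative, not part of the original pipeline) of the
# framing arithmetic above: with window size W and step S, a signal of length
# W + 3*S divides into exactly (W + 3*S - W)/S + 1 = 4 frames.
def _windows_demo():
    W = parameters.WINDOW_SIZE_SAMPLES
    S = parameters.WINDOW_STEP_SAMPLES
    x = numpy.arange(W + 3 * S, dtype=float)
    frames = divideSignalToWindows(x)
    assert frames.shape == (4, W)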


def erb2hz(erb):
    # Inverse of hz2erb: map an ERB-rate scale value back to Hz
    hz = (10 ** (erb / 21.4) - 1) / 4.37e-3
    return hz


def hz2erb(hz):
    # ERB-rate scale (Glasberg & Moore): the number of ERBs below the given frequency
    erb = 21.4 * numpy.log10(4.37e-3 * hz + 1)
    return erb
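

# An illustrative check (a sketch, not part of the original module): the two
# conversions invert each other, e.g. hz2erb(1000.0) is roughly 15.6.
def _erb_roundtrip_demo():
    for hz in (100.0, 1000.0, 8000.0):
        assert abs(erb2hz(hz2erb(hz)) - hz) < 1e-6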


'''
Compute the loudness level in phons on the basis of equal-loudness functions.
It accounts for the middle-ear effect and is used for frequency-dependent gain
adjustments. This function uses linear interpolation of a lookup table to
compute the loudness level, in phons, of a pure tone of frequency freq using
the reference curve for sound pressure level dB. The equation is taken from
section 4 of BS3383.
'''
def loudness(freq):
    '''
    Stores parameters of equal-loudness functions from BS3383, "Normal equal-loudness level
    contours for pure tones under free-field listening conditions", table 1.
    f (or ff) is the tone frequency, af and bf are frequency-dependent coefficients, and
    cf is the threshold sound pressure level of the tone, in dB
    '''
    data_file = scipy.io.loadmat('f_af_bf_cf.mat')
    ff = data_file['ff']
    ff = numpy.reshape(ff, ff.size)
    af = data_file['af']
    af = numpy.reshape(af, af.size)
    bf = data_file['bf']
    bf = numpy.reshape(bf, bf.size)
    cf = data_file['cf']
    cf = numpy.reshape(cf, cf.size)
    dB = 60  # Reference sound pressure level
    if freq < 20 or freq > 12500:
        raise Exception('Cannot compute loudness for given frequency')
    # Find the first table frequency at or above freq, then interpolate linearly
    i = 0
    while ff[i] < freq:
        i = i + 1
    afy = af[i - 1] + (freq - ff[i - 1]) * (af[i] - af[i - 1]) / (ff[i] - ff[i - 1])
    bfy = bf[i - 1] + (freq - ff[i - 1]) * (bf[i] - bf[i - 1]) / (ff[i] - ff[i - 1])
    cfy = cf[i - 1] + (freq - ff[i - 1]) * (cf[i] - cf[i - 1]) / (ff[i] - ff[i - 1])
    loud = 4.2 + afy * (dB - cfy) / (1 + bfy * (dB - cfy))
    return loud


def getGammatoneCenterFreq(num_channels=parameters.CGRAM_NUM_CHANNELS):
    # Center frequencies are equally spaced on the ERB-rate scale
    min_erb = hz2erb(parameters.CGRAM_MIN_FREQ)
    max_erb = hz2erb(parameters.CGRAM_MAX_FREQ)
    center_freq_erb = numpy.linspace(min_erb, max_erb, num_channels)
    # Convert the center frequencies back to Hz; they are named fc in the gammatone definition
    fc = erb2hz(center_freq_erb)
    return fc


def getGammatoneFir(num_channels=parameters.CGRAM_NUM_CHANNELS):
    GT_FILTER_ORDER = 4
    GT_LENGTH_MS = 128  # Impulse response length; a commonly used value
    GT_LENGTH_SAMPLES = int(GT_LENGTH_MS * 1e-3 * parameters.SAMPLE_RATE_HZ)
    fc = getGammatoneCenterFreq(num_channels)
    # Each filter's bandwidth, named b in the gammatone definition
    b = 1.019 * 24.7 * (4.37 * fc / 1000 + 1)
    # Build the filters' impulse responses
    time = numpy.arange(GT_LENGTH_SAMPLES, dtype=float) / parameters.SAMPLE_RATE_HZ
    gt_fir = numpy.zeros((num_channels, GT_LENGTH_SAMPLES))
    for channel in range(num_channels):
        gain = 10 ** ((loudness(fc[channel]) - 60) / 20) / 3 * \
               (2 * numpy.pi * b[channel] / parameters.SAMPLE_RATE_HZ) ** 4
        # In the impulse response the time is multiplied by the sampling frequency to make it unit-less
        gt_fir[channel, :] = gain * ((time * parameters.SAMPLE_RATE_HZ) ** (GT_FILTER_ORDER - 1)) * \
                             numpy.exp(-2 * numpy.pi * b[channel] * time) * \
                             numpy.cos(2 * numpy.pi * fc[channel] * time)
    return gt_fir


def applyGammatoneFilterbank(signal_in, num_channels=parameters.CGRAM_NUM_CHANNELS):
    gt_fir = getGammatoneFir(num_channels)
    gt_filtered = numpy.zeros((num_channels, len(signal_in)))
    # Filter the signal with each channel's impulse response
    for channel in range(num_channels):
        gt_filtered[channel, :] = scipy.signal.convolve(signal_in, gt_fir[channel, :], 'same')
    return gt_filtered


def applyIbmToSignalStft(signal_in, ibm=None):
    assert isinstance(signal_in, numpy.ndarray)
    if ibm is None:
        # Default to an all-ones mask, which leaves the signal unchanged
        num_of_windows = int((float(len(signal_in) - parameters.WINDOW_SIZE_SAMPLES) /
                              float(parameters.WINDOW_STEP_SAMPLES)) + 1)
        ibm = numpy.ones((num_of_windows, parameters.SGRAM_NUM_CHANNELS))
    assert isinstance(ibm, numpy.ndarray)
    stft = getStft(signal_in)
    masked_stft = stft * ibm
    return getIstft(masked_stft)


def applyIbmToSignalCgram(signal_in, ibm=None):
    assert isinstance(signal_in, numpy.ndarray)
    if ibm is None:
        # Default to an all-ones mask, which leaves the signal unchanged
        num_of_windows = int((float(len(signal_in) - parameters.WINDOW_SIZE_SAMPLES) /
                              float(parameters.WINDOW_STEP_SAMPLES)) + 1)
        ibm = numpy.ones((num_of_windows, parameters.CGRAM_NUM_CHANNELS))
    assert isinstance(ibm, numpy.ndarray)
    gt_filtered = applyGammatoneFilterbank(signal_in)
    gt_fir = getGammatoneFir()
    fc = getGammatoneCenterFreq()
    # Build a raised-cosine window from a time vector in the range [0, 1)
    time = numpy.arange(parameters.WINDOW_SIZE_SAMPLES, dtype=float) / parameters.WINDOW_SIZE_SAMPLES
    cos_win = (1 + numpy.cos(2 * numpy.pi * time - numpy.pi)) / 2
    signal_out = numpy.zeros(signal_in.shape)
    total_gt = numpy.zeros(signal_in.shape)
    for channel in range(parameters.CGRAM_NUM_CHANNELS):
        # Convolve with the time-reversed filter to cancel the filter's phase response
        in_ear_coeffs = 10 ** ((loudness(fc[channel]) - 60) / 20)
        channel_signal = scipy.signal.convolve(gt_filtered[channel, :], gt_fir[channel, ::-1], 'same') / (in_ear_coeffs ** 2)
        # Accumulate the total gammatone filter response
        zero_pad_fir = numpy.zeros(gt_filtered.shape[1])
        zero_pad_fir[0:gt_fir.shape[1]] = gt_fir[channel, :]
        total_gt += scipy.signal.convolve(zero_pad_fir, gt_fir[channel, ::-1], 'same') / (in_ear_coeffs ** 2)
        # Compute this channel's per-sample weight from the mask
        win_len = parameters.WINDOW_SIZE_SAMPLES
        win_shift = parameters.WINDOW_STEP_SAMPLES
        weight = numpy.zeros(len(signal_in))
        for frame_ind in range(ibm.shape[0]):
            start_ind = frame_ind * win_shift
            weight[start_ind:start_ind + win_len] += ibm[frame_ind, channel] * cos_win
        # The first and last win_shift samples are covered by only one window, so they are
        # handled separately. There is no IBM value for them, so the nearest frame's mask is
        # reused under the assumption that it stays unchanged
        weight[-1 * win_shift:] += ibm[-1, channel] * cos_win[0:win_shift]
        weight[0:win_shift] += ibm[0, channel] * cos_win[-1 * win_shift:]
        signal_out = signal_out + weight * channel_signal
    # Apply inverse weighting of the total gammatone filter response in the frequency domain
    signal_out_fft = numpy.fft.rfft(signal_out)
    gt_resp_fft_abs = numpy.abs(numpy.fft.rfft(total_gt))
    signal_out_fft[gt_resp_fft_abs > 0.01] = signal_out_fft[gt_resp_fft_abs > 0.01] / gt_resp_fft_abs[gt_resp_fft_abs > 0.01]
    signal_out_fft[gt_resp_fft_abs <= 0.01] = 0
    recon = numpy.fft.irfft(signal_out_fft)
    # Shift recon left by one sample to fix the phase difference
    recon[0:-1] = recon[1:]
    recon[-1] = 0
    return recon


def applyIbmToSignal(signal_in, ibm=None):
    if parameters.SGRAM_TYPE == 'STFT':
        return applyIbmToSignalStft(signal_in, ibm)
    if parameters.SGRAM_TYPE == 'CGRAM':
        return applyIbmToSignalCgram(signal_in, ibm)
    raise Exception('Invalid parameters.SGRAM_TYPE')


def getCochleagram(audio_in):
    assert isinstance(audio_in, numpy.ndarray)
    assert (audio_in.size % parameters.WINDOW_STEP_SAMPLES) == 0
    # Keep a single channel if the input is dual channel
    if len(audio_in.shape) > 1:
        audio_in = audio_in[:, 0]
    gt_filtered = applyGammatoneFilterbank(audio_in)
    assert isinstance(gt_filtered, numpy.ndarray)
    # Build the cochleagram from the gammatone-filtered signals
    num_of_windows = int((float(len(audio_in) - parameters.WINDOW_SIZE_SAMPLES) /
                          float(parameters.WINDOW_STEP_SAMPLES)) + 1)
    cgram = numpy.zeros([num_of_windows, parameters.CGRAM_NUM_CHANNELS])
    for i in range(parameters.CGRAM_NUM_CHANNELS):
        divided_channel = divideSignalToWindows(gt_filtered[i, :])
        channel_energy = numpy.linalg.norm(divided_channel, axis=1)
        # The cochleagram needs the energy, which is the norm squared
        channel_energy = channel_energy * channel_energy
        cgram[:, i] = channel_energy
    return cgram


def getIPD(audio_in):
    assert isinstance(audio_in, numpy.ndarray)
    assert (audio_in.shape[1] == 2)
    channel1_fft = getStft(audio_in[:, 0])
    channel2_fft = getStft(audio_in[:, 1])
    # Avoid dividing by zero: change every element equal to zero to a very small value
    channel2_fft[channel2_fft == 0] = sys.float_info.min
    return numpy.angle(channel1_fft / channel2_fft)


def getILD(audio_in):
    assert isinstance(audio_in, numpy.ndarray)
    assert (audio_in.shape[1] == 2)
    channel1_fft = getStft(audio_in[:, 0])
    channel2_fft = getStft(audio_in[:, 1])
    # Avoid dividing by zero: change every element equal to zero to a very small value
    channel2_fft[channel2_fft == 0] = sys.float_info.min
    amplitude_ratio = numpy.abs(channel1_fft / channel2_fft)
    # Avoid taking the log of zero
    amplitude_ratio[amplitude_ratio == 0] = sys.float_info.min
    return 20 * numpy.log10(amplitude_ratio)
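

# An illustrative sketch (not part of the original pipeline): when the left
# channel is an exact scaled copy of the right one, the ILD is the constant
# 20*log10(scale) in every time-frequency bin, and the IPD is zero.
def _ild_ipd_demo():
    rng = numpy.random.RandomState(0)
    right = rng.randn(int(parameters.SAMPLE_RATE_HZ))  # one second of noise
    stereo = numpy.column_stack((2.0 * right, right))
    assert numpy.allclose(getILD(stereo), 20.0 * numpy.log10(2.0))
    assert numpy.allclose(getIPD(stereo), 0.0)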


def getMV(audio_in):
    MV_FEATURES_PER_BIN = 4
    assert isinstance(audio_in, numpy.ndarray)
    channel1_fft = getStft(audio_in[:, 0])
    channel2_fft = getStft(audio_in[:, 1])
    mv = numpy.zeros([channel1_fft.shape[0], MV_FEATURES_PER_BIN * channel1_fft.shape[1]])
    for channel_ind in range(channel1_fft.shape[1]):
        x_vec = numpy.column_stack((channel1_fft[:, channel_ind], channel2_fft[:, channel_ind]))
        x_norm = numpy.array(2 * [numpy.linalg.norm(x_vec, axis=1)]).T
        # Avoid dividing by zero; the normalization yields zero anyway wherever x_vec is zero
        x_norm[x_norm == 0] = sys.float_info.min
        x_gal = x_vec / x_norm
        # Get the W matrix as the time average of the outer products
        W_matrix_mean = numpy.zeros(2 * [x_gal.shape[1]], dtype=numpy.complex128)
        for time_ind in range(x_gal.shape[0]):
            W_matrix_mean += numpy.outer(x_gal[time_ind, :], x_gal[time_ind, :].conj().T)
        W_matrix_mean /= x_gal.shape[0]
        # Take only the eigenvectors and ignore the eigenvalues.
        # The transpose is needed because the eigenvectors should be the rows of W,
        # but the eig function returns them as columns
        W = numpy.linalg.eig(W_matrix_mean)[1].T
        # Get the MV features
        for time_ind in range(x_gal.shape[0]):
            cur_mv = W.dot(x_gal[time_ind, :])
            mv_norm = numpy.linalg.norm(cur_mv)
            # Avoid dividing by zero
            if mv_norm != 0:
                cur_mv /= mv_norm
            start_ind = MV_FEATURES_PER_BIN * channel_ind
            stop_ind = start_ind + MV_FEATURES_PER_BIN
            mv[time_ind, start_ind:stop_ind] = numpy.array(
                [cur_mv[0].real, cur_mv[0].imag, cur_mv[1].real, cur_mv[1].imag])
    return mv


# N is the number of frames used on each side, in the past and in the future.
# The implementation follows:
# http://practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/
# i.e. delta_t = sum_{n=1..N} n * (c_{t+n} - c_{t-n}) / (2 * sum_{n=1..N} n^2)
def getDeltas(feature, N=2):
    assert isinstance(feature, numpy.ndarray)
    # Pad with copies of the first and last N frames so the deltas are defined at the edges
    padded_feature = numpy.concatenate((feature[0:N, :], feature, feature[-N:, :]), axis=0)
    deltas = numpy.zeros(feature.shape)
    sum_N = sum([n ** 2 for n in range(1, N + 1)])
    for time_ind in range(deltas.shape[0]):
        feature_time_ind = time_ind + N
        for n in range(1, N + 1):
            deltas[time_ind, :] += n * (padded_feature[feature_time_ind + n, :] -
                                        padded_feature[feature_time_ind - n, :]) / 2
        deltas[time_ind, :] /= sum_N
    return deltas
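

# An illustrative sketch (not part of the original pipeline): for a feature
# that ramps linearly in time, every delta away from the padded edges equals
# the slope of the ramp.
def _deltas_demo():
    ramp = numpy.arange(10, dtype=float).reshape(-1, 1)  # one feature with slope 1
    d = getDeltas(ramp, N=2)
    assert numpy.allclose(d[2:-2, 0], 1.0)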


def getStft(in_signal):
    assert isinstance(in_signal, numpy.ndarray)
    assert len(in_signal.shape) == 1
    win_size = parameters.WINDOW_SIZE_SAMPLES
    win_shift = parameters.WINDOW_STEP_SAMPLES
    # Apply a Hann window to each frame and take the one-sided FFT
    window = numpy.hanning(win_size)
    stft = numpy.array([numpy.fft.rfft(window * in_signal[i:i + win_size])
                        for i in range(0, len(in_signal) - win_size + 1, win_shift)])
    return stft


def getSpectrogram(in_signal):
    if parameters.SGRAM_TYPE == 'STFT':
        return numpy.abs(getStft(in_signal))
    if parameters.SGRAM_TYPE == 'CGRAM':
        return getCochleagram(in_signal)
    raise Exception('Invalid parameters.SGRAM_TYPE')


def getIstft(in_stft):
    assert isinstance(in_stft, numpy.ndarray)
    win_size = parameters.WINDOW_SIZE_SAMPLES
    win_shift = parameters.WINDOW_STEP_SAMPLES
    # Overlap-add reconstruction: each frame is inverse-transformed and summed into place
    out = numpy.zeros(win_shift * in_stft.shape[0] + (win_size - win_shift))
    for n, i in enumerate(range(0, len(out) - win_size + 1, win_shift)):
        out[i:i + win_size] += numpy.fft.irfft(in_stft[n, :])
    return out
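

# A minimal round-trip sketch (illustrative, not part of the original
# pipeline): forward STFT followed by overlap-add resynthesis on noise. How
# closely the output matches the input depends on the analysis window and the
# window/step ratio satisfying the constant-overlap-add condition.
def _stft_roundtrip_demo():
    rng = numpy.random.RandomState(0)
    x = rng.randn(int(parameters.SAMPLE_RATE_HZ))  # one second of noise
    spec = getStft(x)
    y = getIstft(spec)
    print('STFT frames:', spec.shape, 'resynthesized samples:', y.shape)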


def getGF(signal_in):
    assert isinstance(signal_in, numpy.ndarray)
    if len(signal_in.shape) > 1:
        signal_in = signal_in[:, 0]
    gt_filtered = applyGammatoneFilterbank(signal_in, parameters.GFCC_NUM_CHANNELS)
    # Downsample the gammatone-filtered signal to 100 Hz
    num_sampled = int(gt_filtered.shape[1] * (100.0 / parameters.SAMPLE_RATE_HZ))
    gt_100Hz = scipy.signal.resample(gt_filtered, num_sampled, axis=1)
    # The sampled signal has to be aligned to the window structure. 100 Hz corresponds to a
    # sample every 10 ms (one win_shift), but this gives one extra window: the first sample
    # should come after 20 ms (one win_len) and then every 10 ms. Dropping the first sample
    # therefore aligns the signal.
    gt_100Hz_aligned = gt_100Hz[:, 1:]
    # Apply cubic root compression to the signal
    gf_feature = numpy.abs(gt_100Hz_aligned) ** (1.0 / 3.0)
    # Transpose the signal, because axis 1 is currently the time axis and the convention is axis 0
    gf_feature = gf_feature.T
    return gf_feature


def getGFCC(signal_in):
    gf_feature = getGF(signal_in)
    # A DCT over the channel axis turns the gammatone features into cepstral coefficients
    gf_dct = scipy.fftpack.dct(gf_feature, axis=1, type=2, norm='ortho')
    gfcc_feature = gf_dct[:, 0:parameters.GFCC_NUM_COEFF]
    return gfcc_feature
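

# A hedged end-to-end sketch (illustrative, not part of the original module).
# It assumes the parameters module defines the usual sample-rate and window
# constants, that one second of samples is a multiple of WINDOW_STEP_SAMPLES
# (true for a 10 ms step), and that the 'f_af_bf_cf.mat' lookup table required
# by loudness() is present in the working directory.
if __name__ == '__main__':
    _windows_demo()
    _erb_roundtrip_demo()
    _ild_ipd_demo()
    _deltas_demo()
    _stft_roundtrip_demo()
    rng = numpy.random.RandomState(0)
    noise = rng.randn(int(parameters.SAMPLE_RATE_HZ))  # one second of noise
    print('MFCC shape:', getMFCC(noise).shape)
    print('GFCC shape:', getGFCC(noise).shape)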