imansarraf committed · verified
Commit 66bfc57 · 1 Parent(s): e45ad60

Delete sad_tf
sad_tf/__init__.py DELETED
@@ -1,2 +0,0 @@
-from .segmenter import Segmenter, filter_output, filter_sig
-from .export_funcs import seg2aud, seg2json, seg2Gender_Info, seg2Info
 
 
 
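For orientation, the two imports above were the package's entire public surface. Below is a minimal, hypothetical sketch of how a caller would have driven it before this deletion; the file name and model path are placeholders, not taken from this repo (the constructor defaults pointed at c:\ paths):

# Hypothetical driver for the deleted sad_tf package; paths are assumptions.
from sad_tf import Segmenter, filter_output, seg2json

seg = Segmenter(model_path="keras_speech_music_noise_cnn.hdf5",
                ffmpeg_path="ffmpeg")        # defaults used c:\ paths
isig, wav = seg("example.wav")               # (segment list, decoded signal)
isig = filter_output(isig, max_silence=1)    # merge close chunks; -1 if nothing left
if isig != -1:
    print(seg2json([isig]))                  # one JSON entry per channel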
sad_tf/export_funcs.py DELETED
@@ -1,238 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-
-# The MIT License
-
-# Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-import pandas as pd
-from pytextgrid.PraatTextGrid import PraatTextGrid, Interval, Tier
-import os
-import json
-
-def seg2csv(lseg, fout=None):
-    df = pd.DataFrame.from_records(lseg, columns=['labels', 'start', 'stop'])
-    df.to_csv(fout, sep='\t', index=False)
-
-def seg2textgrid1(lseg, fout=None):
-    tier = Tier(name='inaSpeechSegmenter')
-    for label, start, stop, _ in lseg:
-        if (label == 'noEnergy'):
-            label = ''
-        tier.append(Interval(start, stop, label))
-    ptg = PraatTextGrid(xmin=lseg[0][1], xmax=lseg[-1][2])
-    ptg.append(tier)
-    ptg.save(fout)
-
-def seg2json(lseg):
-    try:
-        return(seg2json5(lseg))
-    except:
-        return(seg2json4(lseg))
-
-def seg2Info(lseg):
-    x = []
-    nch = 0
-    for segs in lseg:
-        f = 0
-        nch = nch + 1
-        data_list = []
-        if (segs != -1):
-            for y in segs:
-                if (y[0] != 'noEnergy'):
-                    f = f + y[2] - y[1]
-        data = {
-            'channel': nch,
-            'speech': f
-        }
-        x.append(data)
-    return(json.dumps(x))
-
-def seg2Gender_Info(lseg):
-    x = []
-    nch = 0
-    for segs in lseg:
-        f = 0
-        m = 0
-        nch = nch + 1
-        data_list = []
-        if (segs != -1):
-            for y in segs:
-                if (y[0] != 'noEnergy'):
-                    if (y[0] == "female"):
-                        f = f + y[2] - y[1]
-                    elif (y[0] == "male"):
-                        m = m + y[2] - y[1]
-        data = {
-            'channel': nch,
-            'male': m,
-            'female': f
-        }
-        x.append(data)
-    return(json.dumps(x))
-
-def seg2json5(lseg):
-    x = []
-    nch = 0
-    for segs in lseg:
-        nch = nch + 1
-        data_list = []
-        if (segs != -1):
-            for label, start, stop, _, _ in segs:
-                if (label != 'noEnergy'):
-                    data = {
-                        'startTime': start,
-                        'endTime': stop,
-                        'gender': label[0]
-                    }
-                    data_list.append(data)
-        data = {
-            'channel': nch,
-            'segments': data_list
-        }
-        x.append(data)
-    return(json.dumps(x))
-
-def seg2json4(lseg):
-    x = []
-    nch = 0
-    for segs in lseg:
-        nch = nch + 1
-        data_list = []
-        if (segs != -1):
-            for label, start, stop, _ in segs:
-                if (label != 'noEnergy'):
-                    data = {
-                        'startTime': start,
-                        'endTime': stop,
-                        'gender': label[0]
-                    }
-                    data_list.append(data)
-        data = {
-            'channel': nch,
-            'segments': data_list
-        }
-        x.append(data)
-    return(json.dumps(x))
-
-def seg2aud(lseg, fout=None):
-    try:
-        seg2aud5(lseg, fout)
-    except:
-        seg2aud4(lseg, fout)
-
-def seg2aud5(lseg, fout=None):
-    if (lseg == -1):
-        return
-    with open(fout, 'w') as fid:
-        for label, start, stop, _, _ in lseg:
-            if (label != 'noEnergy'):
-                fid.write('%s\t%s\t%s\n' % (start, stop, label))
-
-def seg2aud4(lseg, fout=None):
-    if (lseg == -1):
-        return
-    with open(fout, 'w') as fid:
-        for label, start, stop, _ in lseg:
-            if (label != 'noEnergy'):
-                fid.write('%s\t%s\t%s\n' % (start, stop, label))
-
-def seg2textgrid(data, fout=None):
-    ghabli = False
-    kh = []
-    if (True):
-        kh.append('File type = "ooTextFile"\n')
-        kh.append('Object class = "TextGrid"\n')
-        kh.append('\n')
-        kh.append('xmin = 0 \n')
-        kh.append('xmax = %s \n' % (data[-1][2]))
-        kh.append('tiers? <exists> \n')
-        kh.append('size = 1 \n')
-        kh.append('item []: \n')
-        kh.append(' item [1]:\n')
-        kh.append(' class = "IntervalTier" \n')
-        kh.append(' name = "sen" \n')
-        kh.append(' xmin = 0 \n')
-        kh.append(' xmax = %s \n' % (data[-1][2]))
-        kh.append(' intervals: size = %s \n' % (0))
-    x = 1
-
-    if (float(data[0][1]) > 0):
-        kh.append(' intervals [%s]:\n' % (x))
-        kh.append(' xmin = 0\n')
-        kh.append(' xmax = %s \n' % (data[0][1]))
-        kh.append(' text = "" \n')
-        x = x + 1
-
-    for i in range(len(data)):
-        kh.append(' intervals [%s]:\n' % (x))
-        if (ghabli):
-            kh.append(' xmin = %s \n' % (data[i-1][2]))
-        else:
-            kh.append(' xmin = %s \n' % (data[i][1]))
-        kh.append(' xmax = %s \n' % (data[i][2]))
-        kh.append(' text = "%s" \n' % (data[i][0].strip()))
-        x = x + 1
-
-        if (i+1 >= len(data)):
-            break
-
-        if (data[i][2] != data[i+1][1]):
-            if (float(data[i+1][1]) - float(data[i][2]) > 0.5):
-                kh.append(' intervals [%s]:\n' % (x))
-                kh.append(' xmin = %s \n' % (data[i][2]))
-                kh.append(' xmax = %s \n' % (data[i+1][1]))
-                kh.append(' text = "" \n')
-                x = x + 1
-                ghabli = False
-            else:
-                ghabli = True
-
-    kh[13] = (' intervals: size = %s \n' % (kh[-4].strip().split(' ')[1].replace('[', '').replace(']', '').replace(':', '')))
-
-    with open(fout, mode='w') as fid:
-        for line in kh:
-            fid.write(line)
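To make the export formats above concrete, here is a self-contained toy run of the same aggregation the deleted seg2Gender_Info performed; the segment data is invented for illustration:

import json

# Toy two-channel segmentation (numbers invented): each segment is
# (label, start, stop, duration, trailing_gap); -1 marks an empty channel.
lseg = [
    [("male", 0.0, 3.2, 3.2, 0.3), ("female", 3.5, 5.0, 1.5, -1)],
    -1,
]

# Same per-channel, per-gender duration sums as the deleted seg2Gender_Info:
out = []
for nch, segs in enumerate(lseg, start=1):
    f = m = 0
    if segs != -1:
        for label, start, stop, *_ in segs:
            if label == "female":
                f += stop - start
            elif label == "male":
                m += stop - start
    out.append({"channel": nch, "male": m, "female": f})

print(json.dumps(out))
# [{"channel": 1, "male": 3.2, "female": 1.5}, {"channel": 2, "male": 0, "female": 0}]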
sad_tf/features.py DELETED
@@ -1,62 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-
-# The MIT License
-
-# Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-import os
-import numpy as np
-from iman import Audio
-
-# os.environ['SIDEKIT'] = 'theano=false,libsvm=false,cuda=false'
-# from sidekit.frontend.io import read_wav
-# from sidekit.frontend.features import mfcc
-from .sidekit_mfcc import mfcc
-
-
-def _wav2feats(wavname, input_type='file', sr=16000, ffmpeg_path='c:\\ffmpeg.exe'):
-    """
-    Extract features for wav 16k mono
-    """
-    if (input_type == 'file'):
-        sig = Audio.Read(wavname, sr, mono=True, ffmpeg_path=ffmpeg_path)
-    else:
-        sig = wavname
-
-    read_framerate = sr
-
-    _, loge, _, mspec = mfcc(sig.astype(np.float32), get_mspec=True, fs=sr, maxfreq=int(sr/2))
-
-    # Management of short duration segments
-    difflen = 0
-    if len(loge) < 68:
-        difflen = 68 - len(loge)
-        mspec = np.concatenate((mspec, np.ones((difflen, 24)) * np.min(mspec)))
-
-    return mspec, loge, difflen, sig
-
-
-def media2feats(medianame, input_type='file', sr=16000, ffmpeg_path='c:\\ffmpeg.exe'):
-    return _wav2feats(medianame, input_type, sr, ffmpeg_path=ffmpeg_path)
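The 68-frame floor in _wav2feats above exists because the downstream CNN consumes 68-frame patches. A standalone sketch of that padding step, with invented input shapes:

import numpy as np

# If fewer than 68 feature frames are available, the 24-band mel spectrogram
# is padded with its minimum value so patch extraction still works.
mspec = np.random.rand(50, 24)   # 50 frames, 24 mel bands (invented input)
loge = np.random.rand(50)

difflen = 0
if len(loge) < 68:
    difflen = 68 - len(loge)
    mspec = np.concatenate((mspec, np.ones((difflen, 24)) * np.min(mspec)))

print(mspec.shape, difflen)      # (68, 24) 18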
sad_tf/segmenter.py DELETED
@@ -1,552 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-
-# The MIT License
-
-# Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-import warnings
-warnings.filterwarnings("ignore")
-import os
-# os.environ["CUDA_DEVICE_ORDER"]= '0'
-import sys
-import math
-from iman import Audio
-import numpy as np
-from tensorflow import keras
-
-from .thread_returning import ThreadReturning
-
-import shutil
-import time
-import random
-
-from skimage.util import view_as_windows as vaw
-
-from .viterbi import viterbi_decoding
-from .viterbi_utils import pred2logemission, diag_trans_exp, log_trans_exp
-
-from .features import media2feats
-from .export_funcs import seg2csv, seg2textgrid
-
-
-def _energy_activity(loge, ratio=0.4):  ########## 0.9
-    threshold = np.mean(loge[np.isfinite(loge)]) + np.log(ratio)
-    raw_activity = (loge > threshold)
-    return viterbi_decoding(pred2logemission(raw_activity),
-                            log_trans_exp(50, cost0=-5))
-
-# exp(150, cost0=-5)
-
-def filter_sig(isig, wav, sr=16000):
-    if (sr != 16000):
-        wav = Audio.Resample(wav, 16000, sr)
-
-    try:
-        w = []
-        wn = []
-        wn.append(wav[0 : int(isig[0][1]*sr)])
-        for i, xxx in enumerate(isig):
-            a = xxx[1]
-            b = xxx[2]
-            w.append(wav[int(a*sr) : int(b*sr)])
-            try:
-                wn.append(wav[int(isig[i][2]*sr) : int(isig[i+1][1]*sr)])
-            except:
-                wn.append(wav[int(isig[i][2]*sr) : len(wav)])
-
-        return (np.concatenate(w), np.concatenate(wn))
-    except:
-        w = []
-        wn = []
-        wn.append(wav[0 : int(isig[0][1]*sr)])
-        for i, [_, a, b, _, _] in enumerate(isig):
-            w.append(wav[int(a*sr) : int(b*sr)])
-            try:
-                wn.append(wav[int(isig[i][2]*sr) : int(isig[i+1][1]*sr)])
-            except:
-                wn.append(wav[int(isig[i][2]*sr) : len(wav)])
-
-        return (np.concatenate(w), np.concatenate(wn))
-
-def filter_output(isig, max_silence=1, ignore_small_speech_segments=0.5, max_speech_len=15, split_speech_bigger_than=20):
-    if (len(isig) == 0):
-        return -1
-
-    # _dels=[]
-    # for i , [_,_,_,_d] in enumerate(isig):
-    #     if (_d<=ignore_small_speech_segments) :
-    #         _dels.append(i)
-    # _dels.reverse()
-    # for i in _dels:
-    #     del isig[i]
-
-    # if (len(isig)==0):
-    #     return -1
-
-    isig = [list(x) for x in isig]
-
-    for i in range(len(isig)-1):
-        t = isig[i+1][1] - isig[i][2]  # silence between two chunks
-        isig[i].append(t)
-    isig[-1].append(-1)
-
-    if (len(isig) > 0):
-        rang = np.arange(0.01, max_silence+0.1, 0.1)
-        for di in rang:
-            for i, xxx in enumerate(isig):
-                _t = xxx[-1]
-                if (_t == -1):
-                    break
-                if (_t <= di):
-                    try:
-                        if (isig[i+1][2] - isig[i][1] <= max_speech_len):
-                            isig[i] = [isig[i][0], isig[i][1], isig[i+1][2], isig[i+1][2] - isig[i][1], isig[i+1][4]]
-                            del isig[i+1]
-                    except:
-                        pass
-        _dels = []
-        for i, xxxx in enumerate(isig):
-            _d = xxxx[3]
-            if (_d <= ignore_small_speech_segments):
-                _dels.append(i)
-        _dels.reverse()
-
-        for i in _dels:
-            del isig[i]
-
-        if (len(isig) == 0):
-            return -1
-
-    isign = []
-    for i, xxxxx in enumerate(isig):
-        _d = xxxxx[3]
-        if (_d > split_speech_bigger_than):
-            _gc = math.ceil(_d/split_speech_bigger_than)
-            m = _d/_gc
-            print('Bigger-->' + str(_d) + '-->' + str(m))
-            for jj in range(_gc):
-                fas = 0
-                if (jj == _gc-1):
-                    fas = isig[i][4]
-                isign.append([isig[i][0], isig[i][1] + m*jj, isig[i][1] + (m*(jj+1)), m, fas])
-        else:
-            isign.append(isig[i])
-    for i, (a, b, c, d, e) in enumerate(isign):
-        if (e == -1):
-            break
-        _addlen = min(e, 1) / 2  # at most half a second is added to the end of the segment
-        isign[i] = [a, b, c+_addlen, d+_addlen, e-_addlen]
-
-    return(isign)
-
-
-def filter_output_1(vad, max_silence=1, ignore_small_speech_segments=0.5, max_speech_len=15, split_speech_bigger_than=20):
-    isig = []
-    i = 0
-    while (i < len(vad)):
-        ml = 0
-        inn = i
-        st = (vad[i][1])
-
-        while ((i < len(vad)-1) and (((vad[i+1][1]) - (vad[i][2])) <= max_silence)):
-            ml = (vad[i][2]) - st
-            if (ml > max_speech_len):
-                if (i > inn and i > 0):
-                    i = i-1
-                break
-            i = i+1
-        en = (vad[i][2])
-        fa = en-st
-        if (fa > ignore_small_speech_segments):
-            if (fa > split_speech_bigger_than):
-                _gc = math.ceil(fa/split_speech_bigger_than)
-                m = fa/_gc
-                print('Bigger-->' + str(fa) + '-->' + str(m))
-                for jj in range(_gc):
-                    isig.append(('speech', st + (m*jj), st + (m*(jj+1)), m))
-            else:
-                isig.append(('speech', st, en, fa))
-        i = i+1
-    isign = []
-    for i, (a, b, c, d) in enumerate(isig):
-        if (i == len(isig)-1):
-            isign.append(isig[i])
-            break
-        _addlen = min(isig[i+1][1]-c, 1) / 2  # at most half a second is added to the end of the segment
-        isign.append([a, b, c+_addlen, d+_addlen])
-
-    return(isign)
-
-
-def _get_patches(mspec, w, step):
-    h = mspec.shape[1]
-    data = vaw(mspec, (w, h), step=step)
-    data.shape = (len(data), w*h)
-    data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
-    lfill = [data[0, :].reshape(1, h*w)] * (w // (2 * step))
-    rfill = [data[-1, :].reshape(1, h*w)] * (w // (2 * step) - 1 + len(mspec) % 2)
-    data = np.vstack(lfill + [data] + rfill)
-    finite = np.all(np.isfinite(data), axis=1)
-    data.shape = (len(data), w, h)
-    return data, finite
-
-
-def _binidx2seglist(binidx):
-    """
-    ss._binidx2seglist((['f'] * 5) + (['bbb'] * 10) + ['v'] * 5)
-    Out: [('f', 0, 5), ('bbb', 5, 15), ('v', 15, 20)]
-
-    #TODO: is there a pandas alternative??
-    """
-    curlabel = None
-    bseg = -1
-    ret = []
-    for i, e in enumerate(binidx):
-        if e != curlabel:
-            if curlabel is not None:
-                ret.append((curlabel, bseg, i))
-            curlabel = e
-            bseg = i
-    ret.append((curlabel, bseg, i + 1))
-    return ret
-
-
-class DnnSegmenter:
-    """
-    DnnSegmenter is an abstract class for performing DNN-based
-    segmentation with serialized Keras models, using 24 mel spectrogram
-    features obtained with the SIDEKIT framework.
-
-    Child classes MUST define the following class attributes:
-    * nmel: the number of mel bands to use (max: 24)
-    * viterbi_arg: the argument to be used with viterbi post-processing
-    * model_fname: the filename of the serialized keras model to be used
-        the model should be stored in the current directory
-    * inlabel: only segments with label name inlabel will be analyzed.
-        other labels will stay unchanged
-    * outlabels: the labels associated with the output of neural network models
-    """
-    def __init__(self, batch_size, vad_type, model_path):
-        # load the DNN model
-        if (vad_type != 'vad'):
-            self.nn = keras.models.load_model(model_path, compile=False)
-            print('model loaded from --> ' + model_path)
-            # self.nn.summary()
-        self.batch_size = batch_size
-
-    def __call__(self, mspec, lseg, difflen=0):
-        """
-        *** input
-        * mspec: mel spectrogram
-        * lseg: list of tuples (label, start, stop) corresponding to previous segmentations
-        * difflen: 0 if the original length of the mel spectrogram is >= 68
-            otherwise it is set to 68 - length(mspec)
-        *** output
-        a list of adjacent tuples (label, start, stop)
-        """
-        if self.nmel < 24:
-            mspec = mspec[:, :self.nmel].copy()
-
-        patches, finite = _get_patches(mspec, 68, 2)
-        if difflen > 0:
-            patches = patches[:-int(difflen / 2), :, :]
-            finite = finite[:-int(difflen / 2)]
-
-        assert len(finite) == len(patches), (len(patches), len(finite))
-
-        batch = []
-        for lab, start, stop in lseg:
-            if lab == self.inlabel:
-                batch.append(patches[start:stop, :])
-
-        if len(batch) > 0:
-            batch = np.concatenate(batch)
-            rawpred = self.nn.predict(batch, batch_size=self.batch_size, verbose=1)
-
-        ret = []
-        for lab, start, stop in lseg:
-            if lab != self.inlabel:
-                ret.append((lab, start, stop))
-                continue
-
-            l = stop - start
-            r = rawpred[:l]
-            rawpred = rawpred[l:]
-            r[finite[start:stop] == False, :] = 0.5
-            pred = viterbi_decoding(np.log(r), diag_trans_exp(self.viterbi_arg, len(self.outlabels)))
-            for lab2, start2, stop2 in _binidx2seglist(pred):
-                ret.append((self.outlabels[int(lab2)], start2+start, stop2+start))
-        return ret
-
-
-class SpeechMusic(DnnSegmenter):
-    # Voice activity detection: requires energetic activity detection
-    outlabels = ('speech', 'music')
-    inlabel = 'energy'
-    nmel = 21
-    viterbi_arg = 150
-
-
-class SpeechMusicNoise(DnnSegmenter):
-    # Voice activity detection: requires energetic activity detection
-    outlabels = ('speech', 'music', 'noise')
-    inlabel = 'energy'
-    nmel = 21
-    viterbi_arg = 80
-
-
-class Gender(DnnSegmenter):
-    # Gender segmentation, requires voice activity detection
-    outlabels = ('female', 'male')
-    inlabel = 'speech'
-    nmel = 24
-    viterbi_arg = 80
-
-
-class Segmenter:
-    def __init__(self, vad_type='sad', vad_engine='smn', detect_gender=False, sr=16000, batch_size=32, complete_output=False, model_path="c:\\keras_speech_music_noise_cnn.hdf5", gender_path="c:\\keras_male_female_cnn.hdf5", ffmpeg_path='c:\\ffmpeg.exe', device='cuda', input_type="file"):
-        """
-        Load neural network models
-
-        Input:
-
-        'vad_engine' can be 'sm' (speech/music) or 'smn' (speech/music/noise)
-            'sm' was used in the results presented in the ICASSP 2017 paper
-            and in the MIREX 2018 challenge submission
-            'smn' has been implemented more recently and has not been evaluated in papers
-
-        'detect_gender': if False, speech excerpts are returned labelled as 'speech'
-            if True, speech excerpts are split into 'male' and 'female' segments
-        """
-        if (device != 'cuda'):
-            os.environ["CUDA_DEVICE_ORDER"] = '-1'
-        else:
-            pass
-
-        import tensorflow as tf
-
-        config = tf.compat.v1.ConfigProto()
-        config.gpu_options.allow_growth = True
-        config.log_device_placement = True
-
-        self.complete_output = complete_output
-        self.sample_rate = sr
-        self.ffmpeg_path = ffmpeg_path
-        self.input_type = input_type
-        self.device = device
-
-        # self.graph = KB.get_session().graph  # To prevent the issue of keras with tensorflow backend for async tasks
-
-        # select speech/music or speech/music/noise voice activity detection engine
-        assert vad_engine in ['sm', 'smn']
-        if vad_engine == 'sm':
-            self.vad = SpeechMusic(batch_size)
-        elif vad_engine == 'smn':
-            self.vad = SpeechMusicNoise(batch_size, vad_type, model_path)
-
-        # load gender detection NN if required
-        assert detect_gender in [True, False]
-        self.detect_gender = detect_gender
-        if detect_gender:
-            self.gender = Gender(batch_size, vad_type, gender_path)
-        self.vad_type = vad_type
-        self.model_path = model_path
-        self.gender_path = gender_path
-
-    def segment_feats(self, mspec, loge, difflen, start_sec):
-        """
-        do segmentation
-        require input corresponding to wav file sampled at 16000Hz
-        with a single channel
-        """
-        # perform energy-based activity detection
-        lseg = []
-        vadseg = []
-        for lab, start, stop in _binidx2seglist(_energy_activity(loge)[::2]):
-            if lab == 0:
-                lab = 'noEnergy'
-            else:
-                lab = 'energy'
-                vadseg.append(('speech', start, stop))
-            lseg.append((lab, start, stop))
-        if (self.vad_type == 'vad'):
-            return [(lab, start_sec + start * .02, start_sec + stop * .02, (stop-start) * .02) for lab, start, stop in vadseg]
-        # perform voice activity detection
-        lseg = self.vad(mspec, lseg, difflen)
-
-        # perform gender segmentation on speech segments
-        if self.detect_gender:
-            lseg = self.gender(mspec, lseg, difflen)
-        if (self.complete_output):
-            return [(lab, start_sec + start * .02, start_sec + stop * .02, (stop-start) * .02) for lab, start, stop in lseg]
-        else:
-            return [[lab, start_sec + start * .02, start_sec + stop * .02, (stop-start) * .02] for lab, start, stop in lseg if (lab == 'male' or lab == "female" or lab == "speech")]
-
-    def __call__(self, medianame, start_sec=None, stop_sec=None):
-        """
-        Return segmentation of a given file
-        * convert file to wav 16k mono with ffmpeg
-        * call NN segmentation procedures
-        * medianame: path to the media to be processed (including remote url)
-            may include any format supported by ffmpeg
-        * tmpdir: allow to define a custom path for storing temporary files
-            fast read/write HD are a good choice
-        * start_sec (seconds): sound stream before start_sec won't be processed
-        * stop_sec (seconds): sound stream after stop_sec won't be processed
-        """
-        mspec, loge, difflen, me = media2feats(medianame, self.input_type, self.sample_rate, ffmpeg_path=self.ffmpeg_path)
-
-        if start_sec is None:
-            start_sec = 0
-        # do segmentation
-        return self.segment_feats(mspec, loge, difflen, start_sec), me
-
-    def batch_process(self, linput, loutput, verbose=False, skipifexist=False, nbtry=1, trydelay=2., output_format='csv'):
-        if verbose:
-            print('batch_processing %d files' % len(linput))
-
-        if output_format == 'csv':
-            fexport = seg2csv
-        elif output_format == 'textgrid':
-            fexport = seg2textgrid
-        else:
-            raise NotImplementedError()
-
-        t_batch_start = time.time()
-
-        lmsg = []
-        fg = featGenerator(linput.copy(), loutput.copy(), skipifexist, nbtry, trydelay)
-        i = 0
-        for feats, msg in fg:
-            lmsg += msg
-            i += len(msg)
-            if verbose:
-                print('%d/%d' % (i, len(linput)), msg)
-            if feats is None:
-                break
-            mspec, loge, difflen = feats
-            # if verbose == True:
-            #     print(i, linput[i], loutput[i])
-            b = time.time()
-            lseg = self.segment_feats(mspec, loge, difflen, 0)
-            fexport(lseg, loutput[len(lmsg) - 1])
-            lmsg[-1] = (lmsg[-1][0], lmsg[-1][1], 'ok ' + str(time.time() - b))
-
-        t_batch_dur = time.time() - t_batch_start
-        nb_processed = len([e for e in lmsg if e[1] == 0])
-        if nb_processed > 0:
-            avg = t_batch_dur / nb_processed
-        else:
-            avg = -1
-        return t_batch_dur, nb_processed, avg, lmsg
-
-
-def medialist2feats(lin, lout, skipifexist, nbtry, trydelay, sampling_rate=16000):
-    """
-    To be used when processing batches
-    if resulting file exists, it is skipped
-    in case of remote files, access is tried nbtry times
-    """
-    ret = None
-    msg = []
-    while ret is None and len(lin) > 0:
-        src = lin.pop(0)
-        dst = lout.pop(0)
-        # print('popping', src)
-
-        # if file exists: skip
-        if skipifexist and os.path.exists(dst):
-            msg.append((dst, 1, 'already exists'))
-            continue
-
-        # create storing directory if required
-        dname = os.path.dirname(dst)
-        if not os.path.isdir(dname):
-            os.makedirs(dname)
-
-        itry = 0
-        while ret is None and itry < nbtry:
-            try:
-                ret = media2feats(src, tmpdir, None, None, ffmpeg)
-            except:
-                itry += 1
-                errmsg = sys.exc_info()[0]
-                if itry != nbtry:
-                    time.sleep(random.random() * trydelay)
-        if ret is None:
-            msg.append((dst, 2, 'error: ' + str(errmsg)))
-        else:
-            msg.append((dst, 0, 'ok'))
-
-    return ret, msg
-
-
-def featGenerator(ilist, olist, skipifexist=False, nbtry=1, trydelay=2., sampling_rate=16000):
-    # print('init feat gen', len(ilist))
-    thread = ThreadReturning(target=medialist2feats, args=[ilist, olist, skipifexist, nbtry, trydelay, sampling_rate])
-    thread.start()
-    while True:
-        ret, msg = thread.join()
-        # print('join done', len(ilist))
-        # print('new list', ilist)
-        # ilist = ilist[len(msg):]
-        # olist = olist[len(msg):]
-        if len(ilist) == 0:
-            break
-        thread = ThreadReturning(target=medialist2feats, args=[ilist, olist, skipifexist, nbtry, trydelay, sampling_rate])
-        thread.start()
-        yield ret, msg
-    yield ret, msg
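The run-length helper _binidx2seglist above is the glue between frame-level predictions and (label, start, stop) segments, and its docstring carries the one worked example in this file. A standalone copy reproduces it:

def binidx2seglist(binidx):
    # Collapse a frame-level label sequence into (label, start, stop) runs,
    # exactly as the deleted _binidx2seglist did.
    curlabel, bseg, ret = None, -1, []
    for i, e in enumerate(binidx):
        if e != curlabel:
            if curlabel is not None:
                ret.append((curlabel, bseg, i))
            curlabel, bseg = e, i
    ret.append((curlabel, bseg, i + 1))
    return ret

print(binidx2seglist(['f'] * 5 + ['bbb'] * 10 + ['v'] * 5))
# [('f', 0, 5), ('bbb', 5, 15), ('v', 15, 20)]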
sad_tf/segmentero.py DELETED
@@ -1,570 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-
-# The MIT License
-
-# Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)
-
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-import onnxruntime
-import warnings
-warnings.filterwarnings("ignore")
-import os
-# os.environ["CUDA_DEVICE_ORDER"]= '0'
-import sys
-import math
-from iman import Audio
-import numpy as np
-from tensorflow import keras
-from tensorflow.compat.v1.keras.backend import set_session
-from tqdm import tqdm
-from .thread_returning import ThreadReturning
-
-import shutil
-import time
-import random
-
-from skimage.util import view_as_windows as vaw
-
-from .viterbi import viterbi_decoding
-from .viterbi_utils import pred2logemission, diag_trans_exp, log_trans_exp
-
-from .features import media2feats
-from .export_funcs import seg2csv, seg2textgrid
-
-
-def _energy_activity(loge, ratio=0.4):  ########## 0.9
-    threshold = np.mean(loge[np.isfinite(loge)]) + np.log(ratio)
-    raw_activity = (loge > threshold)
-    return viterbi_decoding(pred2logemission(raw_activity),
-                            log_trans_exp(50, cost0=-5))
-
-# exp(150, cost0=-5)
-
-def filter_sig(isig, wav, sr=16000):
-    if (sr != 16000):
-        wav = Audio.Resample(wav, 16000, sr)
-
-    try:
-        w = []
-        wn = []
-        wn.append(wav[0 : int(isig[0][1]*sr)])
-        for i, [_, a, b, _] in enumerate(isig):
-            w.append(wav[int(a*sr) : int(b*sr)])
-            try:
-                wn.append(wav[int(isig[i][2]*sr) : int(isig[i+1][1]*sr)])
-            except:
-                wn.append(wav[int(isig[i][2]*sr) : len(wav)])
-
-        return (np.concatenate(w), np.concatenate(wn))
-    except:
-        w = []
-        wn = []
-        wn.append(wav[0 : int(isig[0][1]*sr)])
-        for i, [_, a, b, _, _] in enumerate(isig):
-            w.append(wav[int(a*sr) : int(b*sr)])
-            try:
-                wn.append(wav[int(isig[i][2]*sr) : int(isig[i+1][1]*sr)])
-            except:
-                wn.append(wav[int(isig[i][2]*sr) : len(wav)])
-
-        return (np.concatenate(w), np.concatenate(wn))
-
-def filter_output(isig, max_silence=1, ignore_small_speech_segments=0.5, max_speech_len=15, split_speech_bigger_than=20):
-    if (len(isig) == 0):
-        return -1
-
-    # _dels=[]
-    # for i , [_,_,_,_d] in enumerate(isig):
-    #     if (_d<=ignore_small_speech_segments) :
-    #         _dels.append(i)
-    # _dels.reverse()
-    # for i in _dels:
-    #     del isig[i]
-
-    # if (len(isig)==0):
-    #     return -1
-
-    for i in range(len(isig)-1):
-        t = isig[i+1][1] - isig[i][2]  # silence between two chunks
-        isig[i].append(t)
-    isig[-1].append(-1)
-
-    if (len(isig) > 0):
-        rang = np.arange(0.01, max_silence+0.1, 0.1)
-        for di in rang:
-            for i, [_, _, _, _, _t] in enumerate(isig):
-                if (_t == -1):
-                    break
-                if (_t <= di):
-                    try:
-                        if (isig[i+1][2] - isig[i][1] <= max_speech_len):
-                            isig[i] = [isig[i][0], isig[i][1], isig[i+1][2], isig[i+1][2] - isig[i][1], isig[i+1][4]]
-                            del isig[i+1]
-                    except:
-                        pass
-        _dels = []
-        for i, [_, _, _, _d, _] in enumerate(isig):
-            if (_d <= ignore_small_speech_segments):
-                _dels.append(i)
-        _dels.reverse()
-
-        for i in _dels:
-            del isig[i]
-
-        if (len(isig) == 0):
-            return -1
-
-    isign = []
-    for i, [_, _, _, _d, _] in enumerate(isig):
-        if (_d > split_speech_bigger_than):
-            _gc = math.ceil(_d/split_speech_bigger_than)
-            m = _d/_gc
-            print('Bigger-->' + str(_d) + '-->' + str(m))
-            for jj in range(_gc):
-                fas = 0
-                if (jj == _gc-1):
-                    fas = isig[i][4]
-                isign.append([isig[i][0], isig[i][1] + m*jj, isig[i][1] + (m*(jj+1)), m, fas])
-        else:
-            isign.append(isig[i])
-    for i, (a, b, c, d, e) in enumerate(isign):
-        if (e == -1):
-            break
-        _addlen = min(e, 1) / 2  # at most half a second is added to the end of the segment
-        isign[i] = [a, b, c+_addlen, d+_addlen, e-_addlen]
-
-    return(isign)
-
-
-def filter_output_1(vad, max_silence=1, ignore_small_speech_segments=0.5, max_speech_len=15, split_speech_bigger_than=20):
-    isig = []
-    i = 0
-    while (i < len(vad)):
-        ml = 0
-        inn = i
-        st = (vad[i][1])
-
-        while ((i < len(vad)-1) and (((vad[i+1][1]) - (vad[i][2])) <= max_silence)):
-            ml = (vad[i][2]) - st
-            if (ml > max_speech_len):
-                if (i > inn and i > 0):
-                    i = i-1
-                break
-            i = i+1
-        en = (vad[i][2])
-        fa = en-st
-        if (fa > ignore_small_speech_segments):
-            if (fa > split_speech_bigger_than):
-                _gc = math.ceil(fa/split_speech_bigger_than)
-                m = fa/_gc
-                print('Bigger-->' + str(fa) + '-->' + str(m))
-                for jj in range(_gc):
-                    isig.append(('speech', st + (m*jj), st + (m*(jj+1)), m))
-            else:
-                isig.append(('speech', st, en, fa))
-        i = i+1
-    isign = []
-    for i, (a, b, c, d) in enumerate(isig):
-        if (i == len(isig)-1):
-            isign.append(isig[i])
-            break
-        _addlen = min(isig[i+1][1]-c, 1) / 2  # at most half a second is added to the end of the segment
-        isign.append([a, b, c+_addlen, d+_addlen])
-
-    return(isign)
-
-
-def get_path_3d(data, batch_size):
-    total_batches = data.shape[0] // batch_size
-    last_batch_size = data.shape[0] % batch_size
-    if last_batch_size != 0:
-        batches = np.split(data[:total_batches * batch_size], total_batches)
-        last_batch = np.expand_dims(data[total_batches * batch_size:], axis=0).squeeze()
-        batches.append(last_batch)
-    else:
-        batches = np.split(data, total_batches)
-    return batches
-
-
-def _get_patches(mspec, w, step):
-    h = mspec.shape[1]
-    data = vaw(mspec, (w, h), step=step)
-    data.shape = (len(data), w*h)
-    data = (data - np.mean(data, axis=1).reshape((len(data), 1))) / np.std(data, axis=1).reshape((len(data), 1))
-    lfill = [data[0, :].reshape(1, h*w)] * (w // (2 * step))
-    rfill = [data[-1, :].reshape(1, h*w)] * (w // (2 * step) - 1 + len(mspec) % 2)
-    data = np.vstack(lfill + [data] + rfill)
-    finite = np.all(np.isfinite(data), axis=1)
-    data.shape = (len(data), w, h)
-    return data, finite
-
-
-def _binidx2seglist(binidx):
-    """
-    ss._binidx2seglist((['f'] * 5) + (['bbb'] * 10) + ['v'] * 5)
-    Out: [('f', 0, 5), ('bbb', 5, 15), ('v', 15, 20)]
-
-    #TODO: is there a pandas alternative??
-    """
-    curlabel = None
-    bseg = -1
-    ret = []
-    for i, e in enumerate(binidx):
-        if e != curlabel:
-            if curlabel is not None:
-                ret.append((curlabel, bseg, i))
-            curlabel = e
-            bseg = i
-    ret.append((curlabel, bseg, i + 1))
-    return ret
-
-
-class DnnSegmenter:
-    """
-    DnnSegmenter is an abstract class for performing DNN-based
-    segmentation with serialized models, using 24 mel spectrogram
-    features obtained with the SIDEKIT framework.
-
-    Child classes MUST define the following class attributes:
-    * nmel: the number of mel bands to use (max: 24)
-    * viterbi_arg: the argument to be used with viterbi post-processing
-    * model_fname: the filename of the serialized keras model to be used
-        the model should be stored in the current directory
-    * inlabel: only segments with label name inlabel will be analyzed.
-        other labels will stay unchanged
-    * outlabels: the labels associated with the output of neural network models
-    """
-    def __init__(self, batch_size, vad_type, model_path, EP_list):
-        # load the DNN model
-        if (vad_type != 'vad'):
-            self.session = onnxruntime.InferenceSession(model_path, providers=EP_list)
-            # self.nn = keras.models.load_model(model_path, compile=False)
-            print('model loaded from --> ' + model_path)
-            # self.nn.summary()
-        self.batch_size = batch_size
-
-    def __call__(self, mspec, lseg, difflen=0):
-        """
-        *** input
-        * mspec: mel spectrogram
-        * lseg: list of tuples (label, start, stop) corresponding to previous segmentations
-        * difflen: 0 if the original length of the mel spectrogram is >= 68
-            otherwise it is set to 68 - length(mspec)
-        *** output
-        a list of adjacent tuples (label, start, stop)
-        """
-        if self.nmel < 24:
-            mspec = mspec[:, :self.nmel].copy()
-
-        patches, finite = _get_patches(mspec, 68, 2)
-        if difflen > 0:
-            patches = patches[:-int(difflen / 2), :, :]
-            finite = finite[:-int(difflen / 2)]
-
-        assert len(finite) == len(patches), (len(patches), len(finite))
-
-        batch = []
-        for lab, start, stop in lseg:
-            if lab == self.inlabel:
-                batch.append(patches[start:stop, :])
-
-        if len(batch) > 0:
-            batch = np.concatenate(batch)
-            batches = get_path_3d(batch, self.batch_size)
-
-            # rawpred = self.nn.predict(batch, batch_size=self.batch_size, verbose=1)
-            input_name = self.session.get_inputs()[0].name
-            rawpred = []
-            for batch in tqdm(batches):
-                rawpred.append(self.session.run(None, {input_name: batch})[0])
-
-            rawpred = np.concatenate(rawpred)
-
-        ret = []
-        for lab, start, stop in lseg:
-            if lab != self.inlabel:
-                ret.append((lab, start, stop))
-                continue
-
-            l = stop - start
-            r = rawpred[:l]
-            rawpred = rawpred[l:]
-            r[finite[start:stop] == False, :] = 0.5
-            pred = viterbi_decoding(np.log(r), diag_trans_exp(self.viterbi_arg, len(self.outlabels)))
-            for lab2, start2, stop2 in _binidx2seglist(pred):
-                ret.append((self.outlabels[int(lab2)], start2+start, stop2+start))
-        return ret
-
-
-class SpeechMusic(DnnSegmenter):
-    # Voice activity detection: requires energetic activity detection
-    outlabels = ('speech', 'music')
-    inlabel = 'energy'
-    nmel = 21
-    viterbi_arg = 150
-
-
-class SpeechMusicNoise(DnnSegmenter):
-    # Voice activity detection: requires energetic activity detection
-    outlabels = ('speech', 'music', 'noise')
-    inlabel = 'energy'
-    nmel = 21
-    viterbi_arg = 80
-
-
-class Gender(DnnSegmenter):
-    # Gender segmentation, requires voice activity detection
-    outlabels = ('female', 'male')
-    inlabel = 'speech'
-    nmel = 24
-    viterbi_arg = 80
-
-
-class Segmenter:
-    def __init__(self, vad_type='sad', vad_engine='smn', detect_gender=False, sr=16000, batch_size=32, complete_output=False, model_path="c:\\keras_speech_music_noise_cnn.onnx", gender_path="c:\\keras_male_female_cnn.onnx", ffmpeg_path='c:\\ffmpeg.exe', device='cuda'):
-        """
-        Load neural network models
-
-        Input:
-
-        'vad_engine' can be 'sm' (speech/music) or 'smn' (speech/music/noise)
-            'sm' was used in the results presented in the ICASSP 2017 paper
-            and in the MIREX 2018 challenge submission
-            'smn' has been implemented more recently and has not been evaluated in papers
-
-        'detect_gender': if False, speech excerpts are returned labelled as 'speech'
-            if True, speech excerpts are split into 'male' and 'female' segments
-        """
-        self.complete_output = complete_output
-        self.sample_rate = sr
-        self.ffmpeg_path = ffmpeg_path
-
-        if (device != 'cuda'):
-            os.environ["CUDA_DEVICE_ORDER"] = '-1'
-            EP_list = ['CPUExecutionProvider']
-        else:
-            EP_list = ['CUDAExecutionProvider']
-
-        import tensorflow as tf
-
-        config = tf.compat.v1.ConfigProto()
-        config.gpu_options.allow_growth = True
-        config.log_device_placement = True
-        sess = tf.compat.v1.Session(config=config)
-        set_session(sess)
-
-        # self.graph = KB.get_session().graph  # To prevent the issue of keras with tensorflow backend for async tasks
-
-        # select speech/music or speech/music/noise voice activity detection engine
-        assert vad_engine in ['sm', 'smn']
-        if vad_engine == 'sm':
-            self.vad = SpeechMusic(batch_size)
-        elif vad_engine == 'smn':
-            self.vad = SpeechMusicNoise(batch_size, vad_type, model_path, EP_list)
-
-        # load gender detection NN if required
-        assert detect_gender in [True, False]
-        self.detect_gender = detect_gender
-        if detect_gender:
-            self.gender = Gender(batch_size, vad_type, gender_path, EP_list)
-        self.vad_type = vad_type
-        self.model_path = model_path
-        self.gender_path = gender_path
-
-    def segment_feats(self, mspec, loge, difflen, start_sec):
-        """
-        do segmentation
-        require input corresponding to wav file sampled at 16000Hz
-        with a single channel
-        """
-        # perform energy-based activity detection
-        lseg = []
-        vadseg = []
-        for lab, start, stop in _binidx2seglist(_energy_activity(loge)[::2]):
-            if lab == 0:
-                lab = 'noEnergy'
-            else:
-                lab = 'energy'
-                vadseg.append(('speech', start, stop))
-            lseg.append((lab, start, stop))
-        if (self.vad_type == 'vad'):
-            return [(lab, start_sec + start * .02, start_sec + stop * .02, stop-start) for lab, start, stop in vadseg]
-        # perform voice activity detection
-        lseg = self.vad(mspec, lseg, difflen)
-
-        # perform gender segmentation on speech segments
-        if self.detect_gender:
-            lseg = self.gender(mspec, lseg, difflen)
-        if (self.complete_output):
-            return [(lab, start_sec + start * .02, start_sec + stop * .02, (stop-start) * .02) for lab, start, stop in lseg]
-        else:
-            return [[lab, start_sec + start * .02, start_sec + stop * .02, (stop-start) * .02] for lab, start, stop in lseg if (lab == 'male' or lab == "female" or lab == "speech")]
-
-    def __call__(self, medianame, input_type='file', start_sec=None, stop_sec=None):
-        """
-        Return segmentation of a given file
-        * convert file to wav 16k mono with ffmpeg
-        * call NN segmentation procedures
-        * medianame: path to the media to be processed (including remote url)
-            may include any format supported by ffmpeg
-        * tmpdir: allow to define a custom path for storing temporary files
-            fast read/write HD are a good choice
-        * start_sec (seconds): sound stream before start_sec won't be processed
-        * stop_sec (seconds): sound stream after stop_sec won't be processed
-        """
-        mspec, loge, difflen, me = media2feats(medianame, input_type, self.sample_rate, ffmpeg_path=self.ffmpeg_path)
-
-        if start_sec is None:
-            start_sec = 0
-        # do segmentation
-        return self.segment_feats(mspec, loge, difflen, start_sec), me
-
-    def batch_process(self, linput, loutput, verbose=False, skipifexist=False, nbtry=1, trydelay=2., output_format='csv'):
-        if verbose:
-            print('batch_processing %d files' % len(linput))
-
-        if output_format == 'csv':
-            fexport = seg2csv
-        elif output_format == 'textgrid':
-            fexport = seg2textgrid
-        else:
-            raise NotImplementedError()
-
-        t_batch_start = time.time()
-
-        lmsg = []
-        fg = featGenerator(linput.copy(), loutput.copy(), skipifexist, nbtry, trydelay)
-        i = 0
-        for feats, msg in fg:
-            lmsg += msg
-            i += len(msg)
-            if verbose:
-                print('%d/%d' % (i, len(linput)), msg)
-            if feats is None:
-                break
-            mspec, loge, difflen = feats
-            # if verbose == True:
-            #     print(i, linput[i], loutput[i])
-            b = time.time()
-            lseg = self.segment_feats(mspec, loge, difflen, 0)
-            fexport(lseg, loutput[len(lmsg) - 1])
-            lmsg[-1] = (lmsg[-1][0], lmsg[-1][1], 'ok ' + str(time.time() - b))
-
-        t_batch_dur = time.time() - t_batch_start
-        nb_processed = len([e for e in lmsg if e[1] == 0])
-        if nb_processed > 0:
-            avg = t_batch_dur / nb_processed
-        else:
-            avg = -1
-        return t_batch_dur, nb_processed, avg, lmsg
-
-
-def medialist2feats(lin, lout, skipifexist, nbtry, trydelay, sampling_rate=16000):
-    """
-    To be used when processing batches
-    if resulting file exists, it is skipped
-    in case of remote files, access is tried nbtry times
-    """
-    ret = None
-    msg = []
-    while ret is None and len(lin) > 0:
-        src = lin.pop(0)
-        dst = lout.pop(0)
-        # print('popping', src)
-
-        # if file exists: skip
-        if skipifexist and os.path.exists(dst):
-            msg.append((dst, 1, 'already exists'))
-            continue
-
-        # create storing directory if required
-        dname = os.path.dirname(dst)
-        if not os.path.isdir(dname):
-            os.makedirs(dname)
-
-        itry = 0
-        while ret is None and itry < nbtry:
-            try:
-                ret = media2feats(src, tmpdir, None, None, ffmpeg)
-            except:
-                itry += 1
-                errmsg = sys.exc_info()[0]
-                if itry != nbtry:
-                    time.sleep(random.random() * trydelay)
-        if ret is None:
-            msg.append((dst, 2, 'error: ' + str(errmsg)))
-        else:
-            msg.append((dst, 0, 'ok'))
-
-    return ret, msg
-
-
-def featGenerator(ilist, olist, skipifexist=False, nbtry=1, trydelay=2., sampling_rate=16000):
-    # print('init feat gen', len(ilist))
-    thread = ThreadReturning(target=medialist2feats, args=[ilist, olist, skipifexist, nbtry, trydelay, sampling_rate])
-    thread.start()
-    while True:
-        ret, msg = thread.join()
-        # print('join done', len(ilist))
-        # print('new list', ilist)
-        # ilist = ilist[len(msg):]
-        # olist = olist[len(msg):]
-        if len(ilist) == 0:
-            break
-        thread = ThreadReturning(target=medialist2feats, args=[ilist, olist, skipifexist, nbtry, trydelay, sampling_rate])
-        thread.start()
-        yield ret, msg
-    yield ret, msg
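The substantive change in this segmentero.py variant is inference through onnxruntime with manual batching instead of keras.Model.predict. A self-contained check of the get_path_3d splitting above (shapes invented; the original's expand_dims/squeeze on the remainder is a no-op for remainders longer than one patch and is omitted here):

import numpy as np

def get_path_3d(data, batch_size):
    # Split along axis 0 into equal batches plus one remainder batch.
    total_batches = data.shape[0] // batch_size
    if data.shape[0] % batch_size != 0:
        batches = np.split(data[:total_batches * batch_size], total_batches)
        batches.append(data[total_batches * batch_size:])
    else:
        batches = np.split(data, total_batches)
    return batches

patches = np.zeros((70, 68, 21))             # 70 patches of 68 frames x 21 mel bands
batches = get_path_3d(patches, batch_size=32)
print([b.shape[0] for b in batches])         # [32, 32, 6]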
sad_tf/sidekit_mfcc.py DELETED
@@ -1,379 +0,0 @@
1
-
2
- # -*- coding: utf-8 -*-
3
- #
4
- # This file is part of SIDEKIT.
5
- #
6
- # The following code has been copy-pasted from SIDEKIT source files:
7
- # frontend/features.py frontend/io.py frontend/vad.py
8
- #
9
- # SIDEKIT is a python package for speaker verification.
10
- # Home page: http://www-lium.univ-lemans.fr/sidekit/
11
- #
12
- # SIDEKIT is a python package for speaker verification.
13
- # Home page: http://www-lium.univ-lemans.fr/sidekit/
14
- #
15
- # SIDEKIT is free software: you can redistribute it and/or modify
16
- # it under the terms of the GNU LLesser General Public License as
17
- # published by the Free Software Foundation, either version 3 of the License,
18
- # or (at your option) any later version.
19
- #
20
- # SIDEKIT is distributed in the hope that it will be useful,
21
- # but WITHOUT ANY WARRANTY; without even the implied warranty of
22
- # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23
- # GNU Lesser General Public License for more details.
24
- #
25
- # You should have received a copy of the GNU Lesser General Public License
26
- # along with SIDEKIT. If not, see <http://www.gnu.org/licenses/>.
27
-
28
- """
29
- Copyright 2014-2021 Anthony Larcher and Sylvain Meignier
30
-
31
- :mod:`frontend` provides methods to process an audio signal in order to extract
32
- useful parameters for speaker verification.
33
- """
34
-
35
-
36
- import numpy
37
- import soundfile
38
- import scipy
39
- from scipy.fftpack.realtransforms import dct
40
-
41
-
42
- __author__ = "Anthony Larcher and Sylvain Meignier"
43
- __copyright__ = "Copyright 2014-2021 Anthony Larcher and Sylvain Meignier"
44
- __license__ = "LGPL"
45
- __maintainer__ = "Anthony Larcher"
46
- __email__ = "[email protected]"
47
- __status__ = "Production"
48
- __docformat__ = 'reStructuredText'
49
-
50
-
51
-
52
-
53
- wav_flag = "float32" # Could be "int16"
54
- PARAM_TYPE = numpy.float32
55
-
56
-
57
- def read_wav(input_file_name):
58
- """
59
- :param input_file_name:
60
- :return:
61
- """
62
- #with wave.open(input_file_name, "r") as wfh:
63
- # (nchannels, sampwidth, framerate, nframes, comptype, compname) = wfh.getparams()
64
- # raw = wfh.readframes(nframes * nchannels)
65
- # out = struct.unpack_from("%dh" % nframes * nchannels, raw)
66
- # sig = numpy.reshape(numpy.array(out), (-1, nchannels)).squeeze()
67
- # return sig.astype(numpy.float32), framerate, sampwidth
68
- nfo = soundfile.info(input_file_name)
69
- sig, sample_rate = soundfile.read(input_file_name, dtype=wav_flag)
70
- sig = numpy.reshape(numpy.array(sig), (-1, nfo.channels)).squeeze()
71
- sig = sig.astype(numpy.float32)
72
- return sig, sample_rate, 4
73
-
74
-
75
-
76
-
77
- def hz2mel(f, htk=True):
78
- """Convert an array of frequency in Hz into mel.
79
-
80
- :param f: frequency to convert
81
-
82
- :return: the equivalence on the mel scale.
83
- """
84
- if htk:
85
- return 2595 * numpy.log10(1 + f / 700.)
86
- else:
87
- f = numpy.array(f)
88
-
89
- # Mel fn to match Slaney's Auditory Toolbox mfcc.m
90
- # Mel fn to match Slaney's Auditory Toolbox mfcc.m
91
- f_0 = 0.
92
- f_sp = 200. / 3.
93
- brkfrq = 1000.
94
- brkpt = (brkfrq - f_0) / f_sp
95
- logstep = numpy.exp(numpy.log(6.4) / 27)
96
-
97
- linpts = f < brkfrq
98
-
99
- z = numpy.zeros_like(f)
100
- # fill in parts separately
101
- z[linpts] = (f[linpts] - f_0) / f_sp
102
- z[~linpts] = brkpt + (numpy.log(f[~linpts] / brkfrq)) / numpy.log(logstep)
103
-
104
- if z.shape == (1,):
105
- return z[0]
106
- else:
107
- return z
108
-
109
- def mel2hz(z, htk=True):
110
- """Convert an array of mel values in Hz.
111
-
112
- :param m: ndarray of frequencies to convert in Hz.
113
-
114
- :return: the equivalent values in Hertz.
115
- """
116
- if htk:
117
- return 700. * (10**(z / 2595.) - 1)
118
- else:
119
- z = numpy.array(z, dtype=float)
120
- f_0 = 0
121
- f_sp = 200. / 3.
122
- brkfrq = 1000.
123
- brkpt = (brkfrq - f_0) / f_sp
124
- logstep = numpy.exp(numpy.log(6.4) / 27)
125
-
126
- linpts = (z < brkpt)
127
-
128
- f = numpy.zeros_like(z)
129
-
130
- # fill in parts separately
131
- f[linpts] = f_0 + f_sp * z[linpts]
132
- f[~linpts] = brkfrq * numpy.exp(numpy.log(logstep) * (z[~linpts] - brkpt))
133
-
134
- if f.shape == (1,):
135
- return f[0]
136
- else:
137
- return f
138
-
139
-
140
-
141
- def trfbank(fs, nfft, lowfreq, maxfreq, nlinfilt, nlogfilt, midfreq=1000):
142
- """Compute triangular filterbank for cepstral coefficient computation.
143
-
144
- :param fs: sampling frequency of the original signal.
145
- :param nfft: number of points for the Fourier Transform
146
- :param lowfreq: lower limit of the frequency band filtered
147
- :param maxfreq: higher limit of the frequency band filtered
148
- :param nlinfilt: number of linear filters to use in low frequencies
149
- :param nlogfilt: number of log-linear filters to use in high frequencies
150
- :param midfreq: frequency boundary between linear and log-linear filters
151
-
152
- :return: the filter bank and the central frequencies of each filter
153
- """
154
- # Total number of filters
155
- nfilt = nlinfilt + nlogfilt
156
-
157
- # ------------------------
158
- # Compute the filter bank
159
- # ------------------------
160
- # Compute start/middle/end points of the triangular filters in spectral
161
- # domain
162
- frequences = numpy.zeros(nfilt + 2, dtype=PARAM_TYPE)
163
- if nlogfilt == 0:
164
- linsc = (maxfreq - lowfreq) / (nlinfilt + 1)
165
- frequences[:nlinfilt + 2] = lowfreq + numpy.arange(nlinfilt + 2) * linsc
166
- elif nlinfilt == 0:
167
- low_mel = hz2mel(lowfreq)
168
- max_mel = hz2mel(maxfreq)
169
- mels = numpy.zeros(nlogfilt + 2)
170
- # mels[nlinfilt:]
171
- melsc = (max_mel - low_mel) / (nfilt + 1)
172
- mels[:nlogfilt + 2] = low_mel + numpy.arange(nlogfilt + 2) * melsc
173
- # Back to the frequency domain
174
- frequences = mel2hz(mels)
175
- else:
176
- # Compute linear filters on [0;1000Hz]
177
- linsc = (min([midfreq, maxfreq]) - lowfreq) / (nlinfilt + 1)
178
- frequences[:nlinfilt] = lowfreq + numpy.arange(nlinfilt) * linsc
179
- # Compute log-linear filters on [1000;maxfreq]
180
- low_mel = hz2mel(min([1000, maxfreq]))
181
- max_mel = hz2mel(maxfreq)
182
- mels = numpy.zeros(nlogfilt + 2, dtype=PARAM_TYPE)
183
- melsc = (max_mel - low_mel) / (nlogfilt + 1)
184
-
185
- # Verify that mel2hz(melsc)>linsc
186
- while mel2hz(melsc) < linsc:
187
- # in this case, we add a linear filter
188
- nlinfilt += 1
189
- nlogfilt -= 1
190
- frequences[:nlinfilt] = lowfreq + numpy.arange(nlinfilt) * linsc
191
- low_mel = hz2mel(frequences[nlinfilt - 1] + 2 * linsc)
192
- max_mel = hz2mel(maxfreq)
193
- mels = numpy.zeros(nlogfilt + 2, dtype=PARAM_TYPE)
194
- melsc = (max_mel - low_mel) / (nlogfilt + 1)
195
-
196
- mels[:nlogfilt + 2] = low_mel + numpy.arange(nlogfilt + 2) * melsc
197
- # Back to the frequency domain
198
- frequences[nlinfilt:] = mel2hz(mels)
199
-
200
- heights = 2. / (frequences[2:] - frequences[0:-2])
201
-
202
- # Compute filterbank coeff (in fft domain, in bins)
203
- fbank = numpy.zeros((nfilt, int(numpy.floor(nfft / 2)) + 1), dtype=PARAM_TYPE)
204
- # FFT bins (in Hz)
205
- n_frequences = numpy.arange(nfft) / (1. * nfft) * fs
206
-
207
- for i in range(nfilt):
208
- low = frequences[i]
209
- cen = frequences[i + 1]
210
- hi = frequences[i + 2]
211
- try:
212
- lid = numpy.arange(numpy.floor(low * nfft / fs) + 1, numpy.floor(cen * nfft / fs) + 1, dtype=numpy.int)
213
- except:
214
- lid = numpy.arange(numpy.floor(low * nfft / fs) + 1, numpy.floor(cen * nfft / fs) + 1, dtype=numpy.int32)
215
- left_slope = heights[i] / (cen - low)
216
- try:
217
- rid = numpy.arange(numpy.floor(cen * nfft / fs) + 1,min(numpy.floor(hi * nfft / fs) + 1, nfft), dtype=numpy.int)
218
- except:
219
- rid = numpy.arange(numpy.floor(cen * nfft / fs) + 1,min(numpy.floor(hi * nfft / fs) + 1, nfft), dtype=numpy.int32)
220
- right_slope = heights[i] / (hi - cen)
221
- fbank[i][lid] = left_slope * (n_frequences[lid] - low)
222
- fbank[i][rid[:-1]] = right_slope * (hi - n_frequences[rid[:-1]])
223
-
224
- return fbank, frequences
225
-
226
-
227
- def power_spectrum(input_sig,
228
- fs=8000,
229
- win_time=0.025,
230
- shift=0.01,
231
- prefac=0.97):
232
- """
233
- Compute the power spectrum of the signal.
234
- :param input_sig: input signal as a 1-D numpy array
235
- :param fs: sampling frequency of the signal, in Hz
236
- :param win_time: length of the analysis window, in seconds
237
- :param shift: shift between two successive analyses, in seconds
238
- :param prefac: pre-emphasis coefficient
239
- :return: the power spectrum and the log-energy of each frame
240
- """
241
- window_length = int(round(win_time * fs))
242
- overlap = window_length - int(shift * fs)
243
- framed = framing(input_sig, window_length, win_shift=window_length-overlap).copy()
244
- # Pre-emphasis filtering is applied after framing to be consistent with stream processing
245
- framed = pre_emphasis(framed, prefac)
246
- l = framed.shape[0]
247
- n_fft = 2 ** int(numpy.ceil(numpy.log2(window_length)))
248
- # A Hann window is used instead of Hamming; its sidelobes fall off faster
249
- # ham = numpy.hamming(window_length)
250
- window = numpy.hanning(window_length)
251
-
252
- spec = numpy.ones((l, int(n_fft / 2) + 1), dtype=PARAM_TYPE)
253
- log_energy = numpy.log((framed**2).sum(axis=1))
254
- dec = 500000
255
- start = 0
256
- stop = min(dec, l)
257
- while start < l:
258
- ahan = framed[start:stop, :] * window
259
- mag = numpy.fft.rfft(ahan, n_fft, axis=-1)
260
- spec[start:stop, :] = mag.real**2 + mag.imag**2
261
- start = stop
262
- stop = min(stop + dec, l)
263
-
264
- return spec, log_energy
265
-
266
-
267
- def framing(sig, win_size, win_shift=1, context=(0, 0), pad='zeros'):
268
- """
269
- :param sig: input signal, can be mono or multi-dimensional
270
- :param win_size: size of the window, in samples
271
- :param win_shift: shift of the sliding window, in samples
272
- :param context: tuple of left and right context sizes, in samples
273
- :param pad: padding mode, either 'zeros' or 'edge'
274
- """
275
- dsize = sig.dtype.itemsize
276
- if sig.ndim == 1:
277
- sig = sig[:, numpy.newaxis]
278
- # Manage padding
279
- c = (context, ) + (sig.ndim - 1) * ((0, 0), )
280
- _win_size = win_size + sum(context)
281
- shape = (int((sig.shape[0] - win_size) / win_shift) + 1, 1, _win_size, sig.shape[1])
282
- strides = tuple(map(lambda x: x * dsize, [win_shift * sig.shape[1], 1, sig.shape[1], 1]))
283
- if pad == 'zeros':
284
- return numpy.lib.stride_tricks.as_strided(numpy.lib.pad(sig, c, 'constant', constant_values=(0,)),
285
- shape=shape,
286
- strides=strides).squeeze()
287
- elif pad == 'edge':
288
- return numpy.lib.stride_tricks.as_strided(numpy.lib.pad(sig, c, 'edge'),
289
- shape=shape,
290
- strides=strides).squeeze()
291
-
292
-
293
- def pre_emphasis(input_sig, pre):
294
- """Pre-emphasis of an audio signal.
295
- :param input_sig: the input vector of signal to pre emphasize
296
- :param pre: value that defines the pre-emphasis filter.
297
- """
298
- if input_sig.ndim == 1:
299
- return (input_sig - numpy.c_[input_sig[numpy.newaxis, :][..., :1],
300
- input_sig[numpy.newaxis, :][..., :-1]].squeeze() * pre)
301
- else:
302
- return input_sig - numpy.c_[input_sig[..., :1], input_sig[..., :-1]] * pre
303
-
304
-
305
- def mfcc(input_sig,
306
- lowfreq=100, maxfreq=8000,
307
- nlinfilt=0, nlogfilt=24,
308
- nwin=0.025,
309
- fs=16000,
310
- nceps=13,
311
- shift=0.01,
312
- get_spec=False,
313
- get_mspec=False,
314
- prefac=0.97):
315
- """Compute Mel Frequency Cepstral Coefficients.
316
-
317
- :param input_sig: input signal from which the coefficients are computed.
318
- Input audio is expected to be raw 16-bit PCM
319
- :param lowfreq: lower limit of the frequency band filtered.
320
- Default is 100Hz.
321
- :param maxfreq: higher limit of the frequency band filtered.
322
- Default is 8000Hz.
323
- :param nlinfilt: number of linear filters to use in low frequencies.
324
- Default is 0.
325
- :param nlogfilt: number of log-linear filters to use in high frequencies.
326
- Default is 24.
327
- :param nwin: length of the sliding window in seconds
328
- Default is 0.025.
329
- :param fs: sampling frequency of the original signal. Default is 16000Hz.
330
- :param nceps: number of cepstral coefficients to extract.
331
- Default is 13.
332
- :param shift: shift between two analyses. Default is 0.01 (10ms).
333
- :param get_spec: boolean, if true returns the spectrogram
334
- :param get_mspec: boolean, if true returns the output of the filter banks
335
- :param prefac: pre-emphasis filter value
336
-
337
- :return: the cepstral coefficients in an ndarray, as well as
338
- the log-spectrum in the mel domain in an ndarray.
339
-
340
- .. note:: MFCC are computed as follows:
341
-
342
- - Pre-processing in time-domain (pre-emphasizing)
343
- Compute the spectrum amplitude by windowing (a Hann window in this implementation)
345
- Filter the signal in the spectral domain with a triangular filter-bank, whose filters are approximately
346
- linearly spaced on the mel scale and have equal bandwidth in the mel scale
347
- Compute the DCT of the log-spectrum
348
- Log-energy is returned as the first coefficient of the feature vector.
348
-
349
- For more details, refer to [Davis80]_.
350
- """
351
- # Compute power spectrum
352
- spec, log_energy = power_spectrum(input_sig,
353
- fs,
354
- win_time=nwin,
355
- shift=shift,
356
- prefac=prefac)
357
- # Filter the spectrum through the triangle filter-bank
358
- n_fft = 2 ** int(numpy.ceil(numpy.log2(int(round(nwin * fs)))))
359
- fbank = trfbank(fs, n_fft, lowfreq, maxfreq, nlinfilt, nlogfilt)[0]
360
-
361
- mspec = numpy.log(numpy.dot(spec, fbank.T))  # TODO: compare natural log with log10
362
- # The DCT step (spectrum -> cepstrum domain) is disabled in this variant;
363
- # it would drop the constant C0 term and keep nceps coefficients
364
- # ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:, 1:nceps + 1]
365
- lst = list()
366
- lst.append(None)  # placeholder for the (disabled) cepstral coefficients
367
- lst.append(log_energy)
368
- if get_spec:
369
- lst.append(spec)
370
- else:
371
- lst.append(None)
372
- del spec
373
- if get_mspec:
374
- lst.append(mspec)
375
- else:
376
- lst.append(None)
377
- del mspec
378
-
379
- return lst
 
 
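For reference, a minimal sketch of how the deleted trfbank helper can be exercised. The module name `features` is an assumption (the original file path is not shown here); the call order follows the positional call made inside mfcc above.

# Sketch: build a 24-filter mel bank for 16 kHz audio and inspect its shape.
import numpy
from features import trfbank  # hypothetical module name

fs = 16000   # sampling frequency in Hz
nfft = 512   # FFT size
fbank, freqs = trfbank(fs, nfft, lowfreq=100, maxfreq=8000,
                       nlinfilt=0, nlogfilt=24)
print(fbank.shape)   # expected: (24, nfft // 2 + 1) == (24, 257)
print(freqs[:5])     # boundary frequencies of the triangular filters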
 
 
 
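The stride-trick windowing in framing can be sanity-checked on a toy signal; again a sketch under the same importability assumption:

# Sketch: frame a 1-D signal into overlapping windows without copying data.
import numpy
from features import framing  # hypothetical module name

sig = numpy.arange(10, dtype=numpy.float32)
frames = framing(sig, win_size=4, win_shift=2)
print(frames.shape)   # (4, 4): windows start at samples 0, 2, 4, 6
print(frames[1])      # [2. 3. 4. 5.]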
 
 
 
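Putting it together, an end-to-end sketch of the mfcc entry point. Note that this variant returns [ceps, log_energy, spec, mspec] with ceps always None, since the DCT step is disabled above, so only the log mel-spectrum is usable downstream; the module name is again assumed.

# Sketch: extract log mel-spectrum features from a 16 kHz signal.
import numpy
from features import mfcc  # hypothetical module name

sig = 0.001 * numpy.random.randn(16000).astype(numpy.float32)  # 1 s of noise
ceps, log_energy, spec, mspec = mfcc(sig, fs=16000, get_mspec=True)
assert ceps is None and spec is None  # DCT disabled, spectrogram not requested
print(log_energy.shape)   # one value per 25 ms frame at a 10 ms shift
print(mspec.shape)        # (n_frames, 24) log mel energies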
sad_tf/thread_returning.py DELETED
@@ -1,27 +0,0 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
- """
4
- Created on Tue Mar 27 15:18:49 2018
5
-
6
- @author: elechapt
7
- """
8
-
9
- from threading import Thread
10
-
11
- class ThreadReturning(Thread):
12
- """
13
- Allow us to get the results from a thread
14
- """
15
- def __init__(self, *args, **kwargs):
16
- Thread.__init__(self, *args, **kwargs)
17
- self._return = None
18
-
19
- def run(self):
20
- if self._target is not None:
21
- self._return = self._target(*self._args, **self._kwargs)
22
-
23
- def join(self, timeout=None):
24
- Thread.join(self, timeout)
25
- return self._return
26
-
27
-
 
 
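A small usage sketch for ThreadReturning, which mirrors the standard Thread API but hands the target's return value back from join (module name assumed):

# Sketch: run a function in a background thread and collect its result.
from thread_returning import ThreadReturning  # hypothetical module name

def slow_add(a, b):
    return a + b

t = ThreadReturning(target=slow_add, args=(2, 3))
t.start()
print(t.join())   # 5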
sad_tf/viterbi.py DELETED
@@ -1,222 +0,0 @@
1
- #!/usr/bin/env python
2
- # encoding: utf-8
3
-
4
- # The MIT License (MIT)
5
-
6
- # Copyright (c) 2014-2016 CNRS
7
-
8
- # Permission is hereby granted, free of charge, to any person obtaining a copy
9
- # of this software and associated documentation files (the "Software"), to deal
10
- # in the Software without restriction, including without limitation the rights
11
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
- # copies of the Software, and to permit persons to whom the Software is
13
- # furnished to do so, subject to the following conditions:
14
-
15
- # The above copyright notice and this permission notice shall be included in
16
- # all copies or substantial portions of the Software.
17
-
18
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24
- # SOFTWARE.
25
-
26
- # AUTHORS
27
- # Hervé BREDIN - http://herve.niderb.fr
28
-
29
- from __future__ import unicode_literals
30
-
31
- import six.moves
32
- import numpy as np
33
- import itertools
34
-
35
- VITERBI_CONSTRAINT_NONE = 0
36
- VITERBI_CONSTRAINT_FORBIDDEN = 1
37
- VITERBI_CONSTRAINT_MANDATORY = 2
38
-
39
-
40
- LOG_ZERO = np.log(1e-200)
41
-
42
- # handling 'consecutive' constraints is achieved by duplicating states
43
- # the following functions are here to help in this process
44
-
45
-
46
- # create new transition prob. matrix accounting for duplicated states.
47
- def _update_transition(transition, consecutive):
48
-
49
- # initialize with LOG_ZERO everywhere
50
- # except on the +1 diagonal np.log(1)
51
- new_n_states = np.sum(consecutive)
52
- new_transition = LOG_ZERO * np.ones((new_n_states, new_n_states))
53
- for i in range(1, new_n_states):
54
- new_transition[i - 1, i] = np.log(1)
55
-
56
- n_states = len(consecutive)
57
- boundary = np.hstack(([0], np.cumsum(consecutive)))
58
- start = boundary[:-1]
59
- end = boundary[1:] - 1
60
-
61
- for i, j in itertools.product(six.moves.range(n_states), repeat=2):
62
- new_transition[end[i], start[j]] = transition[i, j]
63
-
64
- return new_transition
65
-
66
-
67
- # create new initial prob. matrix accounting for duplicated states.
68
- def _update_initial(initial, consecutive):
69
-
70
- new_n_states = np.sum(consecutive)
71
- new_initial = LOG_ZERO * np.ones((new_n_states, ))
72
-
73
- n_states = len(consecutive)
74
- boundary = np.hstack(([0], np.cumsum(consecutive)))
75
- start = boundary[:-1]
76
-
77
- for i in range(n_states):
78
- new_initial[start[i]] = initial[i]
79
-
80
- return new_initial
81
-
82
-
83
- # create new emission prob. matrix accounting for duplicated states.
84
- def _update_emission(emission, consecutive):
85
-
86
- return np.vstack(
87
- np.tile(e, (c, 1)) # duplicate emission probabilities c times
88
- for e, c in six.moves.zip(emission.T, consecutive)
89
- ).T
90
-
91
-
92
- # create new constraint matrix accounting for duplicated states
93
- def _update_constraint(constraint, consecutive):
94
-
95
- return np.vstack(
96
- np.tile(e, (c, 1)) # duplicate constraint probabilities c times
97
- for e, c in six.moves.zip(constraint.T, consecutive)
98
- ).T
99
-
100
-
101
- # convert sequence of duplicated states back to sequence of original states.
102
- def _update_states(states, consecutive):
103
-
104
- boundary = np.hstack(([0], np.cumsum(consecutive)))
105
- start = boundary[:-1]
106
- end = boundary[1:]
107
-
108
- new_states = np.empty(states.shape, dtype=int)  # integer state labels
109
-
110
- for i, (s, e) in enumerate(six.moves.zip(start, end)):
111
- new_states[np.where((s <= states) & (states < e))] = i
112
-
113
- return new_states
114
-
115
-
116
- def viterbi_decoding(emission, transition,
117
- initial=None, consecutive=None, constraint=None):
118
- """(Constrained) Viterbi decoding
119
-
120
- Parameters
121
- ----------
122
- emission : array of shape (n_samples, n_states)
123
- E[t, i] is the emission log-probabilities of sample t at state i.
124
- transition : array of shape (n_states, n_states)
125
- T[i, j] is the transition log-probability from state i to state j.
126
- initial : optional, array of shape (n_states, )
127
- I[i] is the initial log-probability of state i.
128
- Defaults to equal log-probabilities.
129
- consecutive : optional, int or int array of shape (n_states, )
130
- C[i] is the minimum-consecutive-states constraint for state i.
131
- C[i] = 1 is equivalent to no constraint (default).
132
- constraint : optional, array of shape (n_samples, n_states)
133
- K[t, i] = 1 forbids state i at time t.
134
- K[t, i] = 2 forces state i at time t.
135
- Use K[t, i] = 0 for no constraint (default).
136
-
137
- Returns
138
- -------
139
- states : array of shape (n_samples, )
140
- Most probable state sequence
141
-
142
- """
143
-
144
- # ~~ INITIALIZATION ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
145
-
146
- T, k = emission.shape # number of observations x number of states
147
-
148
- # no minimum-consecutive-states constraints
149
- if consecutive is None:
150
- consecutive = np.ones((k, ), dtype=int)
151
-
152
- # same value for all states
153
- elif isinstance(consecutive, int):
154
- consecutive = consecutive * np.ones((k, ), dtype=int)
155
-
156
- # (potentially) different values per state
157
- else:
158
- consecutive = np.array(consecutive, dtype=int).reshape((k, ))
159
-
160
- # at least one sample
161
- consecutive = np.maximum(1, consecutive)
162
-
163
- # balance initial probabilities when they are not provided
164
- if initial is None:
165
- initial = np.log(np.ones((k, )) / k)
166
-
167
- # no constraint?
168
- if constraint is None:
169
- constraint = VITERBI_CONSTRAINT_NONE * np.ones((T, k))
170
-
171
- # artificially create new states to account for 'consecutive' constraints
172
- emission = _update_emission(emission, consecutive)
173
- transition = _update_transition(transition, consecutive)
174
- initial = _update_initial(initial, consecutive)
175
- constraint = _update_constraint(constraint, consecutive)
176
- T, K = emission.shape # number of observations x number of new states
177
- states = np.arange(K) # states 0 to K-1
178
-
179
- # set emission probability to zero for forbidden states
180
- emission[
181
- np.where(constraint == VITERBI_CONSTRAINT_FORBIDDEN)] = LOG_ZERO
182
-
183
- # set emission probability to zero for all states but the mandatory one
184
- for t, k in six.moves.zip(
185
- *np.where(constraint == VITERBI_CONSTRAINT_MANDATORY)
186
- ):
187
- emission[t, states != k] = LOG_ZERO
188
-
189
- # ~~ FORWARD PASS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
190
-
191
- V = np.empty((T, K)) # V[t, k] is the probability of the
192
- V[0, :] = emission[0, :] + initial # most probable state sequence for the
193
- # first t observations that has k as
194
- # its final state.
195
-
196
- P = np.empty((T, K), dtype=int) # P[t, k] remembers which state was used
197
- P[0, :] = states # to get from time t-1 to time t at
198
- # state k
199
-
200
- for t in range(1, T):
201
-
202
- # tmp[k, k'] is the probability of the most probable path
203
- # leading to state k at time t - 1, plus the probability of
204
- # transitioning from state k to state k' (at time t)
205
- tmp = (V[t - 1, :] + transition.T).T
206
-
207
- # optimal path to state k at t comes from state P[t, k] at t - 1
208
- # (find among all possible states at this time t)
209
- P[t, :] = np.argmax(tmp, axis=0)
210
-
211
- # update V for time t
212
- V[t, :] = emission[t, :] + tmp[P[t, :], states]
213
-
214
- # ~~ BACK-TRACKING ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
215
- X = np.empty((T,), dtype=int)
216
- X[-1] = np.argmax(V[-1, :])
217
- for t in range(1, T):
218
- X[-(t + 1)] = P[-t, X[-t]]
219
-
220
- # ~~ CONVERT BACK TO ORIGINAL STATES
221
-
222
- return _update_states(X, consecutive)
 
 
 
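A sketch of viterbi_decoding on a toy two-state problem, using the consecutive argument to impose a minimum state duration; the probabilities below are illustrative, and the module name is assumed:

# Sketch: smooth a noisy binary state sequence with a duration constraint.
import numpy as np
from viterbi import viterbi_decoding  # hypothetical module name

eps = 1e-10
pred = np.array([0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1])
# near-one-hot log-emissions built from the raw per-frame decisions
emission = np.log(np.where(pred[:, None] == np.arange(2), 1 - eps, eps))
transition = np.log(np.full((2, 2), 0.5))  # uniform transitions

# every decoded state must last at least 3 consecutive frames
states = viterbi_decoding(emission, transition, consecutive=3)
print(states)   # the isolated flips at frames 2 and 8 are smoothed away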
sad_tf/viterbi_utils.py DELETED
@@ -1,49 +0,0 @@
1
- #!/usr/bin/env python
2
- # encoding: utf-8
3
-
4
- # The MIT License
5
-
6
- # Copyright (c) 2018 Ina (David Doukhan - http://www.ina.fr/)
7
-
8
- # Permission is hereby granted, free of charge, to any person obtaining a copy
9
- # of this software and associated documentation files (the "Software"), to deal
10
- # in the Software without restriction, including without limitation the rights
11
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
- # copies of the Software, and to permit persons to whom the Software is
13
- # furnished to do so, subject to the following conditions:
14
-
15
- # The above copyright notice and this permission notice shall be included in
16
- # all copies or substantial portions of the Software.
17
-
18
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24
- # THE SOFTWARE.
25
-
26
- import numpy as np
27
-
28
-
29
- def pred2logemission(pred, eps=1e-10):
30
- pred = np.array(pred)
31
- ret = np.ones((len(pred), 2)) * eps
32
- ret[pred == 0, 0] = 1 - eps
33
- ret[pred == 1, 1] = 1 - eps
34
- return np.log(ret)
35
-
36
- def log_trans_exp(exp,cost0=0, cost1=0):
37
- # transition cost is assumed to be 10**-exp
38
- cost = -exp * np.log(10)
39
- ret = np.ones((2,2)) * cost
40
- ret[0,0]= cost0
41
- ret[1,1]= cost1
42
- return ret
43
-
44
- def diag_trans_exp(exp, dim):
45
- cost = -exp * np.log(10)
46
- ret = np.ones((dim, dim)) * cost
47
- for i in range(dim):
48
- ret[i, i] = 0
49
- return ret
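Finally, a sketch of how these helpers feed the decoder above: pred2logemission turns binary frame decisions into log-emissions, and log_trans_exp builds a transition matrix whose off-diagonal cost of 10**-exp discourages switching (module names assumed):

# Sketch: smooth raw speech/non-speech predictions with Viterbi decoding.
from viterbi import viterbi_decoding  # hypothetical module name
from viterbi_utils import pred2logemission, log_trans_exp

pred = [0, 1, 0, 0, 1, 1, 1, 0, 1]   # raw frame-level decisions
emission = pred2logemission(pred)    # shape (n_frames, 2)
transition = log_trans_exp(10)       # switching prob ~ 10**-10
smoothed = viterbi_decoding(emission, transition)
print(smoothed)   # expected: lone flips absorbed into the surrounding runs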