Dominik Macháček committed
Commit · ef08538
Parent(s): 99aef35
buffer trimming options + most recommendable default
whisper_online.py CHANGED (+19 -33)
@@ -212,7 +212,7 @@ class OnlineASRProcessor:
 
     SAMPLING_RATE = 16000
 
-    def __init__(self, asr, tokenizer, logfile=sys.stderr):
+    def __init__(self, asr, tokenizer=None, logfile=sys.stderr, buffer_trimming=("segment", 15)):
         """asr: WhisperASR object
         tokenizer: sentence tokenizer object for the target language. Must have a method *split* that behaves like the one of MosesTokenizer.
         logfile: where to store the log.
@@ -223,6 +223,8 @@ class OnlineASRProcessor:
 
         self.init()
 
+        self.buffer_trimming_way, self.buffer_trimming_sec = buffer_trimming
+
     def init(self):
         """run this when starting or restarting processing"""
         self.audio_buffer = np.array([],dtype=np.float32)
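The first element of the buffer_trimming tuple selects the trimming strategy ("sentence" or "segment") and the second one the buffer-length threshold in seconds. Since tokenizer now defaults to None, it effectively only needs to be passed for the "sentence" strategy. A minimal sketch of the two configurations, assuming whisper_online.py is importable as a module and that asr and tgt_language (a WhisperASR backend object and its target language code) already exist:

from whisper_online import OnlineASRProcessor, create_tokenizer  # assumed import path

# "segment" strategy: trim on Whisper-labeled segments; no sentence tokenizer needed.
# This mirrors the new default buffer_trimming=("segment", 15).
online_segment = OnlineASRProcessor(asr, buffer_trimming=("segment", 15))

# "sentence" strategy: trim on completed sentences, so a sentence tokenizer is required.
online_sentence = OnlineASRProcessor(
    asr,
    create_tokenizer(tgt_language),       # sentence tokenizer for the target language
    buffer_trimming=("sentence", 15),     # trim once the audio buffer exceeds 15 s
)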
@@ -278,36 +280,18 @@ class OnlineASRProcessor:
             print("INCOMPLETE:",self.to_flush(self.transcript_buffer.complete()),file=self.logfile,flush=True)
 
         # there is a newly confirmed text
-
-
-        self.
-
-
-
-
-        #
-
-
-
-
-
-        # elif self.transcript_buffer.complete():
-        #     self.silence_iters = 0
-        # elif not self.transcript_buffer.complete():
-        # #    print("NOT COMPLETE:",to_flush(self.transcript_buffer.complete()),file=self.logfile,flush=True)
-        #     self.silence_iters += 1
-        #     if self.silence_iters >= 3:
-        #         n = self.last_chunked_at
-        ##        self.chunk_completed_sentence()
-        ##        if n == self.last_chunked_at:
-        #         self.chunk_at(self.last_chunked_at+self.chunk)
-        #         print(f"\tCHUNK: 3-times silence! chunk_at {n}+{self.chunk}",file=self.logfile)
-        ##        self.silence_iters = 0
-
-
-        # if the audio buffer is longer than 30s, trim it...
-        if len(self.audio_buffer)/self.SAMPLING_RATE > 30:
-            # ...on the last completed segment (labeled by Whisper)
+
+        if o and self.buffer_trimming_way == "sentence": # trim the completed sentences
+            if len(self.audio_buffer)/self.SAMPLING_RATE > self.buffer_trimming_sec: # longer than this
+                self.chunk_completed_sentence()
+
+
+        if self.buffer_trimming_way == "segment":
+            s = self.buffer_trimming_sec # trim the completed segments longer than s,
+        else:
+            s = 30 # if the audio buffer is longer than 30s, trim it
+
+        if len(self.audio_buffer)/self.SAMPLING_RATE > s:
             self.chunk_completed_segment(res)
 
         # alternative: on any word
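Restating the new logic: with the "sentence" strategy, completed sentences are trimmed only when there is newly confirmed text (o) and the audio buffer has grown past buffer_trimming_sec; completed-segment trimming is then triggered past buffer_trimming_sec for the "segment" strategy, or past the previous hard limit of 30 seconds otherwise. A standalone sketch of that decision, using a hypothetical should_trim helper and plain values in place of the object's state:

def should_trim(buffer_seconds, way, trimming_sec, new_text=True):
    """Illustrative restatement of the trimming decision; not part of whisper_online.py.

    Returns (trim_completed_sentence, trim_completed_segment)."""
    # "sentence" strategy: only when there is newly confirmed text and the buffer is long enough
    trim_sentence = new_text and way == "sentence" and buffer_seconds > trimming_sec

    # segment trimming threshold: the configured value for "segment", else the old 30 s limit
    s = trimming_sec if way == "segment" else 30
    trim_segment = buffer_seconds > s

    return trim_sentence, trim_segment

print(should_trim(20, "segment", 15))   # (False, True)
print(should_trim(20, "sentence", 15))  # (True, False)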
@@ -317,7 +301,7 @@ class OnlineASRProcessor:
             #while k>0 and self.commited[k][1] > l:
             #    k -= 1
             #t = self.commited[k][1]
-            print(f"chunking
+            print(f"chunking segment",file=self.logfile)
             #self.chunk_at(t)
 
         print(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}",file=self.logfile)
@@ -477,6 +461,8 @@ if __name__ == "__main__":
     parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
     parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')
     parser.add_argument('--vad', action="store_true", default=False, help='Use VAD = voice activity detection, with the default parameters.')
+    parser.add_argument('--buffer_trimming', type=str, default="sentence", choices=["sentence", "segment"], help='Buffer trimming strategy.')
+    parser.add_argument('--buffer_trimming_sec', type=float, default=15, help='Buffer trimming length threshold in seconds. If the buffer is longer, sentence/segment trimming is triggered.')
     args = parser.parse_args()
 
     # reset to store stderr to different file stream, e.g. open(os.devnull,"w")
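These two flags feed directly into the constructor call further below. A self-contained sketch of how the parsed options become the buffer_trimming tuple; the simulated command-line values are illustrative:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--buffer_trimming', type=str, default="sentence",
                    choices=["sentence", "segment"], help='Buffer trimming strategy.')
parser.add_argument('--buffer_trimming_sec', type=float, default=15,
                    help='Buffer trimming length threshold in seconds.')

# Simulate a command line such as: --buffer_trimming segment --buffer_trimming_sec 20
args = parser.parse_args(["--buffer_trimming", "segment", "--buffer_trimming_sec", "20"])

# The tuple that is later passed as OnlineASRProcessor(..., buffer_trimming=...)
buffer_trimming = (args.buffer_trimming, args.buffer_trimming_sec)
print(buffer_trimming)   # ('segment', 20.0)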
@@ -521,7 +507,7 @@ if __name__ == "__main__":
 
 
     min_chunk = args.min_chunk_size
-    online = OnlineASRProcessor(asr,create_tokenizer(tgt_language),logfile=logfile)
+    online = OnlineASRProcessor(asr,create_tokenizer(tgt_language),logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
 
 
     # load the audio into the LRU cache before we start the timer
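The rest of the pipeline is untouched by this commit. For context, a minimal end-to-end sketch under stated assumptions: whisper_online.py is importable as a module, FasterWhisperASR("en", "large-v2") is a valid backend constructor, load_audio returns 16 kHz float32 audio, the 1-second feeding step is an arbitrary choice, and the (start, end, text) return shape of process_iter and finish follows my reading of the rest of the file:

from whisper_online import FasterWhisperASR, OnlineASRProcessor, load_audio

asr = FasterWhisperASR("en", "large-v2")            # Whisper backend (assumed signature)
online = OnlineASRProcessor(asr, buffer_trimming=("segment", 15))

audio = load_audio("sample.wav")                    # hypothetical 16 kHz mono input
step = 16000                                        # 1 s at SAMPLING_RATE = 16000

for start in range(0, len(audio), step):
    online.insert_audio_chunk(audio[start:start + step])
    beg, end, text = online.process_iter()          # confirmed output; text is "" if nothing new
    if text:
        print(f"{beg:.2f} {end:.2f} {text}")

beg, end, text = online.finish()                    # flush whatever is still unconfirmed
if text:
    print(f"{beg:.2f} {end:.2f} {text}")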