Commit 97a4ebd · Parent: 2ba48bc
Construct an explicit logger rather than using the root logger

Changed files:
- whisper_online.py  +31 -29
- whisper_online_server.py  +7 -12
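The change follows the standard library logging idiom: each module constructs its own named logger with logging.getLogger(__name__) and emits records through it, instead of calling the module-level logging.debug()/logging.info() helpers, which all write to the root logger and cannot be tuned per module. A minimal sketch of the pattern (the do_work function and the handler setup are illustrative, not taken from this commit):

    import logging

    # Named, module-level logger: records carry this module's name rather than "root",
    # so a host application can raise, lower, or silence this module independently.
    logger = logging.getLogger(__name__)

    def do_work():
        logger.debug("fine-grained detail")
        logger.info("progress message")

    if __name__ == "__main__":
        # The application, not the library module, decides how records are rendered.
        logging.basicConfig(format="%(name)s %(levelname)s: %(message)s", level=logging.INFO)
        logging.getLogger(__name__).setLevel(logging.DEBUG)  # opt in to DEBUG for this module only
        do_work()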
whisper_online.py
CHANGED

@@ -11,6 +11,8 @@ import io
import soundfile as sf
import math

+ logger = logging.getLogger(__name__)
+
@lru_cache
def load_audio(fname):
    a, _ = librosa.load(fname, sr=16000, dtype=np.float32)

@@ -65,7 +67,7 @@ class WhisperTimestampedASR(ASRBase):
from whisper_timestamped import transcribe_timestamped
self.transcribe_timestamped = transcribe_timestamped
if model_dir is not None:
-
+    logger.debug("ignoring model_dir, not implemented")
return whisper.load_model(modelsize, download_root=cache_dir)

def transcribe(self, audio, init_prompt=""):

@@ -106,7 +108,7 @@ class FasterWhisperASR(ASRBase):
from faster_whisper import WhisperModel
logging.getLogger("faster_whisper").setLevel(logging.WARNING)
if model_dir is not None:
-
+    logger.debug(f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used.")
    model_size_or_path = model_dir
elif modelsize is not None:
    model_size_or_path = modelsize

@@ -229,7 +231,7 @@ class OpenaiApiASR(ASRBase):

# Process transcription/translation
transcript = proc.create(**params)
-
+ logger.debug(f"OpenAI API processed accumulated {self.transcribed_seconds} seconds")

return transcript

@@ -276,7 +278,7 @@ class HypothesisBuffer:
for j in range(i):
    words.append(repr(self.new.pop(0)))
words_msg = "\t".join(words)
-
+ logger.debug(f"removing last {i} words: {words_msg}")
break

def flush(self):

@@ -365,9 +367,9 @@ class OnlineASRProcessor:
"""

prompt, non_prompt = self.prompt()
-
-
-
+ logger.debug(f"PROMPT: {prompt}")
+ logger.debug(f"CONTEXT: {non_prompt}")
+ logger.debug(f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}")
res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt)

# transform to [(beg,end,"word1"), ...]

@@ -377,9 +379,9 @@ class OnlineASRProcessor:
o = self.transcript_buffer.flush()
self.commited.extend(o)
completed = self.to_flush(o)
-
+ logger.debug(f">>>>COMPLETE NOW: {completed}")
the_rest = self.to_flush(self.transcript_buffer.complete())
-
+ logger.debug(f"INCOMPLETE: {the_rest}")

# there is a newly confirmed text

@@ -403,18 +405,18 @@ class OnlineASRProcessor:
#while k>0 and self.commited[k][1] > l:
#    k -= 1
#t = self.commited[k][1]
-
+ logger.debug(f"chunking segment")
#self.chunk_at(t)

-
+ logger.debug(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}")
return self.to_flush(o)

def chunk_completed_sentence(self):
    if self.commited == []: return
-
+    logger.debug(self.commited)
    sents = self.words_to_sentences(self.commited)
    for s in sents:
-
+        logger.debug(f"\t\tSENT: {s}")
    if len(sents) < 2:
        return
    while len(sents) > 2:

@@ -422,7 +424,7 @@ class OnlineASRProcessor:
# we will continue with audio processing at this timestamp
chunk_at = sents[-2][1]

-
+ logger.debug(f"--- sentence chunked at {chunk_at:2.2f}")
self.chunk_at(chunk_at)

def chunk_completed_segment(self, res):

@@ -439,12 +441,12 @@ class OnlineASRProcessor:
ends.pop(-1)
e = ends[-2]+self.buffer_time_offset
if e <= t:
-
+    logger.debug(f"--- segment chunked at {e:2.2f}")
    self.chunk_at(e)
else:
-
+    logger.debug(f"--- last segment not within commited area")
else:
-
+    logger.debug(f"--- not enough segments to chunk")

@@ -490,7 +492,7 @@ class OnlineASRProcessor:
"""
o = self.transcript_buffer.complete()
f = self.to_flush(o)
-
+ logger.debug("last, noncommited: {f}")
return f

@@ -530,7 +532,7 @@ def create_tokenizer(lan):

# the following languages are in Whisper, but not in wtpsplit:
if lan in "as ba bo br bs fo haw hr ht jw lb ln lo mi nn oc sa sd sn so su sw tk tl tt".split():
-
+    logger.debug(f"{lan} code is not supported by wtpsplit. Going to use None lang_code option.")
    lan = None

from wtpsplit import WtP

@@ -563,7 +565,7 @@ def asr_factory(args, logfile=sys.stderr):
"""
backend = args.backend
if backend == "openai-api":
-
+    logger.debug("Using OpenAI API.")
    asr = OpenaiApiASR(lan=args.lan)
else:
    if backend == "faster-whisper":

@@ -574,14 +576,14 @@ def asr_factory(args, logfile=sys.stderr):
# Only for FasterWhisperASR and WhisperTimestampedASR
size = args.model
t = time.time()
-
+ logger.debug(f"Loading Whisper {size} model for {args.lan}...")
asr = asr_cls(modelsize=size, lan=args.lan, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
e = time.time()
-
+ logger.debug(f"done. It took {round(e-t,2)} seconds.")

# Apply common configurations
if getattr(args, 'vad', False): # Checks if VAD argument is present and True
-
+    logger.info("Setting VAD filter")
    asr.use_vad()

language = args.lan

@@ -619,14 +621,14 @@ if __name__ == "__main__":
logfile = sys.stderr

if args.offline and args.comp_unaware:
-
+    logger.error("No or one option from --offline and --comp_unaware are available, not both. Exiting.")
    sys.exit(1)

audio_path = args.audio_path

SAMPLING_RATE = 16000
duration = len(load_audio(audio_path))/SAMPLING_RATE
-
+ logger.info("Audio duration is: %2.2f seconds" % duration)

asr, online = asr_factory(args, logfile=logfile)
min_chunk = args.min_chunk_size

@@ -674,12 +676,12 @@ if __name__ == "__main__":
try:
    o = online.process_iter()
except AssertionError as e:
-
+    logger.error(f"assertion error: {repr(e)}")
    pass
else:
    output_transcript(o, now=end)

-
+ logger.debug(f"## last processed {end:.2f}s")

if end >= duration:
    break

@@ -706,12 +708,12 @@ if __name__ == "__main__":
try:
    o = online.process_iter()
except AssertionError as e:
-
+    logger.error(f"assertion error: {e}")
    pass
else:
    output_transcript(o)
now = time.time() - start
-
+ logger.debug(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}")

if end >= duration:
    break
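Because every message in whisper_online.py now flows through the module's named logger, a host script can control its verbosity without affecting other libraries. A hypothetical driver-side configuration (only asr_factory comes from the file above; the rest is an assumed setup, not part of this commit):

    import logging
    import sys

    # Assumed host-side setup: one stderr handler, per-module levels.
    logging.basicConfig(stream=sys.stderr, format="%(levelname)s\t%(name)s\t%(message)s")
    logging.getLogger("whisper_online").setLevel(logging.DEBUG)    # show PROMPT/CONTEXT/chunking messages
    logging.getLogger("faster_whisper").setLevel(logging.WARNING)  # keep the backend quiet

    # from whisper_online import asr_factory
    # asr, online = asr_factory(args)  # the debug messages above now go to stderr

Note that the logger created in the diff is named after __name__, so it is "whisper_online" only when the file is imported; when the script is run directly the same messages are emitted under the "__main__" logger.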
whisper_online_server.py
CHANGED

@@ -7,6 +7,8 @@ import os
import logging
import numpy as np

+ logger = logging.getLogger(__name__)
+ print(__name__)
parser = argparse.ArgumentParser()

# server options

@@ -38,13 +40,6 @@ language = args.lan
asr, online = asr_factory(args)
min_chunk = args.min_chunk_size

-
- if args.buffer_trimming == "sentence":
-     tokenizer = create_tokenizer(tgt_language)
- else:
-     tokenizer = None
- online = OnlineASRProcessor(asr,tokenizer,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
-
# warm up the ASR because the very first transcribe takes more time than the others.
# Test results in https://github.com/ufal/whisper_streaming/pull/81
msg = "Whisper is not warmed up. The first chunk processing may take longer."

@@ -161,7 +156,7 @@ class ServerProcessor:
try:
    self.send_result(o)
except BrokenPipeError:
-
+    logger.info("broken pipe -- connection closed?")
    break

# o = online.finish() # this should be working

@@ -175,13 +170,13 @@ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.bind((args.host, args.port))
s.listen(1)
-
+ logger.info('Listening on'+str((args.host, args.port)))
while True:
    conn, addr = s.accept()
-
+    logger.info('Connected to client on {}'.format(addr))
    connection = Connection(conn)
    proc = ServerProcessor(connection, online, min_chunk)
    proc.process()
    conn.close()
-
-
+    logger.info('Connection to client closed')
+ logger.info('Connection closed, terminating.')
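One caveat for the server: the new logger.info() calls only produce output once a handler is configured, because Python's last-resort handler prints WARNING and above only. A minimal, assumed setup for the server entry point (not part of this commit):

    import logging
    import sys

    # Assumed one-time configuration; the format string is illustrative.
    logging.basicConfig(stream=sys.stderr, level=logging.INFO,
                        format="%(asctime)s %(name)s %(levelname)s: %(message)s")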