File size: 6,879 Bytes
d543411 6fa0080 d543411 6fa0080 d543411 6fa0080 d543411 6fa0080 d543411 6fa0080 d543411 6fa0080 d543411 6fa0080 d543411 6fa0080 d543411 c812334 6fa0080 d543411 6fa0080 d543411 6fa0080 d543411 6fa0080 d543411 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
from whisper_online import *
from voice_activity_controller import *
import soundfile
import io
SAMPLING_RATE = 16000
class VACOnlineASRProcessor(OnlineASRProcessor):
def __init__(self, online_chunk_size, *a, **kw):
self.online_chunk_size = online_chunk_size
self.online = OnlineASRProcessor(*a, **kw)
self.vac = VoiceActivityController(use_vad_result = False)
self.logfile = self.online.logfile
self.init()
def init(self):
self.online.init()
self.vac.reset_states()
self.current_online_chunk_buffer_size = 0
self.is_currently_final = False
def insert_audio_chunk(self, audio):
r = self.vac.detect_speech_iter(audio,audio_in_int16=False)
audio, is_final = r
print(is_final)
self.is_currently_final = is_final
self.online.insert_audio_chunk(audio)
self.current_online_chunk_buffer_size += len(audio)
def process_iter(self):
if self.is_currently_final:
return self.finish()
elif self.current_online_chunk_buffer_size > SAMPLING_RATE*self.online_chunk_size:
self.current_online_chunk_buffer_size = 0
ret = self.online.process_iter()
return ret
else:
print("no online update, only VAD", file=self.logfile)
return (None, None, "")
def finish(self):
ret = self.online.finish()
self.online.init(keep_offset=True)
self.current_online_chunk_buffer_size = 0
return ret
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('audio_path', type=str, help="Filename of 16kHz mono channel wav, on which live streaming is simulated.")
add_shared_args(parser)
parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')
parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')
args = parser.parse_args()
# reset to store stderr to different file stream, e.g. open(os.devnull,"w")
logfile = sys.stderr
if args.offline and args.comp_unaware:
print("No or one option from --offline and --comp_unaware are available, not both. Exiting.",file=logfile)
sys.exit(1)
audio_path = args.audio_path
SAMPLING_RATE = 16000
duration = len(load_audio(audio_path))/SAMPLING_RATE
print("Audio duration is: %2.2f seconds" % duration, file=logfile)
size = args.model
language = args.lan
t = time.time()
print(f"Loading Whisper {size} model for {language}...",file=logfile,end=" ",flush=True)
if args.backend == "faster-whisper":
asr_cls = FasterWhisperASR
else:
asr_cls = WhisperTimestampedASR
asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
if args.task == "translate":
asr.set_translate_task()
tgt_language = "en" # Whisper translates into English
else:
tgt_language = language # Whisper transcribes in this language
e = time.time()
print(f"done. It took {round(e-t,2)} seconds.",file=logfile)
if args.vad:
print("setting VAD filter",file=logfile)
asr.use_vad()
min_chunk = args.vac_chunk_size
if args.buffer_trimming == "sentence":
tokenizer = create_tokenizer(tgt_language)
else:
tokenizer = None
online = VACOnlineASRProcessor(args.min_chunk_size, asr,tokenizer,logfile=logfile,buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
# load the audio into the LRU cache before we start the timer
a = load_audio_chunk(audio_path,0,1)
# warm up the ASR, because the very first transcribe takes much more time than the other
asr.transcribe(a)
beg = args.start_at
start = time.time()-beg
def output_transcript(o, now=None):
# output format in stdout is like:
# 4186.3606 0 1720 Takhle to je
# - the first three words are:
# - emission time from beginning of processing, in milliseconds
# - beg and end timestamp of the text segment, as estimated by Whisper model. The timestamps are not accurate, but they're useful anyway
# - the next words: segment transcript
if now is None:
now = time.time()-start
if o[0] is not None:
print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),file=logfile,flush=True)
print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000,o[1]*1000,o[2]),flush=True)
else:
print(o,file=logfile,flush=True)
if args.offline: ## offline mode processing (for testing/debugging)
a = load_audio(audio_path)
online.insert_audio_chunk(a)
try:
o = online.process_iter()
except AssertionError:
print("assertion error",file=logfile)
pass
else:
output_transcript(o)
now = None
elif args.comp_unaware: # computational unaware mode
end = beg + min_chunk
while True:
a = load_audio_chunk(audio_path,beg,end)
online.insert_audio_chunk(a)
try:
o = online.process_iter()
except AssertionError:
print("assertion error",file=logfile)
pass
else:
output_transcript(o, now=end)
print(f"## last processed {end:.2f}s",file=logfile,flush=True)
if end >= duration:
break
beg = end
if end + min_chunk > duration:
end = duration
else:
end += min_chunk
now = duration
else: # online = simultaneous mode
end = 0
while True:
now = time.time() - start
if now < end+min_chunk:
time.sleep(min_chunk+end-now)
end = time.time() - start
a = load_audio_chunk(audio_path,beg,end)
beg = end
online.insert_audio_chunk(a)
try:
o = online.process_iter()
except AssertionError:
print("assertion error",file=logfile)
pass
else:
output_transcript(o)
now = time.time() - start
print(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}",file=logfile,flush=True)
if end >= duration:
break
now = None
o = online.finish()
output_transcript(o, now=now)
|