Dominik Macháček
committed on
Commit
·
2625be1
1
Parent(s):
260b1f8
Ukrainian tokenizer support
Browse files- whisper_online.py +17 -9
- whisper_online_server.py +4 -1
whisper_online.py
CHANGED
|
@@ -4,7 +4,7 @@ import numpy as np
|
|
| 4 |
import librosa
|
| 5 |
from functools import lru_cache
|
| 6 |
import time
|
| 7 |
-
|
| 8 |
|
| 9 |
|
| 10 |
@lru_cache
|
|
@@ -207,14 +207,12 @@ class OnlineASRProcessor:
|
|
| 207 |
|
| 208 |
SAMPLING_RATE = 16000
|
| 209 |
|
| 210 |
-
def __init__(self,
|
| 211 |
-
"""
|
| 212 |
-
|
| 213 |
-
chunk: number of seconds for intended size of audio interval that is inserted and looped
|
| 214 |
"""
|
| 215 |
-
self.language = language
|
| 216 |
self.asr = asr
|
| 217 |
-
self.tokenizer =
|
| 218 |
|
| 219 |
self.init()
|
| 220 |
|
|
@@ -369,7 +367,7 @@ class OnlineASRProcessor:
|
|
| 369 |
self.last_chunked_at = time
|
| 370 |
|
| 371 |
def words_to_sentences(self, words):
|
| 372 |
-
"""Uses
|
| 373 |
Returns: [(beg,end,"sentence 1"),...]
|
| 374 |
"""
|
| 375 |
|
|
@@ -419,6 +417,15 @@ class OnlineASRProcessor:
|
|
| 419 |
return (b,e,t)
|
| 420 |
|
| 421 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
|
| 423 |
## main:
|
| 424 |
|
|
@@ -482,8 +489,9 @@ if __name__ == "__main__":
|
|
| 482 |
print("setting VAD filter",file=sys.stderr)
|
| 483 |
asr.use_vad()
|
| 484 |
|
|
|
|
| 485 |
min_chunk = args.min_chunk_size
|
| 486 |
-
online = OnlineASRProcessor(tgt_language
|
| 487 |
|
| 488 |
|
| 489 |
# load the audio into the LRU cache before we start the timer
|
|
|
|
| 4 |
import librosa
|
| 5 |
from functools import lru_cache
|
| 6 |
import time
|
| 7 |
+
|
| 8 |
|
| 9 |
|
| 10 |
@lru_cache
|
|
|
|
| 207 |
|
| 208 |
SAMPLING_RATE = 16000
|
| 209 |
|
| 210 |
+
def __init__(self, asr, tokenizer):
|
| 211 |
+
"""asr: WhisperASR object
|
| 212 |
+
tokenizer: sentence tokenizer object for the target language. Must have a method *split* that behaves like the one of MosesTokenizer.
|
|
|
|
| 213 |
"""
|
|
|
|
| 214 |
self.asr = asr
|
| 215 |
+
self.tokenizer = tokenizer
|
| 216 |
|
| 217 |
self.init()
|
| 218 |
|
|
|
|
| 367 |
self.last_chunked_at = time
|
| 368 |
|
| 369 |
def words_to_sentences(self, words):
|
| 370 |
+
"""Uses self.tokenizer for sentence segmentation of words.
|
| 371 |
Returns: [(beg,end,"sentence 1"),...]
|
| 372 |
"""
|
| 373 |
|
|
|
|
| 417 |
return (b,e,t)
|
| 418 |
|
| 419 |
|
| 420 |
+
def create_tokenizer(lan):
|
| 421 |
+
if lan == "uk":
|
| 422 |
+
import tokenize_uk
|
| 423 |
+
class UkrainianTokenizer:
|
| 424 |
+
def split(self, text):
|
| 425 |
+
return tokenize_uk.tokenize_sents(text)
|
| 426 |
+
return UkrainianTokenizer()
|
| 427 |
+
from mosestokenizer import MosesTokenizer
|
| 428 |
+
return MosesTokenizer(lan)
|
| 429 |
|
| 430 |
## main:
|
| 431 |
|
|
|
|
| 489 |
print("setting VAD filter",file=sys.stderr)
|
| 490 |
asr.use_vad()
|
| 491 |
|
| 492 |
+
|
| 493 |
min_chunk = args.min_chunk_size
|
| 494 |
+
online = OnlineASRProcessor(asr,create_tokenizer(tgt_language))
|
| 495 |
|
| 496 |
|
| 497 |
# load the audio into the LRU cache before we start the timer
|
whisper_online_server.py
CHANGED
|
@@ -48,6 +48,9 @@ asr = asr_cls(modelsize=size, lan=language, cache_dir=args.model_cache_dir, mode
|
|
| 48 |
|
| 49 |
if args.task == "translate":
|
| 50 |
asr.set_translate_task()
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
e = time.time()
|
| 53 |
print(f"done. It took {round(e-t,2)} seconds.",file=sys.stderr)
|
|
@@ -58,7 +61,7 @@ if args.vad:
|
|
| 58 |
|
| 59 |
|
| 60 |
min_chunk = args.min_chunk_size
|
| 61 |
-
online = OnlineASRProcessor(
|
| 62 |
|
| 63 |
|
| 64 |
|
|
|
|
| 48 |
|
| 49 |
if args.task == "translate":
|
| 50 |
asr.set_translate_task()
|
| 51 |
+
tgt_language = "en"
|
| 52 |
+
else:
|
| 53 |
+
tgt_language = language
|
| 54 |
|
| 55 |
e = time.time()
|
| 56 |
print(f"done. It took {round(e-t,2)} seconds.",file=sys.stderr)
|
|
|
|
| 61 |
|
| 62 |
|
| 63 |
min_chunk = args.min_chunk_size
|
| 64 |
+
online = OnlineASRProcessor(asr,create_tokenizer(tgt_language))
|
| 65 |
|
| 66 |
|
| 67 |
|