Fix CLI for parallel devices
- app.py +4 -1
- cli.py +8 -4
- src/vadParallel.py +12 -5
- src/whisperContainer.py +3 -2
app.py
CHANGED

@@ -60,6 +60,9 @@ class WhisperTranscriber:
         self.inputAudioMaxDuration = input_audio_max_duration
         self.deleteUploadedFiles = delete_uploaded_files
 
+    def set_parallel_devices(self, vad_parallel_devices: str):
+        self.parallel_device_list = [ device.strip() for device in vad_parallel_devices.split(",") ] if vad_parallel_devices else None
+
     def transcribe_webui(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow):
         try:
             source, sourceName = self.__get_source(urlData, uploadFile, microphoneData)

@@ -255,7 +258,7 @@ def create_ui(input_audio_max_duration, share=False, server_name: str = None, se
     ui = WhisperTranscriber(input_audio_max_duration, vad_process_timeout)
 
     # Specify a list of devices to use for parallel processing
-    ui.
+    ui.set_parallel_devices(vad_parallel_devices)
 
     ui_description = "Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse "
     ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
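For context, the new set_parallel_devices method only normalizes the comma-delimited device string into a list, falling back to None when the string is empty (which disables parallel processing). A minimal standalone sketch of that parsing logic, with parse_parallel_devices as a hypothetical free-function stand-in for the method:

# Hypothetical stand-in for WhisperTranscriber.set_parallel_devices in app.py,
# showing only the parsing behaviour of the new method.
def parse_parallel_devices(vad_parallel_devices: str):
    # "0, 1,2" -> ["0", "1", "2"]; "" or None -> None (parallel processing disabled)
    return [device.strip() for device in vad_parallel_devices.split(",")] if vad_parallel_devices else None

assert parse_parallel_devices("0, 1,2") == ["0", "1", "2"]
assert parse_parallel_devices("") is None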
cli.py
CHANGED

@@ -12,6 +12,7 @@ from app import LANGUAGES, WhisperTranscriber
 from src.download import download_url
 
 from src.utils import optional_float, optional_int, str2bool
+from src.whisperContainer import WhisperContainer
 
 
 def cli():

@@ -31,7 +32,7 @@ def cli():
     parser.add_argument("--vad_max_merge_size", type=optional_float, default=30, help="The maximum size (in seconds) of a voice segment")
     parser.add_argument("--vad_padding", type=optional_float, default=1, help="The padding (in seconds) to add to each voice segment")
     parser.add_argument("--vad_prompt_window", type=optional_float, default=3, help="The window size of the prompt to pass to Whisper")
-    parser.add_argument("--vad_parallel_devices", type=str, default="
+    parser.add_argument("--vad_parallel_devices", type=str, default="", help="A comma delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing.")
 
     parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
     parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")

@@ -73,9 +74,12 @@ def cli():
     vad_padding = args.pop("vad_padding")
     vad_prompt_window = args.pop("vad_prompt_window")
 
-    model = 
+    model = WhisperContainer(model_name, device=device, download_root=model_dir)
     transcriber = WhisperTranscriber(delete_uploaded_files=False)
-    transcriber.
+    transcriber.set_parallel_devices(args.pop("vad_parallel_devices"))
+
+    if (transcriber._has_parallel_devices()):
+        print("Using parallel devices:", transcriber.parallel_device_list)
 
     for audio_path in args.pop("audio"):
         sources = []

@@ -99,7 +103,7 @@ def cli():
 
         transcriber.write_result(result, source_name, output_dir)
 
-    transcriber.
+    transcriber.close()
 
 def uri_validator(x):
     try:
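Note that the new flag is consumed with args.pop before the remaining arguments are forwarded to Whisper, so it never reaches the transcribe call itself. A small sketch of that wiring, using argparse directly with a hard-coded argv list for illustration:

import argparse

# Sketch of how --vad_parallel_devices flows through cli.py; the argv list
# below stands in for a real command line such as:
#   python cli.py --vad_parallel_devices 0,1 audio.mp3
parser = argparse.ArgumentParser()
parser.add_argument("--vad_parallel_devices", type=str, default="")
parser.add_argument("audio", nargs="+")

args = vars(parser.parse_args(["--vad_parallel_devices", "0,1", "audio.mp3"]))
devices = args.pop("vad_parallel_devices")  # handled here, not passed to Whisper
print(devices)                              # "0,1" -> set_parallel_devices splits it
print(args)                                 # {'audio': ['audio.mp3']}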
src/vadParallel.py
CHANGED

@@ -88,14 +88,20 @@ class ParallelTranscription(AbstractTranscription):
 
         # Split into a list for each device
         # TODO: Split by time instead of by number of chunks
-        merged_split = self.
+        merged_split = list(self._split(merged, len(devices)))
 
         # Parameters that will be passed to the transcribe function
         parameters = []
         segment_index = config.initial_segment_index
 
         for i in range(len(merged_split)):
-            device_segment_list = merged_split[i]
+            device_segment_list = list(merged_split[i])
+            device_id = devices[i]
+
+            if (len(device_segment_list) <= 0):
+                continue
+
+            print("Device " + device_id + " (index " + str(i) + ") has " + str(len(device_segment_list)) + " segments")
 
             # Create a new config with the given device ID
             device_config = ParallelTranscriptionConfig(devices[i], device_segment_list, segment_index, config)

@@ -159,7 +165,8 @@ class ParallelTranscription(AbstractTranscription):
         os.environ["CUDA_VISIBLE_DEVICES"] = config.device_id
         return super().transcribe(audio, whisperCallable, config)
 
-    def 
-        """
-
+    def _split(self, a, n):
+        """Split a list into n approximately equal parts."""
+        k, m = divmod(len(a), n)
+        return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n))
 
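The _split helper is the standard divmod trick for dividing a list into n contiguous, approximately equal parts: the first len(a) % n parts each receive one extra element. A standalone copy with a worked example:

# Free-function copy of ParallelTranscription._split from src/vadParallel.py.
def split(a, n):
    """Split a list into n approximately equal parts."""
    k, m = divmod(len(a), n)
    return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n))

# Ten merged segments over three devices: 10 = 3*3 + 1, so the first part
# gets the one extra segment.
print([list(part) for part in split(list(range(10)), 3)])
# -> [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]

When there are more devices than segments, the trailing parts come out as empty slices, which is why the loop above skips a device whenever its segment list is empty.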
src/whisperContainer.py
CHANGED

@@ -23,9 +23,10 @@ class WhisperModelCache:
 GLOBAL_WHISPER_MODEL_CACHE = WhisperModelCache()
 
 class WhisperContainer:
-    def __init__(self, model_name: str, device: str = None, cache: WhisperModelCache = None):
+    def __init__(self, model_name: str, device: str = None, download_root: str = None, cache: WhisperModelCache = None):
         self.model_name = model_name
         self.device = device
+        self.download_root = download_root
         self.cache = cache
 
         # Will be created on demand

@@ -36,7 +37,7 @@ class WhisperContainer:
 
         if (self.cache is None):
             print("Loading whisper model " + self.model_name)
-            self.model = whisper.load_model(self.model_name, device=self.device)
+            self.model = whisper.load_model(self.model_name, device=self.device, download_root=self.download_root)
         else:
             self.model = self.cache.get(self.model_name, device=self.device)
         return self.model
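The new download_root argument is forwarded straight to whisper.load_model, which accepts it as the directory for caching downloaded weights. An illustrative use, assuming this repository's layout, a hypothetical models/ directory, and that the accessor whose body appears in the second hunk is named get_model (its name is cut off in the diff):

from src.whisperContainer import WhisperContainer

# Cache the model weights under ./models instead of the default
# ~/.cache/whisper; the model itself is still loaded lazily, on first use.
container = WhisperContainer("medium", device="cuda", download_root="models")
model = container.get_model()  # triggers whisper.load_model(..., download_root="models")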