whisper-webui3

Paused

App Files Files Community

aadnk committed on Oct 15, 2022

Commit

883c794

1 Parent(s): 7f502b4

Refactor function names

Browse files

Also prepare code for creating a CLI

Files changed (5) hide show

app-local.py +2 -2
app-network.py +2 -2
app-shared.py +2 -2
app.py +50 -46
src/download.py +4 -4

app-local.py CHANGED Viewed

@@ -1,3 +1,3 @@
 # Run the app with no audio file restrictions
-from app import createUi
-createUi(-1)

 # Run the app with no audio file restrictions
+from app import create_ui
+create_ui(-1)

app-network.py CHANGED Viewed

@@ -1,3 +1,3 @@
 # Run the app with no audio file restrictions, and make it available on the network
-from app import createUi
-createUi(-1, server_name="0.0.0.0")

 # Run the app with no audio file restrictions, and make it available on the network
+from app import create_ui
+create_ui(-1, server_name="0.0.0.0")

app-shared.py CHANGED Viewed

@@ -1,3 +1,3 @@
 # Run the app with no audio file restrictions
-from app import createUi
-createUi(-1, share=True)

 # Run the app with no audio file restrictions
+from app import create_ui
+create_ui(-1, share=True)

app.py CHANGED Viewed

@@ -12,7 +12,7 @@ import ffmpeg
 # UI
 import gradio as gr
-from src.download import ExceededMaximumDuration, downloadUrl
 from src.utils import slugify, write_srt, write_vtt
 from src.vad import VadPeriodicTranscription, VadSileroTranscription
@@ -45,26 +45,27 @@ LANGUAGES = [
  "Hausa", "Bashkir", "Javanese", "Sundanese"
 ]
-model_cache = dict()
-class UI:
-    def __init__(self, inputAudioMaxDuration):
         self.vad_model = None
         self.inputAudioMaxDuration = inputAudioMaxDuration
-    def transcribeFile(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding):
         try:
-            source, sourceName = self.getSource(urlData, uploadFile, microphoneData)
             try:
                 selectedLanguage = languageName.lower() if len(languageName) > 0 else None
                 selectedModel = modelName if modelName is not None else "base"
-                model = model_cache.get(selectedModel, None)
                 if not model:
                     model = whisper.load_model(selectedModel)
-                    model_cache[selectedModel] = model
                 # Callable for processing an audio file
                 whisperCallable = lambda audio : model.transcribe(audio, language=selectedLanguage, task=task)
@@ -100,36 +101,39 @@ class UI:
                 text = result["text"]
                 language = result["language"]
-                languageMaxLineWidth = getMaxLineWidth(language)
                 print("Max line width " + str(languageMaxLineWidth))
-                vtt = getSubs(result["segments"], "vtt", languageMaxLineWidth)
-                srt = getSubs(result["segments"], "srt", languageMaxLineWidth)
                 # Files that can be downloaded
                 downloadDirectory = tempfile.mkdtemp()
                 filePrefix = slugify(sourceName, allow_unicode=True)
                 download = []
-                download.append(createFile(srt, downloadDirectory, filePrefix + "-subs.srt"));
-                download.append(createFile(vtt, downloadDirectory, filePrefix + "-subs.vtt"));
-                download.append(createFile(text, downloadDirectory, filePrefix + "-transcript.txt"));
                 return download, text, vtt
             finally:
                 # Cleanup source
-                if DELETE_UPLOADED_FILES:
                     print("Deleting source file " + source)
                     os.remove(source)
         except ExceededMaximumDuration as e:
             return [], ("[ERROR]: Maximum remote video length is " + str(e.maxDuration) + "s, file was " + str(e.videoDuration) + "s"), "[ERROR]"
-    def getSource(self, urlData, uploadFile, microphoneData):
         if urlData:
             # Download from YouTube
-            source = downloadUrl(urlData, self.inputAudioMaxDuration)
         else:
             # File input
             source = uploadFile if uploadFile is not None else microphoneData
@@ -146,38 +150,38 @@ class UI:
         return source, sourceName
-def getMaxLineWidth(language: str) -> int:
-    if (language and language.lower() in ["japanese", "ja", "chinese", "zh"]):
-        # Chinese characters and kana are wider, so limit line length to 40 characters
-        return 40
-    else:
-        # TODO: Add more languages
-        # 80 latin characters should fit on a 1080p/720p screen
-        return 80
-def createFile(text: str, directory: str, fileName: str) -> str:
-    # Write the text to a file
-    with open(os.path.join(directory, fileName), 'w+', encoding="utf-8") as file:
-        file.write(text)
-    return file.name
-def getSubs(segments: Iterator[dict], format: str, maxLineWidth: int) -> str:
-    segmentStream = StringIO()
-    if format == 'vtt':
-        write_vtt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
-    elif format == 'srt':
-        write_srt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
-    else:
-        raise Exception("Unknown format " + format)
-    segmentStream.seek(0)
-    return segmentStream.read()
-def createUi(inputAudioMaxDuration, share=False, server_name: str = None):
-    ui = UI(inputAudioMaxDuration)
     ui_description = "Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse "
     ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
@@ -188,9 +192,9 @@ def createUi(inputAudioMaxDuration, share=False, server_name: str = None):
     if inputAudioMaxDuration > 0:
         ui_description += "\n\n" + "Max audio file length: " + str(inputAudioMaxDuration) + " s"
-    ui_article = "Read the [documentation her](https://huggingface.co/spaces/aadnk/whisper-webui/blob/main/docs/options.md)"
-    demo = gr.Interface(fn=ui.transcribeFile, description=ui_description, article=ui_article, inputs=[
         gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value="medium", label="Model"),
         gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
         gr.Text(label="URL (YouTube, etc.)"),
@@ -210,4 +214,4 @@ def createUi(inputAudioMaxDuration, share=False, server_name: str = None):
     demo.launch(share=share, server_name=server_name)
 if __name__ == '__main__':
-    createUi(DEFAULT_INPUT_AUDIO_MAX_DURATION)

 # UI
 import gradio as gr
+from src.download import ExceededMaximumDuration, download_url
 from src.utils import slugify, write_srt, write_vtt
 from src.vad import VadPeriodicTranscription, VadSileroTranscription
  "Hausa", "Bashkir", "Javanese", "Sundanese"
 ]
+class WhisperTranscriber:
+    def __init__(self, inputAudioMaxDuration: float = DEFAULT_INPUT_AUDIO_MAX_DURATION, deleteUploadedFiles: bool = DELETE_UPLOADED_FILES):
+        self.model_cache = dict()
         self.vad_model = None
         self.inputAudioMaxDuration = inputAudioMaxDuration
+        self.deleteUploadedFiles = deleteUploadedFiles
+    def transcribe_file(self, modelName, languageName, urlData, uploadFile, microphoneData, task, vad, vadMergeWindow, vadMaxMergeSize, vadPadding):
         try:
+            source, sourceName = self.__get_source(urlData, uploadFile, microphoneData)
             try:
                 selectedLanguage = languageName.lower() if len(languageName) > 0 else None
                 selectedModel = modelName if modelName is not None else "base"
+                model = self.model_cache.get(selectedModel, None)
                 if not model:
                     model = whisper.load_model(selectedModel)
+                    self.model_cache[selectedModel] = model
                 # Callable for processing an audio file
                 whisperCallable = lambda audio : model.transcribe(audio, language=selectedLanguage, task=task)
                 text = result["text"]
                 language = result["language"]
+                languageMaxLineWidth = self.__get_max_line_width(language)
                 print("Max line width " + str(languageMaxLineWidth))
+                vtt = self.__get_subs(result["segments"], "vtt", languageMaxLineWidth)
+                srt = self.__get_subs(result["segments"], "srt", languageMaxLineWidth)
                 # Files that can be downloaded
                 downloadDirectory = tempfile.mkdtemp()
                 filePrefix = slugify(sourceName, allow_unicode=True)
                 download = []
+                download.append(self.__create_file(srt, downloadDirectory, filePrefix + "-subs.srt"));
+                download.append(self.__create_file(vtt, downloadDirectory, filePrefix + "-subs.vtt"));
+                download.append(self.__create_file(text, downloadDirectory, filePrefix + "-transcript.txt"));
                 return download, text, vtt
             finally:
                 # Cleanup source
+                if self.deleteUploadedFiles:
                     print("Deleting source file " + source)
                     os.remove(source)
         except ExceededMaximumDuration as e:
             return [], ("[ERROR]: Maximum remote video length is " + str(e.maxDuration) + "s, file was " + str(e.videoDuration) + "s"), "[ERROR]"
+    def clear_cache(self):
+        self.model_cache = dict()
+    def __get_source(self, urlData, uploadFile, microphoneData):
         if urlData:
             # Download from YouTube
+            source = download_url(urlData, self.inputAudioMaxDuration)
         else:
             # File input
             source = uploadFile if uploadFile is not None else microphoneData
         return source, sourceName
+    def __get_max_line_width(self, language: str) -> int:
+        if (language and language.lower() in ["japanese", "ja", "chinese", "zh"]):
+            # Chinese characters and kana are wider, so limit line length to 40 characters
+            return 40
+        else:
+            # TODO: Add more languages
+            # 80 latin characters should fit on a 1080p/720p screen
+            return 80
+    def __get_subs(self, segments: Iterator[dict], format: str, maxLineWidth: int) -> str:
+        segmentStream = StringIO()
+        if format == 'vtt':
+            write_vtt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
+        elif format == 'srt':
+            write_srt(segments, file=segmentStream, maxLineWidth=maxLineWidth)
+        else:
+            raise Exception("Unknown format " + format)
+        segmentStream.seek(0)
+        return segmentStream.read()
+    def __create_file(self, text: str, directory: str, fileName: str) -> str:
+        # Write the text to a file
+        with open(os.path.join(directory, fileName), 'w+', encoding="utf-8") as file:
+            file.write(text)
+        return file.name
+def create_ui(inputAudioMaxDuration, share=False, server_name: str = None):
+    ui = WhisperTranscriber(inputAudioMaxDuration)
     ui_description = "Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse "
     ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
     if inputAudioMaxDuration > 0:
         ui_description += "\n\n" + "Max audio file length: " + str(inputAudioMaxDuration) + " s"
+    ui_article = "Read the [documentation here](https://huggingface.co/spaces/aadnk/whisper-webui/blob/main/docs/options.md)"
+    demo = gr.Interface(fn=ui.transcribe_file, description=ui_description, article=ui_article, inputs=[
         gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value="medium", label="Model"),
         gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
         gr.Text(label="URL (YouTube, etc.)"),
     demo.launch(share=share, server_name=server_name)
 if __name__ == '__main__':
+    create_ui(DEFAULT_INPUT_AUDIO_MAX_DURATION)

src/download.py CHANGED Viewed

@@ -13,16 +13,16 @@ class FilenameCollectorPP(PostProcessor):
         self.filenames.append(information["filepath"])
         return [], information
-def downloadUrl(url: str, maxDuration: int = None):
     try:
-        return _performDownload(url, maxDuration=maxDuration)
     except yt_dlp.utils.DownloadError as e:
         # In case of an OS error, try again with a different output template
         if e.msg and e.msg.find("[Errno 36] File name too long") >= 0:
-            return _performDownload(url, maxDuration=maxDuration, outputTemplate="%(title).10s %(id)s.%(ext)s")
         pass
-def _performDownload(url: str, maxDuration: int = None, outputTemplate: str = None):
     destinationDirectory = mkdtemp()
     ydl_opts = {

         self.filenames.append(information["filepath"])
         return [], information
+def download_url(url: str, maxDuration: int = None):
     try:
+        return _perform_download(url, maxDuration=maxDuration)
     except yt_dlp.utils.DownloadError as e:
         # In case of an OS error, try again with a different output template
         if e.msg and e.msg.find("[Errno 36] File name too long") >= 0:
+            return _perform_download(url, maxDuration=maxDuration, outputTemplate="%(title).10s %(id)s.%(ext)s")
         pass
+def _perform_download(url: str, maxDuration: int = None, outputTemplate: str = None):
     destinationDirectory = mkdtemp()
     ydl_opts = {