Spaces:

Ketengan-Diffusion
/

whisper-webui3

Paused

App Files Files Community

aadnk committed on Sep 23, 2022

Commit

71950a8

1 Parent(s): 93c4867

Make it easier to run with no audio file restrictions

Browse files

Files changed (3) hide show

README.md +12 -0
app-full.py +3 -0
app.py +44 -33

README.md CHANGED Viewed

@@ -11,3 +11,15 @@ license: apache-2.0
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# Running Locally
+To run this program locally, first install Python 3.9 and Git. Then install PyTorch 1.10.1 and all the dependencies:
+```
+pip install -r requirements.txt
+```
+Finally, run the "full" version of the app:
+```
+python app-full.py
+```

app-full.py ADDED Viewed

	@@ -0,0 +1,3 @@

+# Run the app with no audio file restrictions
+from app import createUi
+createUi(-1)

app.py CHANGED Viewed

@@ -10,7 +10,7 @@ import ffmpeg
 #os.system("pip install git+https://github.com/openai/whisper.git")
 # Limitations (set to -1 to disable)
-INPUT_AUDIO_MAX_DURATION = 120 # seconds
 LANGUAGES = [
  "English", "Chinese", "German", "Spanish", "Russian", "Korean",
@@ -34,46 +34,57 @@ LANGUAGES = [
 model_cache = dict()
-def transcribeFile(modelName, languageName, uploadFile, microphoneData, task):
-    source = uploadFile if uploadFile is not None else microphoneData
-    selectedLanguage = languageName.lower() if len(languageName) > 0 else None
-    selectedModel = modelName if modelName is not None else "base"
-    if INPUT_AUDIO_MAX_DURATION > 0:
-        # Calculate audio length
-        audioDuration = ffmpeg.probe(source)["format"]["duration"]
-        if float(audioDuration) > INPUT_AUDIO_MAX_DURATION:
-            return ("[ERROR]: Maximum audio file length is " + str(INPUT_AUDIO_MAX_DURATION) + "s, file was " + str(audioDuration) + "s"), "[ERROR]"
-    model = model_cache.get(selectedModel, None)
-    if not model:
-        model = whisper.load_model(selectedModel)
-        model_cache[selectedModel] = model
-    result = model.transcribe(source, language=selectedLanguage, task=task)
-    segmentStream = StringIO()
-    write_vtt(result["segments"], file=segmentStream)
-    segmentStream.seek(0)
-    return result["text"], segmentStream.read()
-ui_description = "Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse "
-ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
-ui_description += " as well as speech translation and language identification. "
-if INPUT_AUDIO_MAX_DURATION > 0:
-    ui_description += "\n\n" + "Max audio file length: " + str(INPUT_AUDIO_MAX_DURATION) + " s"
-demo = gr.Interface(fn=transcribeFile, description=ui_description, inputs=[
-    gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value="medium", label="Model"),
-    gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
-    gr.Audio(source="upload", type="filepath", label="Upload Audio"),
-    gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
-    gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
-], outputs=[gr.Text(label="Transcription"), gr.Text(label="Segments")])
-demo.launch()

 #os.system("pip install git+https://github.com/openai/whisper.git")
 # Limitations (set to -1 to disable)
+DEFAULT_INPUT_AUDIO_MAX_DURATION = 120 # seconds
 LANGUAGES = [
  "English", "Chinese", "German", "Spanish", "Russian", "Korean",
 model_cache = dict()
+class UI:
+    def __init__(self, inputAudioMaxDuration):
+        self.inputAudioMaxDuration = inputAudioMaxDuration
+    def transcribeFile(self, modelName, languageName, uploadFile, microphoneData, task):
+        source = uploadFile if uploadFile is not None else microphoneData
+        selectedLanguage = languageName.lower() if len(languageName) > 0 else None
+        selectedModel = modelName if modelName is not None else "base"
+        if self.inputAudioMaxDuration > 0:
+            # Calculate audio length
+            audioDuration = ffmpeg.probe(source)["format"]["duration"]
+            if float(audioDuration) > self.inputAudioMaxDuration:
+                return ("[ERROR]: Maximum audio file length is " + str(self.inputAudioMaxDuration) + "s, file was " + str(audioDuration) + "s"), "[ERROR]"
+        model = model_cache.get(selectedModel, None)
+        if not model:
+            model = whisper.load_model(selectedModel)
+            model_cache[selectedModel] = model
+        result = model.transcribe(source, language=selectedLanguage, task=task)
+        segmentStream = StringIO()
+        write_vtt(result["segments"], file=segmentStream)
+        segmentStream.seek(0)
+        return result["text"], segmentStream.read()
+def createUi(inputAudioMaxDuration):
+    ui = UI(inputAudioMaxDuration)
+    ui_description = "Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse "
+    ui_description += " audio and is also a multi-task model that can perform multilingual speech recognition "
+    ui_description += " as well as speech translation and language identification. "
+    if inputAudioMaxDuration > 0:
+        ui_description += "\n\n" + "Max audio file length: " + str(inputAudioMaxDuration) + " s"
+    demo = gr.Interface(fn=ui.transcribeFile, description=ui_description, inputs=[
+        gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"], value="medium", label="Model"),
+        gr.Dropdown(choices=sorted(LANGUAGES), label="Language"),
+        gr.Audio(source="upload", type="filepath", label="Upload Audio"),
+        gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
+        gr.Dropdown(choices=["transcribe", "translate"], label="Task"),
+    ], outputs=[gr.Text(label="Transcription"), gr.Text(label="Segments")])
+    demo.launch()
+if __name__ == '__main__':
+    createUi(DEFAULT_INPUT_AUDIO_MAX_DURATION)