whisper-v3-zero

Running on Zero

App Files Files Community

devilent2 committed on Apr 7

Commit

dde0a2b

•

1 Parent(s): 00f1499

Update app.py

Browse files

Files changed (1) hide show

app.py +93 -68

app.py CHANGED Viewed

@@ -10,6 +10,8 @@ from transformers.pipelines.audio_utils import ffmpeg_read
 DEFAULT_MODEL_NAME = "distil-whisper/distil-large-v3"
 BATCH_SIZE = 8
 device = 0 if torch.cuda.is_available() else "cpu"
 if device == "cpu":
     DEFAULT_MODEL_NAME = "openai/whisper-tiny"
@@ -23,42 +25,17 @@ def load_pipeline(model_name):
     )
 pipe = load_pipeline(DEFAULT_MODEL_NAME)
-@spaces.GPU
-def transcribe(inputs, task, model_name):
-    if inputs is None:
-        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
-    global pipe
-    if model_name != pipe.model.name_or_path:
-        pipe = load_pipeline(model_name)
-    start_time = time.time()  # Record the start time
-    # Load the audio file and calculate its duration
-    audio = mp.AudioFileClip(inputs)
-    audio_duration = audio.duration
-    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
-    end_time = time.time()  # Record the end time
-    transcription_time = end_time - start_time  # Calculate the transcription time
-    # Create the transcription time output with additional information
-    transcription_time_output = (
-        f"Transcription Time: {transcription_time:.2f} seconds\n"
-        f"Audio Duration: {audio_duration:.2f} seconds\n"
-        f"Model Used: {model_name}\n"
-        f"Device Used: {'GPU' if torch.cuda.is_available() else 'CPU'}"
-    )
-    return text, transcription_time_output
 from gpustat import GPUStatCollection
 def update_gpu_status():
     if torch.cuda.is_available() == False:
-        return "No Nviadia Device"
     try:
         gpu_stats = GPUStatCollection.new_query()
         for gpu in gpu_stats:
@@ -81,9 +58,10 @@ def torch_update_gpu_status():
         gpu_info = torch.cuda.get_device_name(0)
         gpu_memory = torch.cuda.mem_get_info(0)
         total_memory = gpu_memory[1] / (1024 * 1024)
         used_memory = (gpu_memory[1] - gpu_memory[0]) / (1024 * 1024)
-        gpu_status = f"GPU: {gpu_info}\nTotal Memory: {total_memory:.2f} MB\nUsed Memory: {used_memory:.2f} MB"
     else:
         gpu_status = "No GPU available"
     return gpu_status
@@ -102,70 +80,117 @@ def update_cpu_status():
 def update_status():
     gpu_status = update_gpu_status()
     cpu_status = update_cpu_status()
-    return gpu_status, cpu_status
 def refresh_status():
     return update_status()
-demo = gr.Blocks()
-mf_transcribe = gr.Interface(
-    fn=transcribe,
-    inputs=[
-        gr.Audio(type="filepath"),
-        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
-        gr.Textbox(
             label="Model Name",
             value=DEFAULT_MODEL_NAME,
             placeholder="Enter the model name",
-            info="Some available models: distil-whisper/distil-large-v3 distil-whisper/distil-medium.en Systran/faster-distil-whisper-large-v3 Systran/faster-whisper-large-v3 Systran/faster-whisper-medium openai/whisper-tiny, openai/whisper-base, openai/whisper-medium, openai/whisper-large-v3",
-        ),
-    ],
-    outputs=[gr.TextArea(label="Transcription"), gr.TextArea(label="Transcription Info")],
-    theme="huggingface",
-    title="Whisper Transcription",
-    description=(
-        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the specified OpenAI Whisper"
-        " checkpoint and 🤗 Transformers to transcribe audio files of arbitrary length."
-    ),
-    allow_flagging="never",
-)
-file_transcribe = gr.Interface(
-    fn=transcribe,
     inputs=[
-        gr.Audio(type="filepath", label="Audio file"),
-        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
-        gr.Textbox(
-            label="Model Name",
-            value=DEFAULT_MODEL_NAME,
-            placeholder="Enter the model name",
-            info="Some available models: openai/whisper-tiny, openai/whisper-base, openai/whisper-medium, openai/whisper-large-v2",
-        ),
     ],
-    outputs=[gr.TextArea(label="Transcription"), gr.TextArea(label="Transcription Info")],
     theme="huggingface",
     title="Whisper Transcription",
     description=(
-        "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the specified OpenAI Whisper"
-        " checkpoint and 🤗 Transformers to transcribe audio files of arbitrary length."
     ),
     allow_flagging="never",
 )
 with demo:
-    gr.TabbedInterface([mf_transcribe, file_transcribe], ["Microphone", "Audio file"])
     with gr.Row():
         refresh_button = gr.Button("Refresh Status")  # Create a refresh button
-    gpu_status_output = gr.Textbox(label="GPU Status", interactive=False)
-    cpu_status_output = gr.Textbox(label="CPU Status", interactive=False)
     # Link the refresh button to the refresh_status function
-    refresh_button.click(refresh_status, None, [gpu_status_output, cpu_status_output])
     # Load the initial status using update_status function
-    demo.load(update_status, inputs=None, outputs=[gpu_status_output, cpu_status_output], every=2, queue=False)
 # Launch the Gradio app
 demo.launch(share=True)

 DEFAULT_MODEL_NAME = "distil-whisper/distil-large-v3"
 BATCH_SIZE = 8
+print('start app')
 device = 0 if torch.cuda.is_available() else "cpu"
 if device == "cpu":
     DEFAULT_MODEL_NAME = "openai/whisper-tiny"
     )
 pipe = load_pipeline(DEFAULT_MODEL_NAME)
+openai_pipe=load_pipeline("openai/whisper-large-v3")
+default_pipe = load_pipeline(DEFAULT_MODEL_NAME)
+#pipe = None
 from gpustat import GPUStatCollection
 def update_gpu_status():
     if torch.cuda.is_available() == False:
+        return "No Nvidia Device"
     try:
         gpu_stats = GPUStatCollection.new_query()
         for gpu in gpu_stats:
         gpu_info = torch.cuda.get_device_name(0)
         gpu_memory = torch.cuda.mem_get_info(0)
         total_memory = gpu_memory[1] / (1024 * 1024)
+        free_memory=gpu_memory[0] /(1024 *1024)
         used_memory = (gpu_memory[1] - gpu_memory[0]) / (1024 * 1024)
+        gpu_status = f"GPU: {gpu_info} Free Memory:{free_memory}MB   Total Memory: {total_memory:.2f} MB  Used Memory: {used_memory:.2f} MB"
     else:
         gpu_status = "No GPU available"
     return gpu_status
 def update_status():
     gpu_status = update_gpu_status()
     cpu_status = update_cpu_status()
+    sys_status=gpu_status+"\n\n"+cpu_status
+    return sys_status
 def refresh_status():
     return update_status()
+@spaces.GPU
+def transcribe(audio_path, model_name):
+    print(str(time.time())+'  start transcribe ')
+    if audio_path is None:
+        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
+    audio_path=audio_path.strip()
+    model_name=model_name.strip()
+    global pipe
+    if model_name != pipe.model.name_or_path:
+        print("old model is:"+ pipe.model.name_or_path )
+        if model_name=="openai/whisper-large-v3":
+            pipe=openai_pipe
+            print(str(time.time())+" use openai model " + pipe.model.name_or_path)
+        elif model_name==DEFAULT_MODEL_NAME:
+            pipe=default_pipe
+            print(str(time.time())+" use default model " + pipe.model.name_or_path)
+        else:
+            print(str(time.time())+'  start load model ' + model_name)
+            pipe = load_pipeline(model_name)
+            print(str(time.time())+'  finished load model ' + model_name)
+    start_time = time.time()  # Record the start time
+    print(str(time.time())+'  start processing and set recording start time point')
+    # Load the audio file and calculate its duration
+    audio = mp.AudioFileClip(audio_path)
+    audio_duration = audio.duration
+    print(str(time.time())+'   start pipe ')
+    text = pipe(audio_path, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe"}, return_timestamps=True)["text"]
+    end_time = time.time()  # Record the end time
+    transcription_time = end_time - start_time  # Calculate the transcription time
+    # Create the transcription time output with additional information
+    transcription_time_output = (
+        f"Transcription Time: {transcription_time:.2f} seconds\n"
+        f"Audio Duration: {audio_duration:.2f} seconds\n"
+        f"Model Used: {model_name}\n"
+        f"Device Used: {'GPU' if torch.cuda.is_available() else 'CPU'}"
+    )
+    print(str(time.time())+'   return transcribe '+ text )
+    return text, transcription_time_output
+@spaces.GPU
+def handle_upload_audio(audio_path,model_name,old_transcription=''):
+    print('old_trans:' + old_transcription)
+    (text,transcription_time_output)=transcribe(audio_path,model_name)
+    return text+'\n\n'+old_transcription, transcription_time_output
+graudio=gr.Audio(type="filepath",show_download_button=True)
+grmodel_textbox=gr.Textbox(
             label="Model Name",
             value=DEFAULT_MODEL_NAME,
             placeholder="Enter the model name",
+            info="Some available models: distil-whisper/distil-large-v3   distil-whisper/distil-medium.en   Systran/faster-distil-whisper-large-v3    Systran/faster-whisper-large-v3    Systran/faster-whisper-medium    openai/whisper-tiny,   openai/whisper-base,   openai/whisper-medium,    openai/whisper-large-v3",
+        )
+groutputs=[gr.TextArea(label="Transcription",elem_id="transcription_textarea",interactive=True,lines=20,show_copy_button=True),
+           gr.TextArea(label="Transcription Info",interactive=True,show_copy_button=True)]
+mf_transcribe = gr.Interface(
+    fn=handle_upload_audio,
     inputs=[
+        graudio, #"numpy" or filepath
+        #gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
+        grmodel_textbox,
     ],
+    outputs=groutputs,
     theme="huggingface",
     title="Whisper Transcription",
     description=(
+        "Scroll to Bottom to show system status.  "
+        "Transcribe long-form microphone or audio file after uploaded audio! "
     ),
     allow_flagging="never",
 )
+demo = gr.Blocks()
 with demo:
+    gr.TabbedInterface([mf_transcribe, ], ["Audio",])
     with gr.Row():
         refresh_button = gr.Button("Refresh Status")  # Create a refresh button
+    sys_status_output = gr.Textbox(label="System Status", interactive=False)
     # Link the refresh button to the refresh_status function
+    refresh_button.click(refresh_status, None, [sys_status_output])
     # Load the initial status using update_status function
+    demo.load(update_status, inputs=None, outputs=[sys_status_output], every=2, queue=False)
+    graudio.stop_recording(handle_upload_audio,inputs=[graudio,grmodel_textbox,groutputs[0]],outputs=groutputs)
+    graudio.upload(handle_upload_audio,inputs=[graudio,grmodel_textbox,groutputs[0]],outputs=groutputs)
 # Launch the Gradio app
 demo.launch(share=True)
+print('launched\n\n')