Spaces:

mmdbes
/

Farsi_Voice_To_Text

Sleeping

App Files Files Community

mmdbes committed on Aug 21

Commit

9960131

verified ·

1 Parent(s): 224287f

Rename whisper.py to app.py

Browse files

Files changed (2) hide show

app.py +79 -0
whisper.py +0 -67

app.py ADDED Viewed

	@@ -0,0 +1,79 @@

+# app.py
+import torch
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
+import gradio as gr
+import os
+# --- 1. Model Configuration and Loading ---
+# This part runs only once when the app starts.
+# Everything in this section executes at module import time, so the model is
+# loaded before the Gradio interface further down is constructed.
+print("--- Setting up for CPU ---")
+# Inference is pinned to the CPU; no GPU detection is attempted in this file.
+device = "cpu"
+torch_dtype = torch.float32  # Use float32 for CPU
+# Identifier of the pretrained checkpoint handed to `from_pretrained` below.
+model_id = "vhdm/whisper-large-fa-v1"
+print("--- Loading model and processor ---")
+# Load the model and processor
+model = AutoModelForSpeechSeq2Seq.from_pretrained(
+    model_id,
+    torch_dtype=torch_dtype,
+    low_cpu_mem_usage=True,
+    use_safetensors=True  # Safetensors is generally preferred
+)
+# The processor supplies both the tokenizer and the feature extractor that the
+# pipeline is given below.
+processor = AutoProcessor.from_pretrained(model_id)
+# Create the pipeline
+print("--- Creating transcription pipeline ---")
+# `pipe` is the single callable used by `transcribe_audio`; it bundles the
+# model with the processor's tokenizer and feature extractor.
+# NOTE(review): `max_new_tokens=128` limits generation length -- confirm
+# whether this cap applies per segment for long recordings.
+pipe = pipeline(
+    "automatic-speech-recognition",
+    model=model,
+    tokenizer=processor.tokenizer,
+    feature_extractor=processor.feature_extractor,
+    max_new_tokens=128,
+    torch_dtype=torch_dtype,
+    device=device,
+)
+print("--- Setup complete. Gradio app is ready. ---")
+# --- 2. The Transcription Function ---
+# This function is called every time a user uploads a file.
+def transcribe_audio(audio_filepath):
+    """
+    Takes an audio file path, transcribes it, and returns the text.
+    """
+    if audio_filepath is None:
+        return "Please upload an audio file first."
+    print(f"Processing file: {audio_filepath}")
+    result = pipe(audio_filepath, return_timestamps=True)
+    transcription = result["text"]
+    print(f"Transcription result: {transcription}")
+    return transcription
+# --- 3. Gradio Web Interface ---
+# Define the title and description for the web app
+title = "Whisper Persian ASR 🇮🇷"
+description = """
+This is a demo for the `vhdm/whisper-large-fa-v1` model for automatic speech recognition (ASR) in Persian.
+<br>
+Upload your audio file (MP3, WAV, etc.) or record directly from your microphone and click 'Submit' to see the transcription.
+"""
+# Create the Gradio interface
+iface = gr.Interface(
+    fn=transcribe_audio,
+    inputs=gr.Audio(type="filepath", label="Upload or Record Persian Audio"),
+    outputs=gr.Textbox(label="Transcription Result"),
+    title=title,
+    description=description,
+    examples=[["example.wav"]] # Optional: add an example file
+)
+# Launch the app
+iface.launch()

whisper.py DELETED Viewed

@@ -1,67 +0,0 @@
-# -*- coding: utf-8 -*-
-"""whisper.ipynb
-Automatically generated by Colab.
-Original file is located at
-    https://colab.research.google.com/drive/1zMkidOS8-BJnLbouvesA3v7WYcChikTY
-"""
-# Cell 1: Installations (run this once, then restart runtime)
-# NOTE: the `!pip` line below is notebook shell syntax, not Python; this file
-# cannot be executed as a plain script while it is present.
-!pip install --upgrade --force-reinstall transformers accelerate datasets torchcodec ffmpeg-python torch
-# Cell 2: Main Script (Corrected)
-import torch
-from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
-import os
-print("✅ Libraries loaded successfully!")
-# Set up the device
-# Uses the first CUDA device with float16 when CUDA is available, otherwise
-# falls back to the CPU with float32.
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
-print(f"Using device: {device}")
-# Model ID
-model_id = "vhdm/whisper-large-fa-v1"
-# Load the model
-print("Loading model...")
-# Corrected the typo in the class name here ✅
-model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
-)
-# Move the loaded weights onto the device selected above.
-model.to(device)
-# Load the processor
-print("Loading processor...")
-processor = AutoProcessor.from_pretrained(model_id)
-# Create the pipeline for long-form audio
-# `return_timestamps=True` is fixed here at construction time, so the later
-# `pipe(file_path)` call does not pass it again.
-pipe = pipeline(
-    "automatic-speech-recognition",
-    model=model,
-    tokenizer=processor.tokenizer,
-    feature_extractor=processor.feature_extractor,
-    max_new_tokens=128,
-    return_timestamps=True,
-    torch_dtype=torch_dtype,
-    device=device,
-)
-# --- Put your long audio file's name here ---
-file_path = "long.mp3"
-# ------------------------------------------
-# Check if the file exists
-# A missing file is reported with a message rather than raising.
-if not os.path.exists(file_path):
-    print(f"❌ Error: File '{file_path}' not found. Please upload your file and check the name.")
-else:
-    # Process your long audio file
-    print(f"Processing long audio file: {file_path} ... (This might take a while)")
-    result = pipe(file_path)
-    # Print the final result
-    print("\n--- Transcription Result ---")
-    print(result["text"])