mmdbes committed on
Commit
9960131
·
verified ·
1 Parent(s): 224287f

Rename whisper.py to app.py

Browse files
Files changed (2) hide show
  1. app.py +79 -0
  2. whisper.py +0 -67
app.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+
3
+ import torch
4
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
5
+ import gradio as gr
6
+ import os
7
+
8
+ # --- 1. Model Configuration and Loading ---
9
+ # This part runs only once when the app starts.
10
+
11
+ print("--- Setting up for CPU ---")
12
+ device = "cpu"
13
+ torch_dtype = torch.float32 # Use float32 for CPU
14
+
15
+ model_id = "vhdm/whisper-large-fa-v1"
16
+
17
+ print("--- Loading model and processor ---")
18
+ # Load the model and processor
19
+ model = AutoModelForSpeechSeq2Seq.from_pretrained(
20
+ model_id,
21
+ torch_dtype=torch_dtype,
22
+ low_cpu_mem_usage=True,
23
+ use_safetensors=True # Safetensors is generally preferred
24
+ )
25
+ processor = AutoProcessor.from_pretrained(model_id)
26
+
27
+ # Create the pipeline
28
+ print("--- Creating transcription pipeline ---")
29
+ pipe = pipeline(
30
+ "automatic-speech-recognition",
31
+ model=model,
32
+ tokenizer=processor.tokenizer,
33
+ feature_extractor=processor.feature_extractor,
34
+ max_new_tokens=128,
35
+ torch_dtype=torch_dtype,
36
+ device=device,
37
+ )
38
+
39
+ print("--- Setup complete. Gradio app is ready. ---")
40
+
41
+ # --- 2. The Transcription Function ---
42
+ # This function is called every time a user uploads a file.
43
+
44
+ def transcribe_audio(audio_filepath):
45
+ """
46
+ Takes an audio file path, transcribes it, and returns the text.
47
+ """
48
+ if audio_filepath is None:
49
+ return "Please upload an audio file first."
50
+
51
+ print(f"Processing file: {audio_filepath}")
52
+ result = pipe(audio_filepath, return_timestamps=True)
53
+ transcription = result["text"]
54
+ print(f"Transcription result: {transcription}")
55
+
56
+ return transcription
57
+
58
+ # --- 3. Gradio Web Interface ---
59
+
60
+ # Define the title and description for the web app
61
+ title = "Whisper Persian ASR 🇮🇷"
62
+ description = """
63
+ This is a demo for the `vhdm/whisper-large-fa-v1` model for automatic speech recognition (ASR) in Persian.
64
+ <br>
65
+ Upload your audio file (MP3, WAV, etc.) or record directly from your microphone and click 'Submit' to see the transcription.
66
+ """
67
+
68
+ # Create the Gradio interface
69
+ iface = gr.Interface(
70
+ fn=transcribe_audio,
71
+ inputs=gr.Audio(type="filepath", label="Upload or Record Persian Audio"),
72
+ outputs=gr.Textbox(label="Transcription Result"),
73
+ title=title,
74
+ description=description,
75
+ examples=[["example.wav"]] # Optional: add an example file
76
+ )
77
+
78
+ # Launch the app
79
+ iface.launch()
whisper.py DELETED
@@ -1,67 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- """whisper.ipynb
3
-
4
- Automatically generated by Colab.
5
-
6
- Original file is located at
7
- https://colab.research.google.com/drive/1zMkidOS8-BJnLbouvesA3v7WYcChikTY
8
- """
9
-
10
- # Cell 1: Installations (run this once, then restart runtime)
11
- !pip install --upgrade --force-reinstall transformers accelerate datasets torchcodec ffmpeg-python torch
12
-
13
- # Cell 2: Main Script (Corrected)
14
- import torch
15
- from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
16
- import os
17
-
18
- print("✅ Libraries loaded successfully!")
19
-
20
- # Set up the device
21
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
22
- torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
23
- print(f"Using device: {device}")
24
-
25
- # Model ID
26
- model_id = "vhdm/whisper-large-fa-v1"
27
-
28
- # Load the model
29
- print("Loading model...")
30
- # Corrected the typo in the class name here ✅
31
- model = AutoModelForSpeechSeq2Seq.from_pretrained(
32
- model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
33
- )
34
- model.to(device)
35
-
36
- # Load the processor
37
- print("Loading processor...")
38
- processor = AutoProcessor.from_pretrained(model_id)
39
-
40
- # Create the pipeline for long-form audio
41
- pipe = pipeline(
42
- "automatic-speech-recognition",
43
- model=model,
44
- tokenizer=processor.tokenizer,
45
- feature_extractor=processor.feature_extractor,
46
- max_new_tokens=128,
47
- return_timestamps=True,
48
- torch_dtype=torch_dtype,
49
- device=device,
50
- )
51
-
52
- # --- Put your long audio file's name here ---
53
- file_path = "long.mp3"
54
- # ------------------------------------------
55
-
56
- # Check if the file exists
57
- if not os.path.exists(file_path):
58
- print(f"❌ Error: File '{file_path}' not found. Please upload your file and check the name.")
59
- else:
60
- # Process your long audio file
61
- print(f"Processing long audio file: {file_path} ... (This might take a while)")
62
- result = pipe(file_path)
63
-
64
- # Print the final result
65
- print("\n--- Transcription Result ---")
66
- print(result["text"])
67
-