Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,23 +1,45 @@
|
|
1 |
import gradio as gr
|
2 |
import torch
|
3 |
-
import torchaudio
|
4 |
-
from
|
5 |
import os
|
6 |
import tempfile
|
|
|
7 |
|
8 |
-
#
|
9 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
try:
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
except Exception as e:
|
14 |
-
print(f"Error
|
15 |
-
|
16 |
-
|
17 |
|
18 |
def clone_voice(reference_audio, text_to_speak, language="en"):
|
19 |
-
if
|
20 |
-
return None, "Error:
|
21 |
|
22 |
if reference_audio is None:
|
23 |
return None, "Error: Please upload a reference audio file."
|
@@ -25,43 +47,38 @@ def clone_voice(reference_audio, text_to_speak, language="en"):
|
|
25 |
return None, "Error: Please enter text to speak."
|
26 |
|
27 |
try:
|
28 |
-
#
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
-
#
|
32 |
-
|
33 |
-
output_audio_path = tmp_output_file.name
|
34 |
|
35 |
-
#
|
36 |
-
#
|
37 |
-
#
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
|
42 |
-
#
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
# 'voice_speed' aur 'device' optional parameters hain
|
47 |
-
cli.generate(
|
48 |
-
text=text_to_speak,
|
49 |
-
speaker=reference_audio_path, # Reference audio ka path
|
50 |
-
output_path=output_audio_path,
|
51 |
-
language=language # Zuban specify karna (default 'en' hai)
|
52 |
-
)
|
53 |
|
54 |
print(f"Generated audio saved at: {output_audio_path}")
|
55 |
-
return output_audio_path,
|
56 |
|
57 |
except Exception as e:
|
58 |
print(f"Error during voice cloning: {e}")
|
59 |
-
return None, f"Error: {str(e)}"
|
60 |
-
finally:
|
61 |
-
# Temporary reference audio file ko delete karna agar woh Gradio ne temp banayi ho
|
62 |
-
# Agar Gradio khud manage kar raha hai to iski zaroorat nahi
|
63 |
-
pass
|
64 |
-
|
65 |
|
66 |
# Gradio Interface Banana
|
67 |
with gr.Blocks() as demo:
|
@@ -75,7 +92,8 @@ with gr.Blocks() as demo:
|
|
75 |
with gr.Column():
|
76 |
reference_audio_input = gr.Audio(type="filepath", label="Reference Voice (WAV/MP3)")
|
77 |
text_input = gr.Textbox(label="Text to Synthesize", placeholder="Enter text here...")
|
78 |
-
|
|
|
79 |
clone_button = gr.Button("Clone Voice ✨")
|
80 |
with gr.Column():
|
81 |
output_audio = gr.Audio(label="Cloned Voice Output", type="filepath")
|
@@ -93,12 +111,13 @@ with gr.Blocks() as demo:
|
|
93 |
- Behtareen results ke liye, reference audio saaf (clear) aur kam se kam 5 second ki honi chahiye.
|
94 |
- Background noise kam se kam ho.
|
95 |
- Yeh model research purposes ke liye hai. Iska ghalat istemal na karein.
|
|
|
96 |
"""
|
97 |
)
|
98 |
|
99 |
if __name__ == "__main__":
|
100 |
-
if
|
101 |
-
print("
|
102 |
else:
|
103 |
print("Launching Gradio app...")
|
104 |
demo.launch()
|
|
|
import gradio as gr
import torch
import torchaudio  # NOTE(review): possibly unused if transformers handles audio I/O itself — confirm
import os
import tempfile
import soundfile as sf  # used to write the generated waveform to a WAV file

# --- Load the TTS model and processor at startup ---
# NOTE(review): the original code imported `AutoModelForTextToSpeech`, which is
# not a class transformers exports; Bark-style TTS checkpoints load through
# `AutoModelForTextToWaveform`. The bad import crashed the module before the
# try/except below could catch anything — the likely cause of the Space's
# "Runtime error". The import now lives inside the try block so any failure
# falls through to the graceful `processor, model = None, None` path.
#
# "suno/bark-small" is a placeholder: replace it with the real OpenVoiceV2
# model ID once one is published on the Hugging Face Hub. If OpenVoiceV2 is
# not exposed through the transformers Auto* classes, its own library has to
# be used instead, and a dedicated speaker encoder would also be loaded here.
try:
    from transformers import AutoProcessor, AutoModelForTextToWaveform

    model_id = "suno/bark-small"  # <--- replace with the real OpenVoiceV2 model ID
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForTextToWaveform.from_pretrained(model_id)
    print(f"Model '{model_id}' loaded successfully using Transformers.")
except Exception as e:  # broad on purpose: the app degrades gracefully instead of crashing
    print(f"Error loading model using Transformers: {e}")
    print("Please ensure the model is available on Hugging Face Hub for AutoProcessor/AutoModelForTextToWaveform, or check the correct model ID.")
    processor, model = None, None
|
39 |
|
40 |
def clone_voice(reference_audio, text_to_speak, language="en"):
    """Generate speech for `text_to_speak`, intended to mimic `reference_audio`.

    Parameters:
        reference_audio: filesystem path to the uploaded reference clip
            (Gradio `gr.Audio(type="filepath")`).
        text_to_speak: the text to synthesize.
        language: language hint (default "en"); currently unused by the
            placeholder model.

    Returns:
        (audio_path, status_message) — `audio_path` is None on any error.

    NOTE(review): with the current placeholder model (Bark) the reference
    audio is validated but never consumed — the OpenVoiceV2 speaker-embedding
    extraction still has to be wired in (e.g. torchaudio.load() the reference
    clip and run it through OpenVoiceV2's speaker encoder).
    """
    # Fail fast if startup model loading failed (see module-level try/except).
    if processor is None or model is None:
        return None, "Error: Model could not be loaded at startup. Check logs."

    if reference_audio is None:
        return None, "Error: Please upload a reference audio file."

    # NOTE(review): this guard was garbled in the source; reconstructed to
    # reject both None and whitespace-only text — confirm against the caller.
    if not text_to_speak or not text_to_speak.strip():
        return None, "Error: Please enter text to speak."

    try:
        # Tokenize/prepare the input text for the model.
        inputs = processor(text=text_to_speak, return_tensors="pt")

        # Generate the waveform. Inference only, so skip autograd bookkeeping.
        # When the real OpenVoiceV2 model is plugged in, the speaker embedding
        # extracted from `reference_audio` would be passed to generate() here.
        with torch.no_grad():
            generated_audio_array = model.generate(
                **inputs, do_sample=True, generation_config=model.generation_config
            )

        # Persist the waveform to a temporary WAV file Gradio can serve.
        # delete=False: the file must outlive this function for Gradio to read it.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_output_file:
            output_audio_path = tmp_output_file.name
        sf.write(
            output_audio_path,
            generated_audio_array[0].cpu().numpy(),
            samplerate=model.generation_config.sample_rate,
        )

        print(f"Generated audio saved at: {output_audio_path}")
        return output_audio_path, "Voice generated successfully!"

    except Exception as e:
        print(f"Error during voice cloning: {e}")
        return None, f"Error: {str(e)}. Make sure OpenVoiceV2 model ID is correct and it supports voice cloning with provided inputs."
|
|
|
|
|
|
|
|
|
|
82 |
|
83 |
# Gradio Interface Banana
|
84 |
with gr.Blocks() as demo:
|
|
|
92 |
with gr.Column():
|
93 |
reference_audio_input = gr.Audio(type="filepath", label="Reference Voice (WAV/MP3)")
|
94 |
text_input = gr.Textbox(label="Text to Synthesize", placeholder="Enter text here...")
|
95 |
+
# Language dropdown bhi rehne dein agar OpenVoiceV2 support karta ho
|
96 |
+
language_input = gr.Dropdown(label="Language (for text accent, optional)", choices=["en", "es", "fr", "zh", "jp", "kr"], value="en")
|
97 |
clone_button = gr.Button("Clone Voice ✨")
|
98 |
with gr.Column():
|
99 |
output_audio = gr.Audio(label="Cloned Voice Output", type="filepath")
|
|
|
111 |
- Behtareen results ke liye, reference audio saaf (clear) aur kam se kam 5 second ki honi chahiye.
|
112 |
- Background noise kam se kam ho.
|
113 |
- Yeh model research purposes ke liye hai. Iska ghalat istemal na karein.
|
114 |
+
**IMPORTANT:** Is app ko OpenVoiceV2 ke sahi Hugging Face model ID se update karna hoga.
|
115 |
"""
|
116 |
)
|
117 |
|
118 |
if __name__ == "__main__":
    # Launch the UI either way; if the model failed to load at startup the
    # app still comes up so the error message is visible to the user.
    model_ready = processor is not None and model is not None
    if not model_ready:
        print("Model failed to load. Gradio app might not function correctly.")
    else:
        print("Launching Gradio app...")
    demo.launch()