Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -1,23 +1,45 @@
|
|
1 |
import gradio as gr
|
2 |
import torch
|
3 |
-
import torchaudio
|
4 |
-
from
|
5 |
import os
|
6 |
import tempfile
|
|
|
7 |
|
8 |
-
#
|
9 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
try:
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
except Exception as e:
|
14 |
-
print(f"Error
|
15 |
-
|
16 |
-
|
17 |
|
18 |
def clone_voice(reference_audio, text_to_speak, language="en"):
|
19 |
-
if
|
20 |
-
return None, "Error:
|
21 |
|
22 |
if reference_audio is None:
|
23 |
return None, "Error: Please upload a reference audio file."
|
@@ -25,43 +47,38 @@ def clone_voice(reference_audio, text_to_speak, language="en"):
|
|
25 |
return None, "Error: Please enter text to speak."
|
26 |
|
27 |
try:
|
28 |
-
#
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
|
31 |
-
#
|
32 |
-
|
33 |
-
output_audio_path = tmp_output_file.name
|
34 |
|
35 |
-
#
|
36 |
-
#
|
37 |
-
#
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
|
42 |
-
#
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
# 'voice_speed' aur 'device' optional parameters hain
|
47 |
-
cli.generate(
|
48 |
-
text=text_to_speak,
|
49 |
-
speaker=reference_audio_path, # Reference audio ka path
|
50 |
-
output_path=output_audio_path,
|
51 |
-
language=language # Zuban specify karna (default 'en' hai)
|
52 |
-
)
|
53 |
|
54 |
print(f"Generated audio saved at: {output_audio_path}")
|
55 |
-
return output_audio_path,
|
56 |
|
57 |
except Exception as e:
|
58 |
print(f"Error during voice cloning: {e}")
|
59 |
-
return None, f"Error: {str(e)}"
|
60 |
-
finally:
|
61 |
-
# Temporary reference audio file ko delete karna agar woh Gradio ne temp banayi ho
|
62 |
-
# Agar Gradio khud manage kar raha hai to iski zaroorat nahi
|
63 |
-
pass
|
64 |
-
|
65 |
|
66 |
# Gradio Interface Banana
|
67 |
with gr.Blocks() as demo:
|
@@ -75,7 +92,8 @@ with gr.Blocks() as demo:
|
|
75 |
with gr.Column():
|
76 |
reference_audio_input = gr.Audio(type="filepath", label="Reference Voice (WAV/MP3)")
|
77 |
text_input = gr.Textbox(label="Text to Synthesize", placeholder="Enter text here...")
|
78 |
-
|
|
|
79 |
clone_button = gr.Button("Clone Voice ✨")
|
80 |
with gr.Column():
|
81 |
output_audio = gr.Audio(label="Cloned Voice Output", type="filepath")
|
@@ -93,12 +111,13 @@ with gr.Blocks() as demo:
|
|
93 |
- Behtareen results ke liye, reference audio saaf (clear) aur kam se kam 5 second ki honi chahiye.
|
94 |
- Background noise kam se kam ho.
|
95 |
- Yeh model research purposes ke liye hai. Iska ghalat istemal na karein.
|
|
|
96 |
"""
|
97 |
)
|
98 |
|
99 |
if __name__ == "__main__":
|
100 |
-
if
|
101 |
-
print("
|
102 |
else:
|
103 |
print("Launching Gradio app...")
|
104 |
demo.launch()
|
|
|
import gradio as gr
import torch
import torchaudio  # NOTE(review): possibly unused if transformers handles audio I/O itself — confirm
import os
import tempfile
import soundfile as sf  # used to write the generated waveform to a WAV file

# --- Load the TTS model and processor at startup ---
# NOTE(review): the original code imported `AutoModelForTextToSpeech`, which is
# not a class transformers exports; Bark-style TTS checkpoints load through
# `AutoModelForTextToWaveform`. The bad import crashed the module before the
# try/except below could catch anything — the likely cause of the Space's
# "Runtime error". The import now lives inside the try block so any failure
# falls through to the graceful `processor, model = None, None` path.
#
# "suno/bark-small" is a placeholder: replace it with the real OpenVoiceV2
# model ID once one is published on the Hugging Face Hub. If OpenVoiceV2 is
# not exposed through the transformers Auto* classes, its own library has to
# be used instead, and a dedicated speaker encoder would also be loaded here.
try:
    from transformers import AutoProcessor, AutoModelForTextToWaveform

    model_id = "suno/bark-small"  # <--- replace with the real OpenVoiceV2 model ID
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForTextToWaveform.from_pretrained(model_id)
    print(f"Model '{model_id}' loaded successfully using Transformers.")
except Exception as e:  # broad on purpose: the app degrades gracefully instead of crashing
    print(f"Error loading model using Transformers: {e}")
    print("Please ensure the model is available on Hugging Face Hub for AutoProcessor/AutoModelForTextToWaveform, or check the correct model ID.")
    processor, model = None, None
|
39 |
|
40 |
def clone_voice(reference_audio, text_to_speak, language="en"):
    """Generate speech for `text_to_speak`, intended to mimic `reference_audio`.

    Parameters:
        reference_audio: filesystem path to the uploaded reference clip
            (Gradio `gr.Audio(type="filepath")`).
        text_to_speak: the text to synthesize.
        language: language hint (default "en"); currently unused by the
            placeholder model.

    Returns:
        (audio_path, status_message) — `audio_path` is None on any error.

    NOTE(review): with the current placeholder model (Bark) the reference
    audio is validated but never consumed — the OpenVoiceV2 speaker-embedding
    extraction still has to be wired in (e.g. torchaudio.load() the reference
    clip and run it through OpenVoiceV2's speaker encoder).
    """
    # Fail fast if startup model loading failed (see module-level try/except).
    if processor is None or model is None:
        return None, "Error: Model could not be loaded at startup. Check logs."

    if reference_audio is None:
        return None, "Error: Please upload a reference audio file."

    # NOTE(review): this guard was garbled in the source; reconstructed to
    # reject both None and whitespace-only text — confirm against the caller.
    if not text_to_speak or not text_to_speak.strip():
        return None, "Error: Please enter text to speak."

    try:
        # Tokenize/prepare the input text for the model.
        inputs = processor(text=text_to_speak, return_tensors="pt")

        # Generate the waveform. Inference only, so skip autograd bookkeeping.
        # When the real OpenVoiceV2 model is plugged in, the speaker embedding
        # extracted from `reference_audio` would be passed to generate() here.
        with torch.no_grad():
            generated_audio_array = model.generate(
                **inputs, do_sample=True, generation_config=model.generation_config
            )

        # Persist the waveform to a temporary WAV file Gradio can serve.
        # delete=False: the file must outlive this function for Gradio to read it.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_output_file:
            output_audio_path = tmp_output_file.name
        sf.write(
            output_audio_path,
            generated_audio_array[0].cpu().numpy(),
            samplerate=model.generation_config.sample_rate,
        )

        print(f"Generated audio saved at: {output_audio_path}")
        return output_audio_path, "Voice generated successfully!"

    except Exception as e:
        print(f"Error during voice cloning: {e}")
        return None, f"Error: {str(e)}. Make sure OpenVoiceV2 model ID is correct and it supports voice cloning with provided inputs."
|
|
|
|
|
|
|
|
|
|
82 |
|
83 |
# Gradio Interface Banana
|
84 |
with gr.Blocks() as demo:
|
|
|
92 |
with gr.Column():
|
93 |
reference_audio_input = gr.Audio(type="filepath", label="Reference Voice (WAV/MP3)")
|
94 |
text_input = gr.Textbox(label="Text to Synthesize", placeholder="Enter text here...")
|
95 |
+
# Language dropdown bhi rehne dein agar OpenVoiceV2 support karta ho
|
96 |
+
language_input = gr.Dropdown(label="Language (for text accent, optional)", choices=["en", "es", "fr", "zh", "jp", "kr"], value="en")
|
97 |
clone_button = gr.Button("Clone Voice ✨")
|
98 |
with gr.Column():
|
99 |
output_audio = gr.Audio(label="Cloned Voice Output", type="filepath")
|
|
|
111 |
- Behtareen results ke liye, reference audio saaf (clear) aur kam se kam 5 second ki honi chahiye.
|
112 |
- Background noise kam se kam ho.
|
113 |
- Yeh model research purposes ke liye hai. Iska ghalat istemal na karein.
|
114 |
+
**IMPORTANT:** Is app ko OpenVoiceV2 ke sahi Hugging Face model ID se update karna hoga.
|
115 |
"""
|
116 |
)
|
117 |
|
118 |
if __name__ == "__main__":
    # Launch the UI either way; if the model failed to load at startup the
    # app still comes up so the error message is visible to the user.
    model_ready = processor is not None and model is not None
    if not model_ready:
        print("Model failed to load. Gradio app might not function correctly.")
    else:
        print("Launching Gradio app...")
    demo.launch()