import gradio as gr
import torch
import torchaudio  # May not be needed if transformers handles audio loading itself.
from transformers import AutoProcessor, AutoModelForTextToWaveform  # This is the key new import
import os
import tempfile
import soundfile as sf  # For saving the generated audio

# --- Load the OpenVoiceV2 Model and Processor ---
# You must fill in the actual OpenVoiceV2 model ID that is available on the Hugging Face Hub.
# EXAMPLE: "myshell-ai/OpenVoiceV2-base" (an illustrative ID; the real one must be confirmed)
# Search the Hugging Face Hub for 'OpenVoiceV2' to find the correct model ID.
# If the official OpenVoiceV2 model is not yet available through Transformers,
# this approach will not work, and another solution will be needed.
# NOTE: I am using a placeholder model (Bark) here;
# you must replace it with the actual OpenVoiceV2 model ID.
# If OpenVoiceV2 cannot be loaded directly via AutoModelForTextToWaveform,
# we will have to fall back to its own library, but try this first.
try:
    # Replace the model ID with the actual OpenVoiceV2 ID,
    # if OpenVoiceV2 is published on the Hugging Face Hub as a 'text-to-speech' model.
    model_id = "suno/bark-small"  # <--- Put the actual OpenVoiceV2 model ID here!
    # For example "myshell-ai/open-voice-v2" or similar.
    # If this Bark model works, find the correct
    # OpenVoiceV2 model ID and use it here instead.
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForTextToWaveform.from_pretrained(model_id)
    print(f"Model '{model_id}' loaded successfully using Transformers.")
    # If OpenVoiceV2 requires a separate speaker encoder, it would also need
    # to be loaded here. For example:
    # speaker_encoder = AutoModel.from_pretrained("path/to/speaker-encoder")
    # This depends on the OpenVoiceV2 documentation.
except Exception as e:
    print(f"Error loading model using Transformers: {e}")
    print("Please ensure OpenVoiceV2 is available on Hugging Face Hub for AutoProcessor/AutoModelForTextToWaveform, or check the correct model ID.")
    processor, model = None, None


def clone_voice(reference_audio, text_to_speak, language="en"):
    if processor is None or model is None:
        return None, "Error: Model could not be loaded at startup. Check logs."
    if reference_audio is None:
        return None, "Error: Please upload a reference audio file."
    if not text_to_speak:
        return None, "Error: Please enter text to speak."
    try:
        # --- Extract a Speaker Embedding from the Reference Audio (OpenVoiceV2 specific) ---
        # This part is OpenVoiceV2's distinctive functionality.
        # You will need to consult the OpenVoiceV2 documentation to find out how
        # a speaker embedding is extracted from reference audio.
        # Bark (the current placeholder) does not extract speaker embeddings this way.
        #
        # For example (conceptual code; adapt to the actual OpenVoiceV2 API):
        # ref_audio_waveform, ref_sample_rate = torchaudio.load(reference_audio)
        # speaker_embedding = some_openvoice_speaker_encoder(ref_audio_waveform, ref_sample_rate)
        # For now, I am using placeholder logic for Bark below.
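        #
        # A sketch of what this step might look like using the official OpenVoice
        # repository (https://github.com/myshell-ai/OpenVoice) instead of Transformers.
        # The names below (se_extractor, ToneColorConverter, the checkpoint paths) come
        # from that repo's demo code and are an untested assumption in this app:
        #
        # from openvoice import se_extractor
        # from openvoice.api import ToneColorConverter
        # converter = ToneColorConverter("checkpoints_v2/converter/config.json", device="cpu")
        # converter.load_ckpt("checkpoints_v2/converter/checkpoint.pth")
        # target_se, _ = se_extractor.get_se(reference_audio, converter, vad=True)
        # # target_se would then act as the speaker (tone-color) embedding discussed above.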
        # Process the text
        inputs = processor(text=text_to_speak, return_tensors="pt")

        # Generate the voice.
        # If OpenVoiceV2 requires a speaker embedding, it would be added to the generate call:
        # generated_audio_array = model.generate(**inputs, speaker_embedding=speaker_embedding)
        # For the Bark placeholder, for now:
        generated_audio_array = model.generate(**inputs, do_sample=True)

        # Save the output audio to a temporary file
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_output_file:
            output_audio_path = tmp_output_file.name
        sf.write(output_audio_path, generated_audio_array[0].cpu().numpy(),
                 samplerate=model.generation_config.sample_rate)
        print(f"Generated audio saved at: {output_audio_path}")
        return output_audio_path, "Voice generated successfully!"
    except Exception as e:
        print(f"Error during voice cloning: {e}")
        return None, f"Error: {str(e)}. Make sure the OpenVoiceV2 model ID is correct and that it supports voice cloning with the provided inputs."


# Build the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # OpenVoice V2 Voice Cloning Space 🎤
        Upload an example voice audio file and enter some text.
        This tool will convert that text into speech in your uploaded voice.
        """
    )
    with gr.Row():
        with gr.Column():
            reference_audio_input = gr.Audio(type="filepath", label="Reference Voice (WAV/MP3)")
            text_input = gr.Textbox(label="Text to Synthesize", placeholder="Enter text here...")
            # Keep the language dropdown in case OpenVoiceV2 supports it
            language_input = gr.Dropdown(
                label="Language (for text accent, optional)",
                choices=["en", "es", "fr", "zh", "jp", "kr"],
                value="en",
            )
            clone_button = gr.Button("Clone Voice ✨")
        with gr.Column():
            output_audio = gr.Audio(label="Cloned Voice Output", type="filepath")
            error_message = gr.Textbox(label="Status/Error", interactive=False)

    clone_button.click(
        clone_voice,
        inputs=[reference_audio_input, text_input, language_input],
        outputs=[output_audio, error_message],
    )

    gr.Markdown(
        """
        **Note:**
        - For best results, the reference audio should be clear and at least 5 seconds long.
        - Keep background noise to a minimum.
        - This model is intended for research purposes. Do not misuse it.

        **IMPORTANT:** This app must be updated with the correct OpenVoiceV2 Hugging Face model ID.
        """
    )

if __name__ == "__main__":
    if processor is None or model is None:
        print("Model failed to load. Gradio app might not function correctly.")
    else:
        print("Launching Gradio app...")
    demo.launch()
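# A minimal requirements.txt sketch for this Space, based only on the imports above
# (an assumption; pin versions as your environment requires):
#
#   gradio
#   torch
#   torchaudio
#   transformers
#   soundfile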