Rashidkareem committed on
Commit
036fcaf
·
verified ·
1 Parent(s): 77ae55e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -43
app.py CHANGED
@@ -1,23 +1,45 @@
1
  import gradio as gr
2
  import torch
3
- import torchaudio
4
- from openvoice_cli.cli import OpenVoiceCLI # Corrected import statement
5
  import os
6
  import tempfile
 
7
 
8
- # OpenVoiceCLI ko initialize karna
9
- # Yeh model ko download karega agar pehli baar run ho raha hai
 
 
 
 
 
 
 
 
 
10
  try:
11
- cli = OpenVoiceCLI()
12
- print("OpenVoiceCLI initialized successfully.")
 
 
 
 
 
 
 
 
 
 
 
 
13
  except Exception as e:
14
- print(f"Error initializing OpenVoiceCLI: {e}")
15
- # Agar initialization fail ho to fallback mechanism ya error message
16
- cli = None # cli ko None set kar dein takey neeche check kiya ja sake
17
 
18
  def clone_voice(reference_audio, text_to_speak, language="en"):
19
- if cli is None:
20
- return None, "Error: OpenVoice model could not be loaded."
21
 
22
  if reference_audio is None:
23
  return None, "Error: Please upload a reference audio file."
@@ -25,43 +47,38 @@ def clone_voice(reference_audio, text_to_speak, language="en"):
25
  return None, "Error: Please enter text to speak."
26
 
27
  try:
28
- # Gradio se anay wali audio file ka path hasil karna
29
- reference_audio_path = reference_audio
 
 
 
 
 
 
 
 
30
 
31
- # Temporary file banana output ke liye
32
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_output_file:
33
- output_audio_path = tmp_output_file.name
34
 
35
- # OpenVoiceCLI ka istemal karke voice clone karna
36
- # Note: OpenVoiceCLI seedha reference speaker aur text input leta hai.
37
- # Isko file paths ki zaroorat hoti hai.
38
- print(f"Using reference audio: {reference_audio_path}")
39
- print(f"Text to speak: {text_to_speak}")
40
- print(f"Output path: {output_audio_path}")
41
 
42
- # OpenVoiceCLI.generate() method ko call karna
43
- # Yahan 'speaker' argument mein reference audio ka path jayega
44
- # 'text' argument mein bolne wala text
45
- # 'output_path' mein jahan audio save karni hai
46
- # 'voice_speed' aur 'device' optional parameters hain
47
- cli.generate(
48
- text=text_to_speak,
49
- speaker=reference_audio_path, # Reference audio ka path
50
- output_path=output_audio_path,
51
- language=language # Zuban specify karna (default 'en' hai)
52
- )
53
 
54
  print(f"Generated audio saved at: {output_audio_path}")
55
- return output_audio_path, None # Audio file ka path return karna
56
 
57
  except Exception as e:
58
  print(f"Error during voice cloning: {e}")
59
- return None, f"Error: {str(e)}"
60
- finally:
61
- # Temporary reference audio file ko delete karna agar woh Gradio ne temp banayi ho
62
- # Agar Gradio khud manage kar raha hai to iski zaroorat nahi
63
- pass
64
-
65
 
66
  # Gradio Interface Banana
67
  with gr.Blocks() as demo:
@@ -75,7 +92,8 @@ with gr.Blocks() as demo:
75
  with gr.Column():
76
  reference_audio_input = gr.Audio(type="filepath", label="Reference Voice (WAV/MP3)")
77
  text_input = gr.Textbox(label="Text to Synthesize", placeholder="Enter text here...")
78
- language_input = gr.Dropdown(label="Language (for text accent, optional)", choices=["en", "es", "fr", "zh", "jp", "kr"], value="en") # Aap aur zubanein add kar sakte hain
 
79
  clone_button = gr.Button("Clone Voice ✨")
80
  with gr.Column():
81
  output_audio = gr.Audio(label="Cloned Voice Output", type="filepath")
@@ -93,12 +111,13 @@ with gr.Blocks() as demo:
93
  - Behtareen results ke liye, reference audio saaf (clear) aur kam se kam 5 second ki honi chahiye.
94
  - Background noise kam se kam ho.
95
  - Yeh model research purposes ke liye hai. Iska ghalat istemal na karein.
 
96
  """
97
  )
98
 
99
  if __name__ == "__main__":
100
- if cli is None:
101
- print("OpenVoiceCLI failed to initialize. Gradio app will not run correctly.")
102
  else:
103
  print("Launching Gradio app...")
104
  demo.launch()
 
import os
import tempfile

import gradio as gr
import soundfile as sf  # used to write the generated waveform to a .wav file
import torch
import torchaudio  # NOTE(review): not used in this block; kept since other code may rely on it
# BUG FIX: transformers has no `AutoModelForTextToSpeech` — importing it raises
# ImportError before the try/except below can catch anything. The correct auto
# class for models that generate raw audio (e.g. Bark) is
# `AutoModelForTextToWaveform`.
from transformers import AutoProcessor, AutoModelForTextToWaveform

# --- Load the TTS model and processor at startup ---
# "suno/bark-small" is a placeholder text-to-speech model. Replace it with the
# real OpenVoiceV2 checkpoint ID once one is available on the Hugging Face Hub
# (the official OpenVoiceV2 may not be loadable through the Transformers auto
# classes at all — in that case a different integration path is needed).
# On failure, `processor` and `model` are left as None so the UI can report a
# clean error instead of crashing.
try:
    model_id = "suno/bark-small"  # <--- replace with the real OpenVoiceV2 model ID
    processor = AutoProcessor.from_pretrained(model_id)
    model = AutoModelForTextToWaveform.from_pretrained(model_id)
    print(f"Model '{model_id}' loaded successfully using Transformers.")
    # If OpenVoiceV2 additionally needs a separate speaker encoder for voice
    # cloning, it would have to be loaded here as well — TODO confirm against
    # the model's documentation.
except Exception as e:
    print(f"Error loading model using Transformers: {e}")
    print("Please ensure the model is available on Hugging Face Hub for AutoProcessor/AutoModelForTextToWaveform, or check the correct model ID.")
    processor, model = None, None
 
def clone_voice(reference_audio, text_to_speak, language="en"):
    """Synthesize speech for *text_to_speak*, intended to mimic *reference_audio*.

    Args:
        reference_audio: Filepath of the uploaded reference voice (from
            ``gr.Audio(type="filepath")``).
            NOTE(review): with the current placeholder (Bark) model the
            reference audio is only validated, never used — OpenVoiceV2's
            speaker-embedding extraction still has to be wired in here
            (e.g. load the waveform and run a speaker encoder, then pass the
            embedding to ``model.generate``). TODO confirm the exact API.
        text_to_speak: The text to synthesize.
        language: Language code; unused by the placeholder model.

    Returns:
        A ``(output_audio_path, status_message)`` tuple — the path of the
        generated .wav plus a success message, or ``(None, error_message)``.
    """
    if processor is None or model is None:
        return None, "Error: Model could not be loaded at startup. Check logs."

    if reference_audio is None:
        return None, "Error: Please upload a reference audio file."
    # NOTE(review): this guard line was elided at a diff-hunk boundary in the
    # source; reconstructed from the matching error message on the next line.
    if not text_to_speak:
        return None, "Error: Please enter text to speak."

    try:
        # Tokenize/process the input text for the model.
        inputs = processor(text=text_to_speak, return_tensors="pt")

        # Generate the waveform. The original also passed
        # `generation_config=model.generation_config`, which is redundant
        # (generate() already uses the model's own config) and can override the
        # explicit `do_sample` flag on some transformers versions — dropped.
        generated_audio_array = model.generate(**inputs, do_sample=True)

        # Create a temporary .wav path for Gradio. The file is written AFTER
        # the handle is closed (the original wrote inside the `with`, which can
        # fail on Windows where an open file cannot be reopened for writing).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_output_file:
            output_audio_path = tmp_output_file.name
        # NOTE(review): Bark's generation config exposes `sample_rate`; other
        # checkpoints may not — confirm for the final OpenVoiceV2 model.
        sf.write(
            output_audio_path,
            generated_audio_array[0].cpu().numpy(),
            samplerate=model.generation_config.sample_rate,
        )

        print(f"Generated audio saved at: {output_audio_path}")
        return output_audio_path, "Voice generated successfully!"

    except Exception as e:
        print(f"Error during voice cloning: {e}")
        return None, f"Error: {str(e)}. Make sure OpenVoiceV2 model ID is correct and it supports voice cloning with provided inputs."
 
 
 
 
 
82
 
83
  # Gradio Interface Banana
84
  with gr.Blocks() as demo:
 
92
  with gr.Column():
93
  reference_audio_input = gr.Audio(type="filepath", label="Reference Voice (WAV/MP3)")
94
  text_input = gr.Textbox(label="Text to Synthesize", placeholder="Enter text here...")
95
+ # Language dropdown bhi rehne dein agar OpenVoiceV2 support karta ho
96
+ language_input = gr.Dropdown(label="Language (for text accent, optional)", choices=["en", "es", "fr", "zh", "jp", "kr"], value="en")
97
  clone_button = gr.Button("Clone Voice ✨")
98
  with gr.Column():
99
  output_audio = gr.Audio(label="Cloned Voice Output", type="filepath")
 
111
  - Behtareen results ke liye, reference audio saaf (clear) aur kam se kam 5 second ki honi chahiye.
112
  - Background noise kam se kam ho.
113
  - Yeh model research purposes ke liye hai. Iska ghalat istemal na karein.
114
+ **IMPORTANT:** Is app ko OpenVoiceV2 ke sahi Hugging Face model ID se update karna hoga.
115
  """
116
  )
117
 
118
# Entry point: launch the Gradio UI only when the model loaded successfully.
if __name__ == "__main__":
    model_ready = processor is not None and model is not None
    if model_ready:
        print("Launching Gradio app...")
        demo.launch()
    else:
        print("Model failed to load. Gradio app might not function correctly.")