Update app.py
app.py
CHANGED
@@ -187,6 +187,126 @@ def create_tts_fn(hps, net_g, device):
         return "Success", (hps.data.sampling_rate, audio)
     return tts_fn
 
+
+# Function to build a single tab per model
+def create_tab(title, example, speakers, tts_fn, split_fn, repid):
+    with gr.TabItem(speakers[0]):
+        gr.Markdown(
+            '<div align="center">'
+            f'<a><strong>{repid}</strong></a>'
+            '<br>'
+            f'<a><strong>{title}</strong></a>'
+            '<br>'
+            f'<a><strong>{speakers}</strong></a>'
+            '</div>'
+        )
+        with gr.Row():
+            with gr.Column():
+                input_text = gr.Textbox(label="Input text", lines=5, value=example)
+                speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="Speaker")
+                prompt_mode = gr.Radio(["Text prompt", "Audio prompt"], label="Prompt Mode", value="Text prompt")
+                text_prompt = gr.Textbox(label="Text prompt", value="Happy", visible=True)
+                audio_prompt = gr.Audio(label="Audio prompt", type="filepath", visible=False)
+                sdp_ratio = gr.Slider(0, 1, 0.2, 0.1, label="SDP Ratio")
+                noise_scale = gr.Slider(0.1, 2.0, 0.6, 0.1, label="Noise")
+                noise_scale_w = gr.Slider(0.1, 2.0, 0.8, 0.1, label="Noise_W")
+                length_scale = gr.Slider(0.1, 2.0, 1.0, 0.1, label="Length")
+                language = gr.Dropdown(choices=["JP", "ZH", "EN", "mix", "auto"], value="JP", label="Language")
+                btn = gr.Button("Generate Audio", variant="primary")
+
+            with gr.Column():
+                with gr.Accordion("Semantic Fusion", open=False):
+                    gr.Markdown(
+                        value="Use auxiliary text semantics to assist speech generation (the language remains the same as the main text).\n\n"
+                        "**Note**: Avoid *command-style text* (e.g., 'Happy'); use *emotionally rich text* (e.g., 'I'm so happy!!!').\n\n"
+                        "Leave it blank to disable.\n\n"
+                        "**If mispronunciations occur, try replacing characters and entering the original text here with the weight set to 1.0 for semantic retention.**"
+                    )
+                    style_text = gr.Textbox(label="Auxiliary Text")
+                    style_weight = gr.Slider(0, 1, 0.7, 0.1, label="Weight", info="Ratio between main and auxiliary BERT embeddings")
+
+        with gr.Row():
+            with gr.Column():
+                interval_between_sent = gr.Slider(0, 5, 0.2, 0.1, label="Pause between sentences (sec)")
+                interval_between_para = gr.Slider(0, 10, 1, 0.1, label="Pause between paragraphs (sec)")
+                opt_cut_by_sent = gr.Checkbox(label="Split by sentence")
+                slicer = gr.Button("Split and Generate", variant="primary")
+
+            with gr.Column():
+                output_msg = gr.Textbox(label="Output Message")
+                output_audio = gr.Audio(label="Output Audio")
+
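+        # Wire UI events; gr_util and load_audio are helpers defined elsewhere in app.py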
+        prompt_mode.change(lambda x: gr_util(x), inputs=[prompt_mode], outputs=[text_prompt, audio_prompt])
+        audio_prompt.upload(lambda x: load_audio(x), inputs=[audio_prompt], outputs=[audio_prompt])
+        btn.click(
+            tts_fn,
+            inputs=[
+                input_text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, language,
+                audio_prompt, text_prompt, prompt_mode, style_text, style_weight
+            ],
+            outputs=[output_msg, output_audio],
+        )
+        slicer.click(
+            split_fn,
+            inputs=[
+                input_text, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale, language,
+                opt_cut_by_sent, interval_between_para, interval_between_sent,
+                audio_prompt, text_prompt, style_text, style_weight
+            ],
+            outputs=[output_msg, output_audio],
+        )
+
+# --- Main entry point ---
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--share", default=False, action="store_true", help="make the link public")
+    parser.add_argument("-d", "--debug", action="store_true", help="enable DEBUG-level logging")
+    args = parser.parse_args()
+
+    if args.debug:
+        logger.setLevel(logging.DEBUG)
+
+    with open("pretrained_models/info.json", "r", encoding="utf-8") as f:
+        models_info = json.load(f)
+
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    models = []
+    for _, info in models_info.items():
+        if not info['enable']:
+            continue
+        name, title, repid, example, filename = info['name'], info['title'], info['repid'], info['example'], info['filename']
+
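+        # The checkpoint may live at the repo root or in a subfolder; scan the repo file list to find it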
+        files = list_repo_files(repo_id=repid)
+        model_subfolder = None
+        for f in files:
+            if f.endswith(filename):
+                parts = f.split("/")
+                if len(parts) > 1:
+                    model_subfolder = "/".join(parts[:-1])
+                break
+
+        if model_subfolder:
+            model_path = hf_hub_download(repo_id=repid, filename=filename, subfolder=model_subfolder)
+            config_path = hf_hub_download(repo_id=repid, filename="config.json", subfolder=model_subfolder)
+        else:
+            model_path = hf_hub_download(repo_id=repid, filename=filename)
+            config_path = hf_hub_download(repo_id=repid, filename="config.json")
+
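+        # Load hyperparameters, build the synthesizer, and create per-model inference closures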
+        hps = utils.get_hparams_from_file(config_path)
+        version = hps.version if hasattr(hps, "version") else "v2"
+        net_g = get_net_g(model_path, version, device, hps)
+        tts_fn = create_tts_fn(hps, net_g, device)
+        split_fn = create_split_fn(hps, net_g, device)
+        models.append((title, example, list(hps.data.spk2id.keys()), tts_fn, split_fn, repid))
+
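+    # Assemble the UI: one tab per loaded model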
+    with gr.Blocks(theme='NoCrypt/miku') as app:
+        gr.Markdown("## ✅ All models loaded successfully. Ready to use.")
+        with gr.Tabs():
+            for (title, example, speakers, tts_fn, split_fn, repid) in models:
+                create_tab(title, example, speakers, tts_fn, split_fn, repid)
+
+    app.queue().launch(share=args.share)
+
 # Then patch create_tab to accept split_fn and use it in slicer.click
 # And in the model loop, generate both tts_fn and split_fn, then pass both into create_tab
 # (Same as your current setup, but now split_fn is isolated per model, just like tts_fn)
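
Note: the hunk calls several helpers that are defined elsewhere in app.py and do not appear in this diff: gr_util, load_audio, get_net_g, create_tts_fn, and create_split_fn. For orientation only, the sketch below shows plausible shapes for gr_util and load_audio inferred from their call sites above: gr_util must return visibility updates for [text_prompt, audio_prompt], and load_audio must map an uploaded file back onto the audio component. The bodies are assumptions, not this Space's actual code.

# Hypothetical sketch, inferred from the call sites above -- not this Space's code.
import gradio as gr
import librosa

def gr_util(mode):
    # prompt_mode.change(...) expects one update per output in [text_prompt, audio_prompt]:
    # show the textbox in "Text prompt" mode, the audio widget otherwise.
    if mode == "Text prompt":
        return gr.update(visible=True), gr.update(visible=False)
    return gr.update(visible=False), gr.update(visible=True)

def load_audio(path):
    # audio_prompt.upload(...) feeds the component back to itself, so a plausible
    # job is resampling the uploaded reference clip (target rate assumed here).
    audio, sr = librosa.load(path, sr=48000)
    return sr, audio

create_split_fn presumably mirrors create_tts_fn: it closes over (hps, net_g, device), cuts the input into paragraphs (and optionally sentences when "Split by sentence" is checked), synthesizes each piece, and joins the audio with the configured pauses, which is why each model gets its own isolated split_fn in the loop above.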