Spaces:

thunnai
/

nanospeech

Runtime error

App Files Files Community

thunnai committed on Feb 10

Commit

f62dcd4

1 Parent(s): 38eabe4

initial setup

Browse files

Files changed (12) hide show

app.py +165 -0
nanospeech/voices/celeste.txt +1 -0
nanospeech/voices/celeste.wav +0 -0
nanospeech/voices/luna.txt +1 -0
nanospeech/voices/luna.wav +0 -0
nanospeech/voices/nash.txt +1 -0
nanospeech/voices/nash.wav +0 -0
nanospeech/voices/orion.txt +1 -0
nanospeech/voices/orion.wav +0 -0
nanospeech/voices/rhea.txt +1 -0
nanospeech/voices/rhea.wav +0 -0
requirements.txt +4 -0

app.py ADDED Viewed

	@@ -0,0 +1,165 @@

+import os
+from glob import glob
+from dataclasses import dataclass
+import gradio as gr
+import soundfile as sf
+from nanospeech.nanospeech_torch import Nanospeech
+from nanospeech.generate import generate_one, SAMPLE_RATE, split_sentences
+import numpy as np
+from typing import Optional
+PROMPT_DIR = 'nanospeech/voices'
+# Note: gradio expects audio as int16, so we need to convert to float32 when loading and convert back when returning
+def convert_audio_int16_to_float32(audio: np.ndarray) -> np.ndarray:
+    return audio.astype(np.float32) / 32768.0
+def convert_audio_float32_to_int16(audio: np.ndarray) -> np.ndarray:
+    return (np.clip(audio, -1.0, 1.0) * 32768.0).astype(np.int16)
+@dataclass
+class VoicePrompt:  # one reference voice: an audio clip plus the text stored alongside it
+    wav_path: str  # path to the reference .wav file
+    text: str  # text read from the .txt file that shares the .wav file's name
+def get_prompt_list(prompt_dir=PROMPT_DIR):
+    wav_paths = glob(os.path.join(prompt_dir, '*.wav'))
+    prompt_lookup: dict[str, VoicePrompt] = {}
+    for wav_path in wav_paths:
+        voice_name = os.path.splitext(os.path.basename(wav_path))[0]
+        text_path = wav_path.replace('.wav', '.txt')
+        with open(text_path, 'r') as f:
+            text = f.read()
+        prompt_lookup[voice_name] = VoicePrompt(
+            wav_path=wav_path,
+            text=text
+        )
+    return prompt_lookup
+def create_demo(prompt_list: dict[str, VoicePrompt], model: 'Nanospeech'):
+    def update_prompt(voice_name: str):
+        return (
+            prompt_list[voice_name].wav_path,
+            prompt_list[voice_name].text
+        )
+    def _generate(prompt_audio: str, prompt_text: str, input_text: str, nfe_steps: int = 8, method: str = "rk4", cfg_strength: float = 2.0, sway_sampling_coef: float = -1.0, speed: float = 1.0, seed: Optional[int] = None):
+        print(f'generating: {input_text}, prompt: {prompt_text}, prompt_audio: {prompt_audio}')
+        # Load reference audio into memory
+        if isinstance(prompt_audio, tuple):
+            sr, ref_audio = prompt_audio
+            ref_audio = convert_audio_int16_to_float32(ref_audio)
+        else:
+            ref_audio, sr = sf.read(prompt_audio)
+            print('loaded from path')
+        if sr != SAMPLE_RATE:
+            raise ValueError("Reference audio must be mono with a sample rate of 24kHz")
+        # Split input text into sentences
+        sentences = split_sentences(input_text)
+        is_single_generation = len(sentences) <= 1
+        if is_single_generation:
+            wave = generate_one(
+                model=model,
+                text=input_text,
+                ref_audio=ref_audio,
+                ref_audio_text=prompt_text,
+                steps=nfe_steps,
+                method=method,
+                cfg_strength=cfg_strength,
+                sway_sampling_coef=sway_sampling_coef,
+                speed=speed,
+                seed=seed,
+                player=None,
+            )
+            if hasattr(wave, 'numpy'):
+                wave = wave.numpy()
+        else:
+            # Generate multiple sentences and concatenate
+            output = []
+            for sentence_text in sentences:
+                wave = generate_one(
+                    model=model,
+                    text=sentence_text,
+                    ref_audio=ref_audio,
+                    ref_audio_text=prompt_text,
+                    steps=nfe_steps,
+                    method=method,
+                    cfg_strength=cfg_strength,
+                    sway_sampling_coef=sway_sampling_coef,
+                    speed=speed,
+                    seed=seed,
+                    player=None,
+                )
+                if hasattr(wave, 'numpy'):
+                    wave = wave.numpy()
+                output.append(wave)
+            wave = np.concatenate(output, axis=0)
+        return (SAMPLE_RATE, wave)
+    with gr.Blocks() as demo:
+        gr.Markdown("# (Unofficial) Nanospeech Demo")
+        gr.Markdown("A simple, hackable text-to-speech system in PyTorch and MLX - [github](https://github.com/lucasnewman/nanospeech)")
+        with gr.Group():
+            gr.Markdown("## Select a voice prompt")
+            voice_dropdown = gr.Dropdown(choices=list(prompt_list.keys()), value='celeste', interactive=True, label="Voice")
+        with gr.Group():
+            gr.Markdown("## Voice Prompt")
+            with gr.Row():
+                prompt_audio = gr.Audio(label="Audio", value=prompt_list[voice_dropdown.value].wav_path)
+                prompt_text = gr.Textbox(label="Text", value=prompt_list[voice_dropdown.value].text, interactive=False)
+            voice_dropdown.change(fn=update_prompt, inputs=voice_dropdown, outputs=[prompt_audio, prompt_text])
+            with gr.Accordion("Advanced Settings", open=False):
+                speed = gr.Slider(label="Speed", value=1.0, minimum=0.1, maximum=2.0, step=0.1)
+                nfe_steps = gr.Slider(label="NFE Steps - more steps = more stable, but slower", value=8, minimum=1, maximum=64, step=1)
+                method = gr.Dropdown(choices=["rk4", "euler", "midpoint"], value="rk4", label="Method")
+                cfg_strength = gr.Slider(label="CFG Strength", value=2.0, minimum=0.0, maximum=5.0, step=0.1)
+                sway_sampling_coef = gr.Slider(label="Sway Sampling Coef", value=-1.0, minimum=-5.0, maximum=5.0, step=0.1)
+        with gr.Group():
+            gr.Markdown("# Generate")
+            input_text = gr.Textbox(label="Input Text", value="Hello, how are you?")
+            generate_button = gr.Button("Generate")
+        with gr.Group():
+            output_audio = gr.Audio(label="Output Audio")
+        generate_button.click(fn=_generate, inputs=[prompt_audio, prompt_text, input_text, nfe_steps, method, cfg_strength, sway_sampling_coef, speed], outputs=output_audio)
+    return demo
+if __name__ == "__main__":
+    # Load the pretrained model once at startup; it is handed to create_demo below
+    model = Nanospeech.from_pretrained("lucasnewman/nanospeech")
+    prompt_list = get_prompt_list()  # no argument: scans the default PROMPT_DIR
+    demo = create_demo(prompt_list, model)
+    demo.launch()  # start the Gradio app

nanospeech/voices/celeste.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ Pickled cucumbers.

nanospeech/voices/celeste.wav ADDED Viewed

Binary file (53.8 kB). View file

nanospeech/voices/luna.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ Do you wish to see him?

nanospeech/voices/luna.wav ADDED Viewed

Binary file (50 kB). View file

nanospeech/voices/nash.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ Look out for what?

nanospeech/voices/nash.wav ADDED Viewed

Binary file (50 kB). View file

nanospeech/voices/orion.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ It isn't his fault.

nanospeech/voices/orion.wav ADDED Viewed

Binary file (48 kB). View file

nanospeech/voices/rhea.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ They haven't met?

nanospeech/voices/rhea.wav ADDED Viewed

Binary file (51.9 kB). View file

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio
+soundfile
+git+https://github.com/thunn/nanospeech.git
+numpy