thunnai committed on
Commit
f62dcd4
·
1 Parent(s): 38eabe4

initial setup

Browse files
app.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from glob import glob
3
+ from dataclasses import dataclass
4
+
5
+ import gradio as gr
6
+ import soundfile as sf
7
+ from nanospeech.nanospeech_torch import Nanospeech
8
+ from nanospeech.generate import generate_one, SAMPLE_RATE, split_sentences
9
+ import numpy as np
10
+ from typing import Optional
11
+
12
+
13
+ PROMPT_DIR = 'nanospeech/voices'
14
+
15
+ # Note: gradio expects audio as int16, so we need to convert to float32 when loading and convert back when returning
16
+
17
def convert_audio_int16_to_float32(audio: np.ndarray) -> np.ndarray:
    """Map int16 PCM samples onto float32 values in [-1.0, 1.0)."""
    # 32768 = 2**15, the full magnitude of the int16 range.
    as_float = audio.astype(np.float32)
    return as_float / 32768.0
19
+
20
def convert_audio_float32_to_int16(audio: np.ndarray) -> np.ndarray:
    """Convert float32 samples in [-1.0, 1.0] to int16 PCM.

    Input is clipped to [-1.0, 1.0] first, then scaled by 32767 (the
    maximum int16 value). The previous implementation scaled by 32768,
    so a full-scale sample of exactly +1.0 produced 32768.0, which
    wraps around to -32768 when cast to int16 — an audible glitch on
    peaking audio. Scaling by 32767 keeps every clipped value in range.
    """
    return (np.clip(audio, -1.0, 1.0) * 32767.0).astype(np.int16)
22
+
23
@dataclass
class VoicePrompt:
    """A reference voice: an audio clip plus its transcript."""
    # Filesystem path to the reference .wav clip for this voice.
    wav_path: str
    # Transcript of that clip, used as the prompt text for generation.
    text: str
27
+
28
def get_prompt_list(prompt_dir=PROMPT_DIR):
    """Discover voice prompts in *prompt_dir*.

    Each voice is a ``<name>.wav`` clip paired with a ``<name>.txt``
    transcript in the same directory.

    Args:
        prompt_dir: Directory to scan for ``*.wav`` files.

    Returns:
        Dict mapping voice name (filename stem) -> VoicePrompt.

    Raises:
        FileNotFoundError: If a .wav file has no matching .txt transcript.
    """
    prompt_lookup: dict[str, VoicePrompt] = {}

    for wav_path in glob(os.path.join(prompt_dir, '*.wav')):
        # Split off the extension once and derive both the voice name
        # and the transcript path from the stem. The previous
        # wav_path.replace('.wav', '.txt') rewrote the FIRST '.wav'
        # occurrence anywhere in the path (e.g. a directory named
        # 'voices.wav/'), not just the file extension.
        stem, _ = os.path.splitext(wav_path)
        voice_name = os.path.basename(stem)
        text_path = stem + '.txt'

        with open(text_path, 'r') as f:
            text = f.read()

        prompt_lookup[voice_name] = VoicePrompt(
            wav_path=wav_path,
            text=text,
        )

    return prompt_lookup
46
+
47
def create_demo(prompt_list: dict[str, VoicePrompt], model: 'Nanospeech'):
    """Build the Gradio Blocks UI for the Nanospeech TTS demo.

    Args:
        prompt_list: Mapping of voice name -> VoicePrompt (reference
            audio path plus its transcript), as built by get_prompt_list.
        model: Preloaded Nanospeech model used for synthesis.

    Returns:
        The constructed gr.Blocks demo (caller is responsible for
        calling .launch()).
    """

    def update_prompt(voice_name: str):
        # Dropdown change handler: swap in the selected voice's
        # reference audio and transcript.
        return (
            prompt_list[voice_name].wav_path,
            prompt_list[voice_name].text
        )

    def _generate(prompt_audio: str, prompt_text: str, input_text: str, nfe_steps: int = 8, method: str = "rk4", cfg_strength: float = 2.0, sway_sampling_coef: float = -1.0, speed: float = 1.0, seed: Optional[int] = None):
        # Synthesize speech for input_text conditioned on the reference
        # audio/transcript. Returns (sample_rate, waveform) in the tuple
        # form Gradio's Audio output component accepts.
        # NOTE(review): despite the `str` annotation, prompt_audio may
        # arrive from gr.Audio as an (sr, int16 ndarray) tuple — both
        # branches below are handled.

        print(f'generating: {input_text}, prompt: {prompt_text}, prompt_audio: {prompt_audio}')


        # Load reference audio into memory
        if isinstance(prompt_audio, tuple):
            # Gradio delivered raw (sample_rate, int16 samples).
            sr, ref_audio = prompt_audio

            ref_audio = convert_audio_int16_to_float32(ref_audio)
        else:
            # Otherwise it's a filesystem path; soundfile returns float64.
            ref_audio, sr = sf.read(prompt_audio)
            print('loaded from path')

        # Model only supports 24 kHz reference audio; reject anything else.
        # (Only the sample rate is checked here; mono-ness is asserted in
        # the message but not verified — TODO confirm upstream handling.)
        if sr != SAMPLE_RATE:
            raise ValueError("Reference audio must be mono with a sample rate of 24kHz")

        # Split input text into sentences
        sentences = split_sentences(input_text)
        is_single_generation = len(sentences) <= 1

        if is_single_generation:
            # Single sentence: one generation pass.
            wave = generate_one(
                model=model,
                text=input_text,
                ref_audio=ref_audio,
                ref_audio_text=prompt_text,
                steps=nfe_steps,
                method=method,
                cfg_strength=cfg_strength,
                sway_sampling_coef=sway_sampling_coef,
                speed=speed,
                seed=seed,
                player=None,
            )
            # generate_one may return a torch tensor; convert for Gradio.
            if hasattr(wave, 'numpy'):
                wave = wave.numpy()
        else:
            # Generate multiple sentences and concatenate
            output = []
            for sentence_text in sentences:
                wave = generate_one(
                    model=model,
                    text=sentence_text,
                    ref_audio=ref_audio,
                    ref_audio_text=prompt_text,
                    steps=nfe_steps,
                    method=method,
                    cfg_strength=cfg_strength,
                    sway_sampling_coef=sway_sampling_coef,
                    speed=speed,
                    seed=seed,
                    player=None,
                )
                if hasattr(wave, 'numpy'):
                    wave = wave.numpy()
                output.append(wave)

            wave = np.concatenate(output, axis=0)

        return (SAMPLE_RATE, wave)


    # ---- UI layout ----
    with gr.Blocks() as demo:
        gr.Markdown("# (Unofficial) Nanospeech Demo")
        gr.Markdown("A simple, hackable text-to-speech system in PyTorch and MLX - [github](https://github.com/lucasnewman/nanospeech)")

        with gr.Group():
            gr.Markdown("## Select a voice prompt")
            # 'celeste' is assumed to exist in prompt_list (shipped voice).
            voice_dropdown = gr.Dropdown(choices=list(prompt_list.keys()), value='celeste', interactive=True, label="Voice")

        with gr.Group():
            gr.Markdown("## Voice Prompt")
            with gr.Row():
                prompt_audio = gr.Audio(label="Audio", value=prompt_list[voice_dropdown.value].wav_path)
                prompt_text = gr.Textbox(label="Text", value=prompt_list[voice_dropdown.value].text, interactive=False)

        # Keep the displayed prompt in sync with the dropdown selection.
        voice_dropdown.change(fn=update_prompt, inputs=voice_dropdown, outputs=[prompt_audio, prompt_text])

        with gr.Accordion("Advanced Settings", open=False):
            speed = gr.Slider(label="Speed", value=1.0, minimum=0.1, maximum=2.0, step=0.1)
            nfe_steps = gr.Slider(label="NFE Steps - more steps = more stable, but slower", value=8, minimum=1, maximum=64, step=1)

            # ODE solver settings for the flow-matching sampler.
            method = gr.Dropdown(choices=["rk4", "euler", "midpoint"], value="rk4", label="Method")
            cfg_strength = gr.Slider(label="CFG Strength", value=2.0, minimum=0.0, maximum=5.0, step=0.1)
            sway_sampling_coef = gr.Slider(label="Sway Sampling Coef", value=-1.0, minimum=-5.0, maximum=5.0, step=0.1)

        with gr.Group():
            gr.Markdown("# Generate")

            input_text = gr.Textbox(label="Input Text", value="Hello, how are you?")
            generate_button = gr.Button("Generate")

        with gr.Group():
            output_audio = gr.Audio(label="Output Audio")

        # `seed` is not wired to a UI control, so _generate's default
        # (None, i.e. non-deterministic) is always used.
        generate_button.click(fn=_generate, inputs=[prompt_audio, prompt_text, input_text, nfe_steps, method, cfg_strength, sway_sampling_coef, speed], outputs=output_audio)

    return demo
154
+
155
+
156
if __name__ == "__main__":


    # Preload the model weights from the Hugging Face hub and scan the
    # bundled voice-prompt directory before building the UI, so the
    # first generation request doesn't pay the load cost.
    model = Nanospeech.from_pretrained("lucasnewman/nanospeech")
    prompt_list = get_prompt_list()

    # Build the Gradio app and start serving it (blocks until exit).
    demo = create_demo(prompt_list, model)
    demo.launch()
165
+
nanospeech/voices/celeste.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Pickled cucumbers.
nanospeech/voices/celeste.wav ADDED
Binary file (53.8 kB). View file
 
nanospeech/voices/luna.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Do you wish to see him?
nanospeech/voices/luna.wav ADDED
Binary file (50 kB). View file
 
nanospeech/voices/nash.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Look out for what?
nanospeech/voices/nash.wav ADDED
Binary file (50 kB). View file
 
nanospeech/voices/orion.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ It isn't his fault.
nanospeech/voices/orion.wav ADDED
Binary file (48 kB). View file
 
nanospeech/voices/rhea.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ They haven't met?
nanospeech/voices/rhea.wav ADDED
Binary file (51.9 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio
2
+ soundfile
3
+ git+https://github.com/thunn/nanospeech.git
4
+ numpy