Spaces:
Sleeping
Sleeping
0xrushi
committed on
Commit
·
ef09716
1
Parent(s):
2c5fb56
rest
Browse files- data/{ref_weights.pkl → 15sec.wav} +2 -2
- scripts/f5py.py +24 -14
data/{ref_weights.pkl → 15sec.wav}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d60bbb8b75f74fbd5793f32895fcbbe4587e5efc2b464a328d812dec86583cd3
|
3 |
+
size 5810220
|
scripts/f5py.py
CHANGED
@@ -25,9 +25,9 @@ from f5_tts.infer.utils_infer import (
|
|
25 |
infer_process,
|
26 |
load_model,
|
27 |
load_vocoder,
|
|
|
28 |
remove_silence_for_generated_wav,
|
29 |
)
|
30 |
-
from ref_utils import load_ref_weights
|
31 |
|
32 |
# ── USER CONFIG ────────────────────────────────────────────────────────────────
|
33 |
config_path = "infer/examples/basic/basic.toml"
|
@@ -35,6 +35,7 @@ model = "F5TTS_v1_Base"
|
|
35 |
model_cfg_path = None # e.g. "path/to/your/model.yaml", or leave None to use default from config
|
36 |
ckpt_file = "" # leave blank to pull from HF cache
|
37 |
vocab_file = "" # leave blank to use default
|
|
|
38 |
ref_text = (
|
39 |
"Fuck your phone. Stop texting all the time. "
|
40 |
"Look up from your phone and breathe. Release yourself."
|
@@ -74,14 +75,14 @@ fix_duration = config.get("fix_duration", fix_duration)
|
|
74 |
device = config.get("device", device)
|
75 |
|
76 |
# if user pointed at example paths inside the package, fix them
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
|
86 |
# if using a gen_file, load its text
|
87 |
if gen_file:
|
@@ -142,7 +143,7 @@ ema_model = load_model(
|
|
142 |
)
|
143 |
|
144 |
|
145 |
-
def generate_tts(input_text, output_dir="tests", output_file=None, ref_text=None):
|
146 |
"""
|
147 |
Generate text-to-speech audio from input text.
|
148 |
|
@@ -150,6 +151,7 @@ def generate_tts(input_text, output_dir="tests", output_file=None, ref_text=None
|
|
150 |
input_text (str): Text to convert to speech
|
151 |
output_dir (str): Directory to save the output file (default: "tests")
|
152 |
output_file (str): Output filename (default: auto-generated based on timestamp)
|
|
|
153 |
ref_text (str): Reference text (default: predefined text)
|
154 |
|
155 |
Returns:
|
@@ -166,10 +168,18 @@ def generate_tts(input_text, output_dir="tests", output_file=None, ref_text=None
|
|
166 |
if output_file is None:
|
167 |
output_file = f"infer_cli_{datetime.now():%Y%m%d_%H%M%S}.wav"
|
168 |
|
169 |
-
#
|
170 |
-
|
171 |
-
|
172 |
-
voices
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
173 |
|
174 |
# break text into per-voice chunks
|
175 |
reg1 = r"(?=\[\w+\])"
|
|
|
25 |
infer_process,
|
26 |
load_model,
|
27 |
load_vocoder,
|
28 |
+
preprocess_ref_audio_text,
|
29 |
remove_silence_for_generated_wav,
|
30 |
)
|
|
|
31 |
|
32 |
# ── USER CONFIG ────────────────────────────────────────────────────────────────
|
33 |
config_path = "infer/examples/basic/basic.toml"
|
|
|
35 |
model_cfg_path = None # e.g. "path/to/your/model.yaml", or leave None to use default from config
|
36 |
ckpt_file = "" # leave blank to pull from HF cache
|
37 |
vocab_file = "" # leave blank to use default
|
38 |
+
ref_audio = "data/15sec.wav"
|
39 |
ref_text = (
|
40 |
"Fuck your phone. Stop texting all the time. "
|
41 |
"Look up from your phone and breathe. Release yourself."
|
|
|
75 |
device = config.get("device", device)
|
76 |
|
77 |
# if user pointed at example paths inside the package, fix them
|
78 |
+
if "infer/examples/" in ref_audio:
|
79 |
+
ref_audio = str(files("f5_tts").joinpath(ref_audio))
|
80 |
+
if gen_file and "infer/examples/" in gen_file:
|
81 |
+
gen_file = str(files("f5_tts").joinpath(gen_file))
|
82 |
+
if "voices" in config:
|
83 |
+
for v in config["voices"].values():
|
84 |
+
if "infer/examples/" in v.get("ref_audio", ""):
|
85 |
+
v["ref_audio"] = str(files("f5_tts").joinpath(v["ref_audio"]))
|
86 |
|
87 |
# if using a gen_file, load its text
|
88 |
if gen_file:
|
|
|
143 |
)
|
144 |
|
145 |
|
146 |
+
def generate_tts(input_text, output_dir="tests", output_file=None, ref_audio=ref_audio, ref_text=None):
|
147 |
"""
|
148 |
Generate text-to-speech audio from input text.
|
149 |
|
|
|
151 |
input_text (str): Text to convert to speech
|
152 |
output_dir (str): Directory to save the output file (default: "tests")
|
153 |
output_file (str): Output filename (default: auto-generated based on timestamp)
|
154 |
+
ref_audio (str): Reference audio file (default: "15sec.wav")
|
155 |
ref_text (str): Reference text (default: predefined text)
|
156 |
|
157 |
Returns:
|
|
|
168 |
if output_file is None:
|
169 |
output_file = f"infer_cli_{datetime.now():%Y%m%d_%H%M%S}.wav"
|
170 |
|
171 |
+
# assemble voices dict
|
172 |
+
main_voice = {"ref_audio": ref_audio, "ref_text": ref_text}
|
173 |
+
voices = {"main": main_voice}
|
174 |
+
if "voices" in config:
|
175 |
+
voices.update(config["voices"])
|
176 |
+
voices["main"] = main_voice
|
177 |
+
|
178 |
+
# preprocess all references
|
179 |
+
for name, v in voices.items():
|
180 |
+
v["ref_audio"], v["ref_text"] = preprocess_ref_audio_text(
|
181 |
+
v["ref_audio"], v["ref_text"]
|
182 |
+
)
|
183 |
|
184 |
# break text into per-voice chunks
|
185 |
reg1 = r"(?=\[\w+\])"
|