0xrushi committed
Commit ef09716 · 1 Parent(s): 2c5fb56
data/{ref_weights.pkl → 15sec.wav} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:76513fe3c720861d8c165113a4844336c957422d3c967e9a3e5300d1a1293bfe
- size 126
+ oid sha256:d60bbb8b75f74fbd5793f32895fcbbe4587e5efc2b464a328d812dec86583cd3
+ size 5810220
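
The LFS pointer now tracks a ~5.8 MB reference clip (data/15sec.wav) instead of the 126-byte ref_weights.pkl. To sanity-check that the object pulled by Git LFS matches this pointer, a minimal verification sketch (not part of the commit; it assumes the file has been checked out at data/15sec.wav) could be:

import hashlib
import os

# expected values copied from the new LFS pointer above
EXPECTED_OID = "d60bbb8b75f74fbd5793f32895fcbbe4587e5efc2b464a328d812dec86583cd3"
EXPECTED_SIZE = 5810220

path = "data/15sec.wav"  # assumes `git lfs pull` has replaced the pointer with the real file

# cheap size check first, then the full sha256 digest
assert os.path.getsize(path) == EXPECTED_SIZE, "size mismatch"
with open(path, "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()
assert digest == EXPECTED_OID, "sha256 mismatch"
print("data/15sec.wav matches its LFS pointer")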
scripts/f5py.py CHANGED
@@ -25,9 +25,9 @@ from f5_tts.infer.utils_infer import (
      infer_process,
      load_model,
      load_vocoder,
+     preprocess_ref_audio_text,
      remove_silence_for_generated_wav,
  )
- from ref_utils import load_ref_weights
  
  # ── USER CONFIG ────────────────────────────────────────────────────────────────
  config_path = "infer/examples/basic/basic.toml"
@@ -35,6 +35,7 @@ model = "F5TTS_v1_Base"
  model_cfg_path = None  # e.g. "path/to/your/model.yaml", or leave None to use default from config
  ckpt_file = ""  # leave blank to pull from HF cache
  vocab_file = ""  # leave blank to use default
+ ref_audio = "data/15sec.wav"
  ref_text = (
      "Fuck your phone. Stop texting all the time. "
      "Look up from your phone and breathe. Release yourself."
@@ -74,14 +75,14 @@ fix_duration = config.get("fix_duration", fix_duration)
  device = config.get("device", device)
  
  # if user pointed at example paths inside the package, fix them
- # if "infer/examples/" in ref_audio:
- #     ref_audio = str(files("f5_tts").joinpath(ref_audio))
- # if gen_file and "infer/examples/" in gen_file:
- #     gen_file = str(files("f5_tts").joinpath(gen_file))
- # if "voices" in config:
- #     for v in config["voices"].values():
- #         if "infer/examples/" in v.get("ref_audio", ""):
- #             v["ref_audio"] = str(files("f5_tts").joinpath(v["ref_audio"]))
+ if "infer/examples/" in ref_audio:
+     ref_audio = str(files("f5_tts").joinpath(ref_audio))
+ if gen_file and "infer/examples/" in gen_file:
+     gen_file = str(files("f5_tts").joinpath(gen_file))
+ if "voices" in config:
+     for v in config["voices"].values():
+         if "infer/examples/" in v.get("ref_audio", ""):
+             v["ref_audio"] = str(files("f5_tts").joinpath(v["ref_audio"]))
  
  # if using a gen_file, load its text
  if gen_file:
@@ -142,7 +143,7 @@ ema_model = load_model(
  )
  
  
- def generate_tts(input_text, output_dir="tests", output_file=None, ref_text=None):
+ def generate_tts(input_text, output_dir="tests", output_file=None, ref_audio=ref_audio, ref_text=None):
      """
      Generate text-to-speech audio from input text.
  
@@ -150,6 +151,7 @@ def generate_tts(input_text, output_dir="tests", output_file=None, ref_text=None
          input_text (str): Text to convert to speech
          output_dir (str): Directory to save the output file (default: "tests")
          output_file (str): Output filename (default: auto-generated based on timestamp)
+         ref_audio (str): Reference audio file (default: "15sec.wav")
          ref_text (str): Reference text (default: predefined text)
  
      Returns:
@@ -166,10 +168,18 @@ def generate_tts(input_text, output_dir="tests", output_file=None, ref_text=None
      if output_file is None:
          output_file = f"infer_cli_{datetime.now():%Y%m%d_%H%M%S}.wav"
  
-     # load preprocessed reference weights
-     base_dir = os.path.dirname(os.path.dirname(__file__))
-     pkl_path = os.path.join(base_dir, "data", "ref_weights.pkl")
-     voices = load_ref_weights(pkl_path)
+     # assemble voices dict
+     main_voice = {"ref_audio": ref_audio, "ref_text": ref_text}
+     voices = {"main": main_voice}
+     if "voices" in config:
+         voices.update(config["voices"])
+         voices["main"] = main_voice
+ 
+     # preprocess all references
+     for name, v in voices.items():
+         v["ref_audio"], v["ref_text"] = preprocess_ref_audio_text(
+             v["ref_audio"], v["ref_text"]
+         )
  
      # break text into per‑voice chunks
      reg1 = r"(?=\[\w+\])"
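
Net effect of the script change: instead of unpickling precomputed reference weights from data/ref_weights.pkl, the script now ships a raw reference clip (ref_audio = "data/15sec.wav"), exposes it as a generate_tts parameter, and runs every voice's audio/text pair through preprocess_ref_audio_text at call time. A hedged usage sketch, assuming scripts/ is on the import path and that generate_tts returns the path of the generated WAV (the return value is not visible in this diff):

# all names below follow the diff; the import path and the return value are assumptions
from f5py import generate_tts

# default reference: data/15sec.wav with the predefined ref_text
out_path = generate_tts("Hello from the new fifteen-second reference clip.")

# override the reference for a single call (hypothetical files, not in the commit)
out_path = generate_tts(
    "Another line, rendered with a different voice reference.",
    output_dir="tests",
    output_file="custom_voice.wav",
    ref_audio="data/other_ref.wav",
    ref_text="Transcript of other_ref.wav",
)
print(out_path)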