Commit 7438313 (1 parent: fae012e)
Commit message: latest
chatterbox/src/chatterbox/__pycache__/tts.cpython-311.pyc
CHANGED
Binary files a/chatterbox/src/chatterbox/__pycache__/tts.cpython-311.pyc and b/chatterbox/src/chatterbox/__pycache__/tts.cpython-311.pyc differ
chatterbox/src/chatterbox/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc
CHANGED
Binary files a/chatterbox/src/chatterbox/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc and b/chatterbox/src/chatterbox/models/t3/inference/__pycache__/t3_hf_backend.cpython-311.pyc differ
chatterbox/src/chatterbox/tts.py
CHANGED
@@ -6,7 +6,6 @@ import torch
 import perth
 import torch.nn.functional as F
 from huggingface_hub import hf_hub_download
-from silero_vad import load_silero_vad, get_speech_timestamps
 
 from .models.t3 import T3
 from .models.s3tokenizer import S3_SR, drop_invalid_tokens
@@ -14,23 +13,11 @@ from .models.s3gen import S3GEN_SR, S3Gen
 from .models.tokenizers import EnTokenizer
 from .models.voice_encoder import VoiceEncoder
 from .models.t3.modules.cond_enc import T3Cond
-from .utils import trim_silence
 
 
 REPO_ID = "ResembleAI/chatterbox"
 
 
-def change_pace(speech_tokens: torch.Tensor, pace: float):
-    """
-    :param speech_tokens: Tensor of shape (L,)
-    :param pace: float, pace (default: 1)
-    """
-    L = len(speech_tokens)
-    speech_tokens = F.interpolate(speech_tokens.view(1, 1, -1).float(), size=int(L / pace), mode="nearest")
-    speech_tokens = speech_tokens.view(-1).long()
-    return speech_tokens
-
-
 def punc_norm(text: str) -> str:
     """
     Quick cleanup func for punctuation from LLMs or
@@ -134,7 +121,6 @@ class ChatterboxTTS:
         self.device = device
         self.conds = conds
         self.watermarker = perth.PerthImplicitWatermarker()
-        self.vad_model = load_silero_vad()
 
     @classmethod
     def from_local(cls, ckpt_dir, device) -> 'ChatterboxTTS':
@@ -182,19 +168,6 @@ class ChatterboxTTS:
 
         ref_16k_wav = librosa.resample(s3gen_ref_wav, orig_sr=S3GEN_SR, target_sr=S3_SR)
 
-        vad_wav = ref_16k_wav
-        if S3_SR != 16000:
-            vad_wav = librosa.resample(ref_16k_wav, orig_sr=S3_SR, target_sr=16000)
-
-        speech_timestamps = get_speech_timestamps(
-            vad_wav,
-            self.vad_model,
-            return_seconds=True,
-        )
-
-        # s3gen_ref_wav = trim_silence(s3gen_ref_wav, speech_timestamps, S3GEN_SR)
-        # ref_16k_wav = trim_silence(ref_16k_wav, speech_timestamps, S3_SR)
-
         s3gen_ref_wav = s3gen_ref_wav[:self.DEC_COND_LEN]
         s3gen_ref_dict = self.s3gen.embed_ref(s3gen_ref_wav, S3GEN_SR, device=self.device)
 
@@ -220,8 +193,7 @@ class ChatterboxTTS:
         text,
         audio_prompt_path=None,
         exaggeration=0.5,
-        cfg_weight=0,
-        pace=1,
+        cfg_weight=0.5,
         temperature=0.8,
     ):
         if audio_prompt_path:
@@ -263,8 +235,6 @@ class ChatterboxTTS:
         speech_tokens = drop_invalid_tokens(speech_tokens)
         speech_tokens = speech_tokens.to(self.device)
 
-        speech_tokens = change_pace(speech_tokens, pace=pace)
-
         wav, _ = self.s3gen.inference(
            speech_tokens=speech_tokens,
            ref_dict=self.conds.gen,
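Caller-facing effect of the signature change: the pace argument is gone and cfg_weight now defaults to 0.5 instead of 0. A hypothetical usage sketch; the method name generate, the checkpoint path, and the prompt file name are assumptions not shown in this hunk:

from chatterbox.tts import ChatterboxTTS

# Hypothetical call after this commit: no `pace` kwarg; cfg_weight defaults to 0.5.
model = ChatterboxTTS.from_local("checkpoints/chatterbox", device="cuda")  # path is illustrative
wav = model.generate(
    "Hello from Chatterbox.",
    audio_prompt_path="reference.wav",  # illustrative reference clip
    exaggeration=0.5,
    temperature=0.8,
    # cfg_weight left at its new default of 0.5; passing pace=... would now raise a TypeError
)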
chatterbox/src/chatterbox/utils.py
CHANGED
@@ -1,15 +1,15 @@
-import numpy as np
-
-
-def trim_silence(wav, speech_timestamps, sr):
-    """TODO: fading"""
-    if len(speech_timestamps) == 0:
-        return wav  # WARNING: no speech detected, returning original wav
-    segs = []
-    for segment in speech_timestamps:
-        start_s, end_s = segment['start'], segment['end']
-        start = int(start_s * sr)
-        end = int(end_s * sr)
-        seg = wav[start: end]
-        segs.append(seg)
-    return np.concatenate(segs)
+# import numpy as np
+#
+#
+# def trim_silence(wav, speech_timestamps, sr):
+#     """TODO: fading"""
+#     if len(speech_timestamps) == 0:
+#         return wav  # WARNING: no speech detected, returning original wav
+#     segs = []
+#     for segment in speech_timestamps:
+#         start_s, end_s = segment['start'], segment['end']
+#         start = int(start_s * sr)
+#         end = int(end_s * sr)
+#         seg = wav[start: end]
+#         segs.append(seg)
+#     return np.concatenate(segs)
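The silero-vad based trimming is disabled here rather than deleted. For reference, a runnable sketch of the commented-out trim_silence logic, driven by handcrafted timestamps instead of the real VAD model; the dummy waveform and timestamp values are illustrative:

import numpy as np

def trim_silence(wav, speech_timestamps, sr):
    # Keep only the spans marked as speech; timestamps are in seconds,
    # matching silero-vad's get_speech_timestamps(..., return_seconds=True) output.
    if len(speech_timestamps) == 0:
        return wav  # no speech detected, return the original waveform
    segs = [wav[int(t['start'] * sr): int(t['end'] * sr)] for t in speech_timestamps]
    return np.concatenate(segs)

sr = 16000
wav = np.zeros(3 * sr, dtype=np.float32)                                # 3 s dummy waveform
timestamps = [{'start': 0.5, 'end': 1.0}, {'start': 1.8, 'end': 2.6}]   # handcrafted, not VAD output
print(trim_silence(wav, timestamps, sr).shape)                          # (20800,) = 0.5 s + 0.8 s kept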