Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -34,11 +34,21 @@ def generate(input, language, speaker_audio, emotion_happy, emotion_sad, emotion
 
     speaker_embedding = None
     if speaker_audio is not None:
+        print(1)
+        print(speaker_audio)
         wav, sr = torchaudio.load(speaker_audio)
+        print(2)
+        print(wav)
+        print(sr)
         speaker_embedding = (model.make_speaker_embedding(wav, sr).to(device, dtype=torch.bfloat16))
+        print(3)
+        print(speaker_embedding)
 
     emotion_tensor = torch.tensor([emotion_happy, emotion_sad, emotion_disgust, emotion_fear, emotion_surprise, emotion_anger, emotion_other, emotion_neutral], device=device, dtype=torch.bfloat16)
     vq_tensor = torch.tensor([clarity] * 8, device=device, dtype=torch.bfloat16).unsqueeze(0)
+    print(4)
+    print(emotion_tensor)
+    print(vq_tensor)
 
     cond_dict = make_cond_dict(
         text=input,
@@ -52,8 +62,12 @@ def generate(input, language, speaker_audio, emotion_happy, emotion_sad, emotion
         dnsmos_ovrl=float(dnsmos_ovrl),
         device=device,
     )
+    print(5)
+    print(cond_dict)
 
     conditioning = model.prepare_conditioning(cond_dict)
+    print(6)
+    print(conditioning)
 
     codes = model.generate(
         prefix_conditioning=conditioning,
@@ -62,12 +76,20 @@ def generate(input, language, speaker_audio, emotion_happy, emotion_sad, emotion
         batch_size=1,
         sampling_params=dict(min_p=float(min_p)),
     )
+    print(7)
+    print(codes)
 
     wav_out = model.autoencoder.decode(codes).cpu().detach()
    sr_out = model.autoencoder.sampling_rate
+    print(8)
+    print(wav_out)
+    print(sr_out)
 
     if wav_out.dim() == 2 and wav_out.size(0) > 1: wav_out = wav_out[0:1, :]
-
+
+    print(9)
+    print((sr_out, wav_out.squeeze().numpy()))
+
     return (sr_out, wav_out.squeeze().numpy())
 
 # Initialize
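
The numbered print pairs added above are temporary trace points: each integer marks a checkpoint in the generate pipeline, and the print that follows it dumps the value produced at that step. If instrumentation like this were to stay in the Space, Python's standard logging module would make the same checkpoints filterable and easy to silence at once. A minimal sketch of that alternative, not part of this commit; the logger name, format string, and placeholder value are assumptions:

import logging

# Editorial sketch: one numbered checkpoint as a structured debug log line.
logging.basicConfig(level=logging.DEBUG, format="%(levelname)s %(name)s: %(message)s")
log = logging.getLogger("app")

speaker_audio = "reference.wav"  # stand-in for the Gradio audio input
# Replaces the print(1) / print(speaker_audio) pair from the diff above.
log.debug("step 1: speaker_audio=%r", speaker_audio)

Raising the level to logging.INFO would then silence every checkpoint in one place, where the bare print calls have to be deleted line by line.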