Commit 755994c · Parent: c24b656

mix

Changed files:
- app.py (+19 -4)
- configs/base.yaml (+2 -1)
- vits/data_utils.py (+27 -10)
- vits/models.py (+16 -9)
- vits_pretrain/{sovits5.0_bigvgan.pth → sovits5.0_bigvgan_mix.pth} (+2 -2)
- whisper/inference.py (+2 -1)
app.py
CHANGED
@@ -1,3 +1,5 @@
+import sys,os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 from vits.models import SynthesizerInfer
 from omegaconf import OmegaConf
 import torchcrepe
@@ -72,7 +74,7 @@ model = SynthesizerInfer(
     hp.data.filter_length // 2 + 1,
     hp.data.segment_size // hp.data.hop_length,
     hp)
-load_svc_model("vits_pretrain/sovits5.0_bigvgan.pth", model)
+load_svc_model("vits_pretrain/sovits5.0_bigvgan_mix.pth", model)
 model.eval()
 model.to(device)

@@ -81,6 +83,8 @@ def svc_change(argswave, argsspk):

     argsppg = "svc_tmp.ppg.npy"
     os.system(f"python whisper/inference.py -w {argswave} -p {argsppg}")
+    argsvec = "svc_tmp.vec.npy"
+    os.system(f"python hubert/inference.py -w {argswave} -v {argsvec}")

     spk = np.load(argsspk)
     spk = torch.FloatTensor(spk)
@@ -89,13 +93,20 @@ def svc_change(argswave, argsspk):
     ppg = np.repeat(ppg, 2, 0) # 320 PPG -> 160 * 2
     ppg = torch.FloatTensor(ppg)

+    vec = np.load(argsvec)
+    vec = np.repeat(vec, 2, 0) # 320 PPG -> 160 * 2
+    vec = torch.FloatTensor(vec)
+
     pit = compute_f0_nn(argswave, device)
     pit = torch.FloatTensor(pit)

     len_pit = pit.size()[0]
+    len_vec = vec.size()[0]
     len_ppg = ppg.size()[0]
-    len_min = min(len_pit, len_ppg)
+    len_min = min(len_pit, len_vec)
+    len_min = min(len_min, len_ppg)
     pit = pit[:len_min]
+    vec = vec[:len_min, :]
     ppg = ppg[:len_min, :]

     with torch.no_grad():
@@ -129,11 +140,12 @@ def svc_change(argswave, argsspk):
             cut_e_out = -1 * hop_frame * hop_size

             sub_ppg = ppg[cut_s:cut_e, :].unsqueeze(0).to(device)
+            sub_vec = vec[cut_s:cut_e, :].unsqueeze(0).to(device)
             sub_pit = pit[cut_s:cut_e].unsqueeze(0).to(device)
             sub_len = torch.LongTensor([cut_e - cut_s]).to(device)
             sub_har = source[:, :, cut_s *
                              hop_size:cut_e * hop_size].to(device)
-            sub_out = model.inference(sub_ppg, sub_pit, spk, sub_len, sub_har)
+            sub_out = model.inference(sub_ppg, sub_vec, sub_pit, spk, sub_len, sub_har)
             sub_out = sub_out[0, 0].data.cpu().detach().numpy()

             sub_out = sub_out[cut_s_out:cut_e_out]
@@ -148,10 +160,11 @@ def svc_change(argswave, argsspk):
             cut_s = 0
             cut_s_out = 0
         sub_ppg = ppg[cut_s:, :].unsqueeze(0).to(device)
+        sub_vec = vec[cut_s:, :].unsqueeze(0).to(device)
         sub_pit = pit[cut_s:].unsqueeze(0).to(device)
         sub_len = torch.LongTensor([all_frame - cut_s]).to(device)
         sub_har = source[:, :, cut_s * hop_size:].to(device)
-        sub_out = model.inference(sub_ppg, sub_pit, spk, sub_len, sub_har)
+        sub_out = model.inference(sub_ppg, sub_vec, sub_pit, spk, sub_len, sub_har)
         sub_out = sub_out[0, 0].data.cpu().detach().numpy()

         sub_out = sub_out[cut_s_out:]
@@ -187,6 +200,8 @@ with app:

     https://github.com/Multi-Singer/Multi-Singer.github.io

+    mix_encoder: whisper + hubert, 提升跨语言能力和纯对白语音训练的效果
+
     [轻度伴奏可以无需去伴奏]就能直接进行歌声转换的SVC库
     """)
     sid = gr.Dropdown(label="音色", choices=[
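After this change the demo extracts two content features per utterance and crops all streams to a common frame count before chunked inference. A minimal sketch of that flow, assuming (as the hunks above imply) that whisper/inference.py writes a (T, 1024) PPG and hubert/inference.py writes a (T, 256) vector array at a 320-sample hop:

import os
import numpy as np
import torch

def extract_content_features(wav_path):
    # Run the two extractor scripts shipped with the repo (flags as in app.py).
    os.system(f"python whisper/inference.py -w {wav_path} -p svc_tmp.ppg.npy")
    os.system(f"python hubert/inference.py -w {wav_path} -v svc_tmp.vec.npy")
    ppg = np.repeat(np.load("svc_tmp.ppg.npy"), 2, 0)  # 320-sample hop -> 2 x 160
    vec = np.repeat(np.load("svc_tmp.vec.npy"), 2, 0)
    return torch.FloatTensor(ppg), torch.FloatTensor(vec)

def align(ppg, vec, pit):
    # Crop all streams to the shortest length so frames stay in sync.
    n = min(pit.size(0), vec.size(0), ppg.size(0))
    return ppg[:n], vec[:n], pit[:n]

model.inference is then called chunk by chunk with (sub_ppg, sub_vec, sub_pit, spk, sub_len, sub_har), exactly as in the hunks above.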
configs/base.yaml
CHANGED
@@ -28,11 +28,12 @@ data:
 #############################
 vits:
   ppg_dim: 1024
+  vec_dim: 256
   spk_dim: 256
   gin_channels: 256
   inter_channels: 192
   hidden_channels: 192
-  filter_channels:
+  filter_channels: 640
 #############################
 gen:
   upsample_input: 192
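vec_dim sets the input width of the new hubert branch in TextEncoder, so it has to match the second dimension of the extracted .vec.npy arrays, while filter_channels feeds the attention encoder's feed-forward layers. A quick sanity check one might run, using the svc_tmp.vec.npy file the demo writes (OmegaConf usage as in app.py):

from omegaconf import OmegaConf
import numpy as np

hp = OmegaConf.load("configs/base.yaml")
vec = np.load("svc_tmp.vec.npy")        # any extracted hubert feature file
assert vec.shape[1] == hp.vits.vec_dim  # 256 after this commit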
vits/data_utils.py
CHANGED
@@ -29,13 +29,15 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
         items_new = []
         items_min = int(self.segment_size / self.hop_length * 4)  # 1 S
         items_max = int(self.segment_size / self.hop_length * 16)  # 4 S
-        for wavpath, spec, pitch, ppg, spk in self.items:
+        for wavpath, spec, pitch, vec, ppg, spk in self.items:
             if not os.path.isfile(wavpath):
                 continue
             if not os.path.isfile(spec):
                 continue
             if not os.path.isfile(pitch):
                 continue
+            if not os.path.isfile(vec):
+                continue
             if not os.path.isfile(ppg):
                 continue
             if not os.path.isfile(spk):
@@ -46,7 +48,7 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
                 continue
             if (usel >= items_max):
                 usel = items_max
-            items_new.append([wavpath, spec, pitch, ppg, spk, usel])
+            items_new.append([wavpath, spec, pitch, vec, ppg, spk, usel])
             lengths.append(usel)
         self.items = items_new
         self.lengths = lengths
@@ -70,28 +72,35 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
         wav = item[0]
         spe = item[1]
         pit = item[2]
-        ppg = item[3]
-        spk = item[4]
-        use = item[5]
+        vec = item[3]
+        ppg = item[4]
+        spk = item[5]
+        use = item[6]

         wav = self.read_wav(wav)
         spe = torch.load(spe)

         pit = np.load(pit)
+        vec = np.load(vec)
+        vec = np.repeat(vec, 2, 0) # 320 PPG -> 160 * 2
         ppg = np.load(ppg)
         ppg = np.repeat(ppg, 2, 0) # 320 PPG -> 160 * 2
         spk = np.load(spk)

         pit = torch.FloatTensor(pit)
+        vec = torch.FloatTensor(vec)
         ppg = torch.FloatTensor(ppg)
         spk = torch.FloatTensor(spk)

         len_pit = pit.size()[0]
+        len_vec = vec.size()[0] - 2 # for safe
         len_ppg = ppg.size()[0] - 2 # for safe
-        len_min = min(len_pit, len_ppg)
+        len_min = min(len_pit, len_vec)
+        len_min = min(len_min, len_ppg)
         len_wav = len_min * self.hop_length

         pit = pit[:len_min]
+        vec = vec[:len_min, :]
         ppg = ppg[:len_min, :]
         spe = spe[:, :len_min]
         wav = wav[:, :len_wav]
@@ -101,6 +110,7 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
         frame_end = frame_start + use

         pit = pit[frame_start:frame_end]
+        vec = vec[frame_start:frame_end, :]
         ppg = ppg[frame_start:frame_end, :]
         spe = spe[:, frame_start:frame_end]

@@ -112,7 +122,7 @@ class TextAudioSpeakerSet(torch.utils.data.Dataset):
         # print(ppg.shape)
         # print(pit.shape)
         # print(spk.shape)
-        return spe, wav, ppg, pit, spk
+        return spe, wav, ppg, vec, pit, spk


 class TextAudioSpeakerCollate:
@@ -143,10 +153,13 @@ class TextAudioSpeakerCollate:
         ppg_lengths = torch.FloatTensor(len(batch))
         ppg_padded = torch.FloatTensor(
             len(batch), max_ppg_len, batch[0][2].size(1))
+        vec_padded = torch.FloatTensor(
+            len(batch), max_ppg_len, batch[0][3].size(1))
         pit_padded = torch.FloatTensor(len(batch), max_ppg_len)
         ppg_padded.zero_()
+        vec_padded.zero_()
         pit_padded.zero_()
-        spk = torch.FloatTensor(len(batch), batch[0][4].size(0))
+        spk = torch.FloatTensor(len(batch), batch[0][5].size(0))

         for i in range(len(ids_sorted_decreasing)):
             row = batch[ids_sorted_decreasing[i]]
@@ -163,10 +176,13 @@ class TextAudioSpeakerCollate:
             ppg_padded[i, : ppg.size(0), :] = ppg
             ppg_lengths[i] = ppg.size(0)

-            pit = row[3]
+            vec = row[3]
+            vec_padded[i, : vec.size(0), :] = vec
+
+            pit = row[4]
             pit_padded[i, : pit.size(0)] = pit

-            spk[i] = row[4]
+            spk[i] = row[5]
             # print(ppg_padded.shape)
             # print(ppg_lengths.shape)
             # print(pit_padded.shape)
@@ -178,6 +194,7 @@ class TextAudioSpeakerCollate:
         return (
             ppg_padded,
             ppg_lengths,
+            vec_padded,
             pit_padded,
             spk,
             spe_padded,
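Two interface changes ripple out of this file: each dataset row now carries seven fields (wavpath, spec, pitch, vec, ppg, spk, usel), and the collate output gains vec_padded right after ppg_lengths. Any training loop that unpacks the batch positionally needs one extra slot; a sketch of the adjusted unpacking, noting that only the first six positions are visible in this diff and the tail is assumed to keep its previous order:

for batch in loader:
    # order per TextAudioSpeakerCollate after this commit
    ppg, ppg_l, vec, pit, spk, spe = batch[:6]
    spe_l, wav, wav_l = batch[6:]   # assumed tail, not shown in the hunk above
    # SynthesizerTrn.forward now takes vec between ppg and pit (see vits/models.py)
    outputs = model(ppg, vec, pit, spe, spk, ppg_l, spe_l)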
vits/models.py
CHANGED
@@ -14,6 +14,7 @@ from vits.modules_grl import SpeakerClassifier
 class TextEncoder(nn.Module):
     def __init__(self,
                  in_channels,
+                 vec_channels,
                  out_channels,
                  hidden_channels,
                  filter_channels,
@@ -24,6 +25,7 @@ class TextEncoder(nn.Module):
         super().__init__()
         self.out_channels = out_channels
         self.pre = nn.Conv1d(in_channels, hidden_channels, kernel_size=5, padding=2)
+        self.hub = nn.Conv1d(vec_channels, hidden_channels, kernel_size=5, padding=2)
         self.pit = nn.Embedding(256, hidden_channels)
         self.enc = attentions.Encoder(
             hidden_channels,
@@ -34,13 +36,15 @@ class TextEncoder(nn.Module):
             p_dropout)
         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)

-    def forward(self, x, x_lengths, f0):
+    def forward(self, x, x_lengths, v, f0):
         x = torch.transpose(x, 1, -1)  # [b, h, t]
         x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
             x.dtype
         )
         x = self.pre(x) * x_mask
-        x = x + self.pit(f0).transpose(1, 2)
+        v = torch.transpose(v, 1, -1)  # [b, h, t]
+        v = self.hub(v) * x_mask
+        x = x + v + self.pit(f0).transpose(1, 2)
         x = self.enc(x * x_mask, x_mask)
         stats = self.proj(x) * x_mask
         m, logs = torch.split(stats, self.out_channels, dim=1)
@@ -144,6 +148,7 @@ class SynthesizerTrn(nn.Module):
         self.emb_g = nn.Linear(hp.vits.spk_dim, hp.vits.gin_channels)
         self.enc_p = TextEncoder(
             hp.vits.ppg_dim,
+            hp.vits.vec_dim,
             hp.vits.inter_channels,
             hp.vits.hidden_channels,
             hp.vits.filter_channels,
@@ -175,11 +180,12 @@ class SynthesizerTrn(nn.Module):
         )
         self.dec = Generator(hp=hp)

-    def forward(self, ppg, pit, spec, spk, ppg_l, spec_l):
-        ppg = ppg + torch.randn_like(ppg)  # Perturbation
+    def forward(self, ppg, vec, pit, spec, spk, ppg_l, spec_l):
+        ppg = ppg + torch.randn_like(ppg) * 1  # Perturbation
+        vec = vec + torch.randn_like(vec) * 2  # Perturbation
         g = self.emb_g(F.normalize(spk)).unsqueeze(-1)
         z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
-            ppg, ppg_l, f0=f0_to_coarse(pit))
+            ppg, ppg_l, vec, f0=f0_to_coarse(pit))
         z_q, m_q, logs_q, spec_mask = self.enc_q(spec, spec_l, g=g)

         z_slice, pit_slice, ids_slice = commons.rand_slice_segments_with_pitch(
@@ -193,10 +199,10 @@ class SynthesizerTrn(nn.Module):
         spk_preds = self.speaker_classifier(x)
         return audio, ids_slice, spec_mask, (z_f, z_r, z_p, m_p, logs_p, z_q, m_q, logs_q, logdet_f, logdet_r), spk_preds

-    def infer(self, ppg, pit, spk, ppg_l):
+    def infer(self, ppg, vec, pit, spk, ppg_l):
         ppg = ppg + torch.randn_like(ppg) * 0.0001  # Perturbation
         z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
-            ppg, ppg_l, f0=f0_to_coarse(pit))
+            ppg, ppg_l, vec, f0=f0_to_coarse(pit))
         z, _ = self.flow(z_p, ppg_mask, g=spk, reverse=True)
         o = self.dec(spk, z * ppg_mask, f0=pit)
         return o
@@ -213,6 +219,7 @@ class SynthesizerInfer(nn.Module):
         self.segment_size = segment_size
         self.enc_p = TextEncoder(
             hp.vits.ppg_dim,
+            hp.vits.vec_dim,
             hp.vits.inter_channels,
             hp.vits.hidden_channels,
             hp.vits.filter_channels,
@@ -241,9 +248,9 @@ class SynthesizerInfer(nn.Module):
     def source2wav(self, source):
         return self.dec.source2wav(source)

-    def inference(self, ppg, pit, spk, ppg_l, source):
+    def inference(self, ppg, vec, pit, spk, ppg_l, source):
         z_p, m_p, logs_p, ppg_mask, x = self.enc_p(
-            ppg, ppg_l, f0=f0_to_coarse(pit))
+            ppg, ppg_l, vec, f0=f0_to_coarse(pit))
         z, _ = self.flow(z_p, ppg_mask, g=spk, reverse=True)
         o = self.dec.inference(spk, z * ppg_mask, source)
         return o
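The core of the "mix" encoder is the fusion in TextEncoder.forward: whisper PPG and hubert vectors are projected to the same hidden width by separate kernel-size-5 convolutions and summed together with the coarse-F0 embedding before the attention stack. A standalone sketch of just that fusion, with masking omitted and dimensions taken from configs/base.yaml:

import torch
import torch.nn as nn

hidden = 192                                              # hp.vits.hidden_channels
pre = nn.Conv1d(1024, hidden, kernel_size=5, padding=2)   # whisper PPG branch (ppg_dim)
hub = nn.Conv1d(256, hidden, kernel_size=5, padding=2)    # hubert branch (vec_dim)
pit = nn.Embedding(256, hidden)                           # coarse-F0 buckets

ppg = torch.randn(1, 100, 1024)                # [batch, frames, ppg_dim]
vec = torch.randn(1, 100, 256)                 # [batch, frames, vec_dim]
f0 = torch.randint(0, 256, (1, 100))           # coarse pitch ids

x = pre(ppg.transpose(1, 2)) + hub(vec.transpose(1, 2)) + pit(f0).transpose(1, 2)
print(x.shape)                                 # torch.Size([1, 192, 100]), fed to attentions.Encoder

The noise perturbations in SynthesizerTrn.forward (scale 1 on ppg, scale 2 on vec) apply only on the training path; infer and inference leave vec unperturbed.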
vits_pretrain/{sovits5.0_bigvgan.pth → sovits5.0_bigvgan_mix.pth}
RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:6b941958b20d2eb91abdb6ff9d1344e056ec2c78116e4c3a1e2b23b022d32db1
+size 79352005
whisper/inference.py
CHANGED
@@ -1,4 +1,5 @@
-import os
+import sys,os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 import numpy as np
 import argparse
 import torch
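The two-line preamble added here (and at the top of app.py) puts the repository root on sys.path, so that `python whisper/inference.py ...`, as launched through os.system from the demo, can resolve the repo's local packages without being installed or invoked as a module. The same pattern works for any script sitting one directory below the repo root:

import sys, os
# parent of the directory holding this script == repository root
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# repo-local imports (e.g. the vendored whisper package) now resolve even when
# the script is launched as a plain file from another working directory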