Jarod Castillo committed · commit 9d54d5c · 1 parent: bb70eb3
Commit message: comments added

Files changed:
- app.py +21 -4
- config.py +38 -0
- lib/infer_pack/models.py +16 -0
- vc_infer_pipeline.py +55 -22

app.py
CHANGED
@@ -30,9 +30,12 @@ from config import Config
 from vocal_isolation.vocal_isolation import isolate_vocals_kim_vocals
 
 config = Config()
-
+# This can be any name, just a way to output logs during runtime
+logging.getLogger("smotto").setLevel(logging.WARNING)
+# Checking if it's a huggingface space that's running this file
 spaces = os.getenv("SYSTEM") == "spaces"
 force_support = None
+# If we're using CPU, disable force_support
 if config.unsupported is False:
     if config.device == "mps" or config.device == "cpu":
         force_support = False
@@ -134,6 +137,7 @@ def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
     return vc_fn
 
 def load_model():
+    logs = []
     categories = []
     if os.path.isfile("weights/folder_info.json"):
         with open("weights/folder_info.json", "r", encoding="utf-8") as f:
@@ -153,9 +157,10 @@ def load_model():
                 model_title = info['title']
                 model_name = info['model_path']
                 model_author = info.get("author", None)
-                model_cover = f"weights/{category_folder}/{character_name}/{info['cover']}"
+                model_cover = f"weights/{category_folder}/{character_name}/{info['cover']}"  # Just a photo of the model
                 model_index = f"weights/{category_folder}/{character_name}/{info['feature_retrieval_library']}"
-                cpt = torch.load(f"weights/{category_folder}/{character_name}/{model_name}", map_location="cpu")
+                cpt = torch.load(f"weights/{category_folder}/{character_name}/{model_name}", map_location="cpu")
+
                 tgt_sr = cpt["config"][-1]
                 cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
                 if_f0 = cpt.get("f0", 1)
@@ -172,15 +177,27 @@ def load_model():
                 else:
                     net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
                     model_version = "V2"
+
+                # Deleting the posterior encoder, assuming that it's not needed for inference.
                 del net_g.enc_q
-
+
+                logs.append(f"Net Generator after posterior encoder deletion: {net_g}\n{info}")
+
+                # Loading weights from the checkpoint into the neural network. Strict means we can load with missing dictionary keys
+                net_g.load_state_dict(cpt["weight"], strict=False)
+
+                # Prepare the model for inference
                 net_g.eval().to(config.device)
+
                 if config.is_half:
                     net_g = net_g.half()
                 else:
                     net_g = net_g.float()
+
                 vc = VC(tgt_sr, config)
+
                 print(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
+                # Create the voice conversion method
                 models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, model_index)))
             categories.append([category_title, category_folder, description, models])
         else:
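Note on the `strict=False` switch introduced above: the commit deletes the posterior encoder (`del net_g.enc_q`) before loading weights, and `load_state_dict(..., strict=False)` tolerates key mismatches instead of raising. Below is a minimal, self-contained sketch of that PyTorch behaviour; `TinyNet` and the fake checkpoint are illustrative only and assume the checkpoint may still carry keys the trimmed module no longer has.

# Sketch only: demonstrates strict=False key handling, not the repo's model.
import torch
import torch.nn as nn

class TinyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.dec = nn.Linear(4, 4)  # kept for inference; note there is no enc_q here

net = TinyNet()
# Pretend the checkpoint still contains weights for a deleted submodule.
ckpt = {
    "dec.weight": torch.zeros(4, 4),
    "dec.bias": torch.zeros(4),
    "enc_q.weight": torch.zeros(4, 4),
}

# strict=True would raise a RuntimeError about unexpected keys;
# strict=False loads what matches and reports the rest.
result = net.load_state_dict(ckpt, strict=False)
print(result.missing_keys)     # []
print(result.unexpected_keys)  # ['enc_q.weight']

Logging `result` (for example via the `logs` list this commit adds) is a cheap way to confirm nothing essential was skipped.
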
config.py
CHANGED
@@ -4,7 +4,18 @@ import torch
 from multiprocessing import cpu_count
 
 class Config:
+    """
+    The code focuses on adapting the configuration based on available
+    hardware resources and specified command-line arguments,
+    aiming to optimize the performance and capabilities of the voice conversion process.
+    """
     def __init__(self):
+        """
+        Calls the arg_parse() and device_config() methods to set up configuration based on command-line arguments
+        and available hardware.
+
+        Returns: None
+        """
         self.device = "cuda:0"
         self.is_half = True
         self.n_cpu = 0
@@ -19,6 +30,12 @@ class Config:
 
     @staticmethod
     def arg_parse() -> tuple:
+        """
+        Uses the argparse library to parse command-line arguments.
+        Three boolean arguments are defined: --colab, --api, and --unsupported.
+
+        Returns: a tuple indicating whether each argument is specified or not.
+        """
         parser = argparse.ArgumentParser()
         parser.add_argument("--colab", action="store_true", help="Launch in colab")
         parser.add_argument("--api", action="store_true", help="Launch with api")
@@ -35,6 +52,11 @@ class Config:
     # check `getattr` and try it for compatibility
     @staticmethod
     def has_mps() -> bool:
+        """
+        Determines if Multi-Process Service (MPS) is available in the current PyTorch backend.
+        If MPS is available, it checks whether it can be used by trying to move a tensor to the "mps" device.
+        Returns a boolean indicating MPS support.
+        """
         if not torch.backends.mps.is_available():
             return False
         try:
@@ -44,6 +66,22 @@ class Config:
             return False
 
     def device_config(self) -> tuple:
+        """
+        Checks if a CUDA-compatible GPU is available.
+
+        If a compatible GPU is found:
+            Determines the GPU's name and memory capacity.
+            Adjusts the is_half parameter based on the GPU's characteristics.
+
+        If no compatible GPU is found and MPS is available, configures the device to use MPS.
+
+        If no compatible GPU and MPS support, configures the device to use CPU.
+
+        Determines the number of available CPU cores (n_cpu).
+
+        Based on the is_half value and GPU memory capacity, configures several variables related to voice conversion,
+        such as x_pad, x_query, x_center, and x_max.
+        """
         if torch.cuda.is_available():
             i_device = int(self.device.split(":")[-1])
             self.gpu_name = torch.cuda.get_device_name(i_device)
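One clarification on `has_mps()`: in PyTorch, `torch.backends.mps` refers to the Apple-silicon Metal Performance Shaders backend rather than NVIDIA's Multi-Process Service. A minimal sketch of the same availability-then-probe pattern, written as a standalone function and assuming a PyTorch build recent enough to expose `torch.backends.mps`:

# Sketch only: standalone version of the probe pattern used by Config.has_mps().
import torch

def has_mps() -> bool:
    # Older PyTorch builds do not expose torch.backends.mps at all.
    if not getattr(torch.backends, "mps", None) or not torch.backends.mps.is_available():
        return False
    try:
        # is_available() alone is not enough on some setups, so actually
        # allocate a tensor on the device before reporting success.
        torch.zeros(1).to(torch.device("mps"))
        return True
    except Exception:
        return False
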
lib/infer_pack/models.py
CHANGED
@@ -158,6 +158,12 @@ class ResidualCouplingBlock(nn.Module):
 
 
 class PosteriorEncoder(nn.Module):
+    """
+    Responsible for transforming input data (possibly spectrogram representations of audio)
+    into a latent representation (or "posterior") that can be used for generating audio samples
+    or for other purposes within a generative model.
+
+    """
     def __init__(
         self,
         in_channels,
@@ -168,6 +174,16 @@ class PosteriorEncoder(nn.Module):
         n_layers,
         gin_channels=0,
     ):
+        """
+        Initializes the parameters and layers of the posterior encoder.
+        in_channels: Number of input channels (e.g., frequency bins in a spectrogram).
+        out_channels: Number of output channels in the latent representation.
+        hidden_channels: Number of hidden channels in intermediate layers.
+        kernel_size: Size of the convolutional kernel.
+        dilation_rate: Dilation rate for dilated convolutions.
+        n_layers: Number of layers in the posterior encoder.
+        gin_channels: Optional number of global conditioning channels (if applicable).
+        """
         super().__init__()
         self.in_channels = in_channels
         self.out_channels = out_channels
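To make the newly documented constructor arguments concrete, here is a hypothetical instantiation. The numeric values are typical VITS-style settings chosen purely for illustration (they are not read from this repo's configs), and the import assumes the package resolves from the repo root:

# Sketch only: illustrative values, not the repo's actual configuration.
from lib.infer_pack.models import PosteriorEncoder

enc_q = PosteriorEncoder(
    in_channels=513,      # e.g. frequency bins of a linear spectrogram
    out_channels=192,     # size of the latent ("posterior") representation
    hidden_channels=192,  # width of the intermediate layers
    kernel_size=5,
    dilation_rate=1,
    n_layers=16,
    gin_channels=256,     # speaker-conditioning channels, optional
)

This is the same `enc_q` that app.py deletes before loading weights, matching the commit's own comment that the posterior encoder is assumed unnecessary for inference.
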
vc_infer_pipeline.py
CHANGED
@@ -9,7 +9,7 @@ from functools import lru_cache
 now_dir = os.getcwd()
 sys.path.append(now_dir)
 
-bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
+bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)  # Design for the audio filter
 
 input_audio_path2wav = {}
 
@@ -51,6 +51,9 @@ def change_rms(data1, sr1, data2, sr2, rate):  # 1 is the input audio, 2 is the output
 
 
 class VC(object):
+    """
+    Voice Conversion system.
+    """
     def __init__(self, tgt_sr, config):
         self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
             config.x_pad,
@@ -59,14 +62,18 @@ class VC(object):
             config.x_max,
             config.is_half,
         )
-
-
-
+        """
+        The class has several parameters that get initialized such as `x_pad`, `x_query`, and so on from the configuration object provided.
+        These parameters appear to be related to audio processing, specifying things like sample rate, window size, padding amounts, etc.
+        """
+        self.sr = 16000  # Hubert input sample rate
+        self.window = 160  # Number of points per frame
+        self.t_pad = self.sr * self.x_pad  # Padding time before and after each segment
         self.t_pad_tgt = tgt_sr * self.x_pad
         self.t_pad2 = self.t_pad * 2
-        self.t_query = self.sr * self.x_query  #
-        self.t_center = self.sr * self.x_center  #
-        self.t_max = self.sr * self.x_max  #
+        self.t_query = self.sr * self.x_query  # Query time before and after each query point
+        self.t_center = self.sr * self.x_center  # Query point position
+        self.t_max = self.sr * self.x_max  # Duration threshold for non-query time
         self.device = config.device
 
     def get_f0(
@@ -79,6 +86,11 @@ class VC(object):
         filter_radius,
         inp_f0=None,
     ):
+        """
+        Extracts fundamental frequency ('F0' or pitch) from a given audio signal
+        Multiple methods are available, such as 'pm', 'harvest', 'crepe', 'rmvpe'
+        Libraries 'parselmouth', 'torchcrepe' compute pitch, and 'cache_harvest_f0' is being used to compute pitch
+        """
         global input_audio_path2wav
         time_step = self.window / self.sr * 1000
         f0_min = 50
@@ -168,7 +180,7 @@ class VC(object):
         sid,
         audio0,
         pitch,
-        pitchf,
+        pitchf,  # ???
         times,
         index,
         big_npy,
@@ -176,6 +188,13 @@ class VC(object):
         version,
         protect,
     ):  # ,file_index,file_big_npy
+        """
+        The holy grail, the main conversion function.
+        Takes an numpy audio signal, processes it through a model, spits out a numpy audio signal.
+        Modifies the pitch (or 'F0') of the audio signal, given the 'pitch' and 'pitchf' parameters.
+        Neural network generator (net_g) infers the voice.
+        'index' and 'big_npy' is used to retrieve similar audio features from a pre-computed database for better conversion quality.
+        """
         feats = torch.from_numpy(audio0)
         if self.is_half:
             feats = feats.half()
@@ -261,6 +280,7 @@ class VC(object):
         t2 = ttime()
         times[0] += t1 - t0
         times[2] += t2 - t1
+
         return audio1
 
     def pipeline(
@@ -285,6 +305,14 @@ class VC(object):
         protect,
         f0_file=None,
    ):
+        """
+        This is a pipeline function that strings together multiple operations for voice conversion.
+        The function does some preprocessing on the input audio(e.g. filtering)
+        The function then segments the audio into pieces and processes each segment through the voice conversion ('vc') method
+        The converted segments are then concatenated to produce the final converted audio
+        """
+
+        # Phase 1: Load index file
         if (
             file_index != ""
             # and file_big_npy != ""
@@ -293,23 +321,25 @@ class VC(object):
             and index_rate != 0
         ):
             try:
-                index = faiss.read_index(file_index)
+                index = faiss.read_index(file_index)  # Read from the vector store
                 # big_npy = np.load(file_big_npy)
-                big_npy = index.reconstruct_n(0, index.ntotal)
+                big_npy = index.reconstruct_n(0, index.ntotal)  # Convert index into a big numpy array
            except:
                 traceback.print_exc()
                 index = big_npy = None
         else:
-            index = big_npy = None
-
-
-
+            index = big_npy = None  # If we don't have the index file, it's ok we won't use it.
+
+        # Phase 2: Filter audio signal
+        audio = signal.filtfilt(bh, ah, audio)  # Avoid phase distortion
+        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")  # Padding to ensure we calculate the beginning and end accurately
+        optimal_time_shifts = []
         if audio_pad.shape[0] > self.t_max:
-            audio_sum = np.zeros_like(audio)
+            audio_sum = np.zeros_like(audio)  # Create numpy array filled with zeros and same shape as audio
             for i in range(self.window):
                 audio_sum += audio_pad[i : i - self.window]
             for t in range(self.t_center, audio.shape[0], self.t_center):
-
+                optimal_time_shifts.append(
                     t
                     - self.t_query
                     + np.where(
@@ -317,12 +347,13 @@ class VC(object):
                         == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
                     )[0][0]
                 )
+
         s = 0
         audio_opt = []
         t = None
         t1 = ttime()
-        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
-
+        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")  # Reflection of the signal's edges on both ends
+        number_of_analysis_frames = audio_pad.shape[0] // self.window
         inp_f0 = None
         if hasattr(f0_file, "name") == True:
             try:
@@ -337,24 +368,26 @@ class VC(object):
         sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
         pitch, pitchf = None, None
         if if_f0 == 1:
+            # Calls the `get_f0` method to calculate pitch values based on audio features.
+            # These calculated pitch values are used for voice conversion.
             pitch, pitchf = self.get_f0(
                 input_audio_path,
                 audio_pad,
-
+                number_of_analysis_frames,
                 f0_up_key,
                 f0_method,
                 filter_radius,
                 inp_f0,
             )
-            pitch = pitch[:
-            pitchf = pitchf[:
+            pitch = pitch[:number_of_analysis_frames]
+            pitchf = pitchf[:number_of_analysis_frames]
             if self.device == "mps":
                 pitchf = pitchf.astype(np.float32)
             pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
             pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
         t2 = ttime()
         times[1] += t2 - t1
-        for t in
+        for t in optimal_time_shifts:
             t = t // self.window * self.window
             if if_f0 == 1:
                 audio_opt.append(
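For the filtering step ("Phase 2" above): `butter(N=5, Wn=48, btype="high", fs=16000)` designs a 5th-order Butterworth high-pass at 48 Hz for 16 kHz audio, and `filtfilt` applies it forward and backward so no phase distortion (group delay) is introduced. A small self-contained sketch with a synthetic one-second signal, separate from the pipeline code:

# Sketch only: synthetic signal, not the pipeline's input audio.
import numpy as np
from scipy import signal

sr = 16000
t = np.arange(sr) / sr
# 10 Hz rumble (below the 48 Hz cutoff) plus a 440 Hz tone (well above it).
audio = 0.5 * np.sin(2 * np.pi * 10 * t) + 0.1 * np.sin(2 * np.pi * 440 * t)

bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=sr)
filtered = signal.filtfilt(bh, ah, audio)  # zero-phase: forward + backward pass

# With a 1-second signal, FFT bin k corresponds to k Hz.
before = np.abs(np.fft.rfft(audio))
after = np.abs(np.fft.rfft(filtered))
print(before[10], after[10])    # 10 Hz rumble: large before, near zero after
print(before[440], after[440])  # 440 Hz tone: essentially unchanged

The zero-phase property matters here because the pipeline later aligns and concatenates segments; a causal filter would shift the waveform in time, while filtfilt leaves segment boundaries where the rest of the code expects them.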