Jarod Castillo committed · commit 9d54d5c · 1 parent: bb70eb3
Commit message: comments added

Files changed:
- app.py +21 -4
- config.py +38 -0
- lib/infer_pack/models.py +16 -0
- vc_infer_pipeline.py +55 -22

app.py
CHANGED
@@ -30,9 +30,12 @@ from config import Config
 from vocal_isolation.vocal_isolation import isolate_vocals_kim_vocals
 
 config = Config()
-
+# This can be any name, just a way to output logs during runtime
+logging.getLogger("smotto").setLevel(logging.WARNING)
+# Checking if it's a huggingface space that's running this file
 spaces = os.getenv("SYSTEM") == "spaces"
 force_support = None
+# If we're using CPU, disable force_support
 if config.unsupported is False:
     if config.device == "mps" or config.device == "cpu":
         force_support = False
@@ -134,6 +137,7 @@ def create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, file_index):
     return vc_fn
 
 def load_model():
+    logs = []
     categories = []
     if os.path.isfile("weights/folder_info.json"):
         with open("weights/folder_info.json", "r", encoding="utf-8") as f:
@@ -153,9 +157,10 @@ def load_model():
                 model_title = info['title']
                 model_name = info['model_path']
                 model_author = info.get("author", None)
-                model_cover = f"weights/{category_folder}/{character_name}/{info['cover']}"
+                model_cover = f"weights/{category_folder}/{character_name}/{info['cover']}"  # Just a photo of the model
                 model_index = f"weights/{category_folder}/{character_name}/{info['feature_retrieval_library']}"
-                cpt = torch.load(f"weights/{category_folder}/{character_name}/{model_name}", map_location="cpu")
+                cpt = torch.load(f"weights/{category_folder}/{character_name}/{model_name}", map_location="cpu")
+
                 tgt_sr = cpt["config"][-1]
                 cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]  # n_spk
                 if_f0 = cpt.get("f0", 1)
@@ -172,15 +177,27 @@ def load_model():
                 else:
                     net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
                     model_version = "V2"
+
+                # Deleting the posterior encoder, assuming that it's not needed for inference.
                 del net_g.enc_q
-
+
+                logs.append(f"Net Generator after posterior encoder deletion: {net_g}\n{info}")
+
+                # Loading weights from the checkpoint into the neural network. Strict means we can load with missing dictionary keys
+                net_g.load_state_dict(cpt["weight"], strict=False)
+
+                # Prepare the model for inference
                 net_g.eval().to(config.device)
+
                 if config.is_half:
                     net_g = net_g.half()
                 else:
                     net_g = net_g.float()
+
                 vc = VC(tgt_sr, config)
+
                 print(f"Model loaded: {character_name} / {info['feature_retrieval_library']} | ({model_version})")
+                # Create the voice conversion method
                 models.append((character_name, model_title, model_author, model_cover, model_version, create_vc_fn(model_name, tgt_sr, net_g, vc, if_f0, version, model_index)))
             categories.append([category_title, category_folder, description, models])
         else:
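Note on the `strict=False` switch introduced above: the commit deletes the posterior encoder (`del net_g.enc_q`) before loading weights, and `load_state_dict(..., strict=False)` tolerates key mismatches instead of raising. Below is a minimal, self-contained sketch of that PyTorch behaviour; `TinyNet` and the fake checkpoint are illustrative only and assume the checkpoint may still carry keys the trimmed module no longer has.

# Sketch only: demonstrates strict=False key handling, not the repo's model.
import torch
import torch.nn as nn

class TinyNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.dec = nn.Linear(4, 4)  # kept for inference; note there is no enc_q here

net = TinyNet()
# Pretend the checkpoint still contains weights for a deleted submodule.
ckpt = {
    "dec.weight": torch.zeros(4, 4),
    "dec.bias": torch.zeros(4),
    "enc_q.weight": torch.zeros(4, 4),
}

# strict=True would raise a RuntimeError about unexpected keys;
# strict=False loads what matches and reports the rest.
result = net.load_state_dict(ckpt, strict=False)
print(result.missing_keys)     # []
print(result.unexpected_keys)  # ['enc_q.weight']

Logging `result` (for example via the `logs` list this commit adds) is a cheap way to confirm nothing essential was skipped.
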
config.py
CHANGED
@@ -4,7 +4,18 @@ import torch
 from multiprocessing import cpu_count
 
 class Config:
+    """
+    The code focuses on adapting the configuration based on available
+    hardware resources and specified command-line arguments,
+    aiming to optimize the performance and capabilities of the voice conversion process.
+    """
     def __init__(self):
+        """
+        Calls the arg_parse() and device_config() methods to set up configuration based on command-line arguments
+        and available hardware.
+
+        Returns: None
+        """
         self.device = "cuda:0"
         self.is_half = True
         self.n_cpu = 0
@@ -19,6 +30,12 @@ class Config:
 
     @staticmethod
     def arg_parse() -> tuple:
+        """
+        Uses the argparse library to parse command-line arguments.
+        Three boolean arguments are defined: --colab, --api, and --unsupported.
+
+        Returns: a tuple indicating whether each argument is specified or not.
+        """
         parser = argparse.ArgumentParser()
         parser.add_argument("--colab", action="store_true", help="Launch in colab")
         parser.add_argument("--api", action="store_true", help="Launch with api")
@@ -35,6 +52,11 @@ class Config:
     # check `getattr` and try it for compatibility
     @staticmethod
     def has_mps() -> bool:
+        """
+        Determines if Multi-Process Service (MPS) is available in the current PyTorch backend.
+        If MPS is available, it checks whether it can be used by trying to move a tensor to the "mps" device.
+        Returns a boolean indicating MPS support.
+        """
         if not torch.backends.mps.is_available():
             return False
         try:
@@ -44,6 +66,22 @@ class Config:
             return False
 
     def device_config(self) -> tuple:
+        """
+        Checks if a CUDA-compatible GPU is available.
+
+        If a compatible GPU is found:
+            Determines the GPU's name and memory capacity.
+            Adjusts the is_half parameter based on the GPU's characteristics.
+
+        If no compatible GPU is found and MPS is available, configures the device to use MPS.
+
+        If no compatible GPU and MPS support, configures the device to use CPU.
+
+        Determines the number of available CPU cores (n_cpu).
+
+        Based on the is_half value and GPU memory capacity, configures several variables related to voice conversion,
+        such as x_pad, x_query, x_center, and x_max.
+        """
         if torch.cuda.is_available():
             i_device = int(self.device.split(":")[-1])
             self.gpu_name = torch.cuda.get_device_name(i_device)
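One clarification on `has_mps()`: in PyTorch, `torch.backends.mps` refers to the Apple-silicon Metal Performance Shaders backend rather than NVIDIA's Multi-Process Service. A minimal sketch of the same availability-then-probe pattern, written as a standalone function and assuming a PyTorch build recent enough to expose `torch.backends.mps`:

# Sketch only: standalone version of the probe pattern used by Config.has_mps().
import torch

def has_mps() -> bool:
    # Older PyTorch builds do not expose torch.backends.mps at all.
    if not getattr(torch.backends, "mps", None) or not torch.backends.mps.is_available():
        return False
    try:
        # is_available() alone is not enough on some setups, so actually
        # allocate a tensor on the device before reporting success.
        torch.zeros(1).to(torch.device("mps"))
        return True
    except Exception:
        return False
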
lib/infer_pack/models.py
CHANGED
@@ -158,6 +158,12 @@ class ResidualCouplingBlock(nn.Module):
 
 
 class PosteriorEncoder(nn.Module):
+    """
+    Responsible for transforming input data (possibly spectrogram representations of audio)
+    into a latent representation (or "posterior") that can be used for generating audio samples
+    or for other purposes within a generative model.
+
+    """
     def __init__(
         self,
         in_channels,
@@ -168,6 +174,16 @@ class PosteriorEncoder(nn.Module):
         n_layers,
         gin_channels=0,
     ):
+        """
+        Initializes the parameters and layers of the posterior encoder.
+        in_channels: Number of input channels (e.g., frequency bins in a spectrogram).
+        out_channels: Number of output channels in the latent representation.
+        hidden_channels: Number of hidden channels in intermediate layers.
+        kernel_size: Size of the convolutional kernel.
+        dilation_rate: Dilation rate for dilated convolutions.
+        n_layers: Number of layers in the posterior encoder.
+        gin_channels: Optional number of global conditioning channels (if applicable).
+        """
         super().__init__()
         self.in_channels = in_channels
         self.out_channels = out_channels
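To make the newly documented constructor arguments concrete, here is a hypothetical instantiation. The numeric values are typical VITS-style settings chosen purely for illustration (they are not read from this repo's configs), and the import assumes the package resolves from the repo root:

# Sketch only: illustrative values, not the repo's actual configuration.
from lib.infer_pack.models import PosteriorEncoder

enc_q = PosteriorEncoder(
    in_channels=513,      # e.g. frequency bins of a linear spectrogram
    out_channels=192,     # size of the latent ("posterior") representation
    hidden_channels=192,  # width of the intermediate layers
    kernel_size=5,
    dilation_rate=1,
    n_layers=16,
    gin_channels=256,     # speaker-conditioning channels, optional
)

This is the same `enc_q` that app.py deletes before loading weights, matching the commit's own comment that the posterior encoder is assumed unnecessary for inference.
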
vc_infer_pipeline.py
CHANGED
@@ -9,7 +9,7 @@ from functools import lru_cache
 now_dir = os.getcwd()
 sys.path.append(now_dir)
 
-bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)
+bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000)  # Design for the audio filter
 
 input_audio_path2wav = {}
 
@@ -51,6 +51,9 @@ def change_rms(data1, sr1, data2, sr2, rate):  # 1 is the input audio, 2 is the output
 
 
 class VC(object):
+    """
+    Voice Conversion system.
+    """
     def __init__(self, tgt_sr, config):
         self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = (
             config.x_pad,
@@ -59,14 +62,18 @@ class VC(object):
             config.x_max,
             config.is_half,
         )
-
-
-
+        """
+        The class has several parameters that get initialized such as `x_pad`, `x_query`, and so on from the configuration object provided.
+        These parameters appear to be related to audio processing, specifying things like sample rate, window size, padding amounts, etc.
+        """
+        self.sr = 16000  # Hubert input sample rate
+        self.window = 160  # Number of points per frame
+        self.t_pad = self.sr * self.x_pad  # Padding time before and after each segment
         self.t_pad_tgt = tgt_sr * self.x_pad
         self.t_pad2 = self.t_pad * 2
-        self.t_query = self.sr * self.x_query  #
-        self.t_center = self.sr * self.x_center  #
-        self.t_max = self.sr * self.x_max  #
+        self.t_query = self.sr * self.x_query  # Query time before and after each query point
+        self.t_center = self.sr * self.x_center  # Query point position
+        self.t_max = self.sr * self.x_max  # Duration threshold for non-query time
         self.device = config.device
 
     def get_f0(
@@ -79,6 +86,11 @@ class VC(object):
         filter_radius,
         inp_f0=None,
     ):
+        """
+        Extracts fundamental frequency ('F0' or pitch) from a given audio signal
+        Multiple methods are available, such as 'pm', 'harvest', 'crepe', 'rmvpe'
+        Libraries 'parselmouth', 'torchcrepe' compute pitch, and 'cache_harvest_f0' is being used to compute pitch
+        """
         global input_audio_path2wav
         time_step = self.window / self.sr * 1000
         f0_min = 50
@@ -168,7 +180,7 @@ class VC(object):
         sid,
         audio0,
         pitch,
-        pitchf,
+        pitchf,  # ???
         times,
         index,
         big_npy,
@@ -176,6 +188,13 @@ class VC(object):
         version,
         protect,
     ):  # ,file_index,file_big_npy
+        """
+        The holy grail, the main conversion function.
+        Takes an numpy audio signal, processes it through a model, spits out a numpy audio signal.
+        Modifies the pitch (or 'F0') of the audio signal, given the 'pitch' and 'pitchf' parameters.
+        Neural network generator (net_g) infers the voice.
+        'index' and 'big_npy' is used to retrieve similar audio features from a pre-computed database for better conversion quality.
+        """
         feats = torch.from_numpy(audio0)
         if self.is_half:
             feats = feats.half()
@@ -261,6 +280,7 @@ class VC(object):
         t2 = ttime()
         times[0] += t1 - t0
         times[2] += t2 - t1
+
         return audio1
 
     def pipeline(
@@ -285,6 +305,14 @@ class VC(object):
         protect,
         f0_file=None,
    ):
+        """
+        This is a pipeline function that strings together multiple operations for voice conversion.
+        The function does some preprocessing on the input audio(e.g. filtering)
+        The function then segments the audio into pieces and processes each segment through the voice conversion ('vc') method
+        The converted segments are then concatenated to produce the final converted audio
+        """
+
+        # Phase 1: Load index file
         if (
             file_index != ""
             # and file_big_npy != ""
@@ -293,23 +321,25 @@ class VC(object):
             and index_rate != 0
         ):
             try:
-                index = faiss.read_index(file_index)
+                index = faiss.read_index(file_index)  # Read from the vector store
                 # big_npy = np.load(file_big_npy)
-                big_npy = index.reconstruct_n(0, index.ntotal)
+                big_npy = index.reconstruct_n(0, index.ntotal)  # Convert index into a big numpy array
            except:
                 traceback.print_exc()
                 index = big_npy = None
         else:
-            index = big_npy = None
-
-
-
+            index = big_npy = None  # If we don't have the index file, it's ok we won't use it.
+
+        # Phase 2: Filter audio signal
+        audio = signal.filtfilt(bh, ah, audio)  # Avoid phase distortion
+        audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect")  # Padding to ensure we calculate the beginning and end accurately
+        optimal_time_shifts = []
         if audio_pad.shape[0] > self.t_max:
-            audio_sum = np.zeros_like(audio)
+            audio_sum = np.zeros_like(audio)  # Create numpy array filled with zeros and same shape as audio
             for i in range(self.window):
                 audio_sum += audio_pad[i : i - self.window]
             for t in range(self.t_center, audio.shape[0], self.t_center):
-
+                optimal_time_shifts.append(
                     t
                     - self.t_query
                     + np.where(
@@ -317,12 +347,13 @@ class VC(object):
                         == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min()
                     )[0][0]
                 )
+
         s = 0
         audio_opt = []
         t = None
         t1 = ttime()
-        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")
-
+        audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect")  # Reflection of the signal's edges on both ends
+        number_of_analysis_frames = audio_pad.shape[0] // self.window
         inp_f0 = None
         if hasattr(f0_file, "name") == True:
             try:
@@ -337,24 +368,26 @@ class VC(object):
         sid = torch.tensor(sid, device=self.device).unsqueeze(0).long()
         pitch, pitchf = None, None
         if if_f0 == 1:
+            # Calls the `get_f0` method to calculate pitch values based on audio features.
+            # These calculated pitch values are used for voice conversion.
             pitch, pitchf = self.get_f0(
                 input_audio_path,
                 audio_pad,
-
+                number_of_analysis_frames,
                 f0_up_key,
                 f0_method,
                 filter_radius,
                 inp_f0,
             )
-            pitch = pitch[:
-            pitchf = pitchf[:
+            pitch = pitch[:number_of_analysis_frames]
+            pitchf = pitchf[:number_of_analysis_frames]
             if self.device == "mps":
                 pitchf = pitchf.astype(np.float32)
             pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long()
             pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float()
         t2 = ttime()
         times[1] += t2 - t1
-        for t in
+        for t in optimal_time_shifts:
             t = t // self.window * self.window
             if if_f0 == 1:
                 audio_opt.append(
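For the filtering step ("Phase 2" above): `butter(N=5, Wn=48, btype="high", fs=16000)` designs a 5th-order Butterworth high-pass at 48 Hz for 16 kHz audio, and `filtfilt` applies it forward and backward so no phase distortion (group delay) is introduced. A small self-contained sketch with a synthetic one-second signal, separate from the pipeline code:

# Sketch only: synthetic signal, not the pipeline's input audio.
import numpy as np
from scipy import signal

sr = 16000
t = np.arange(sr) / sr
# 10 Hz rumble (below the 48 Hz cutoff) plus a 440 Hz tone (well above it).
audio = 0.5 * np.sin(2 * np.pi * 10 * t) + 0.1 * np.sin(2 * np.pi * 440 * t)

bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=sr)
filtered = signal.filtfilt(bh, ah, audio)  # zero-phase: forward + backward pass

# With a 1-second signal, FFT bin k corresponds to k Hz.
before = np.abs(np.fft.rfft(audio))
after = np.abs(np.fft.rfft(filtered))
print(before[10], after[10])    # 10 Hz rumble: large before, near zero after
print(before[440], after[440])  # 440 Hz tone: essentially unchanged

The zero-phase property matters here because the pipeline later aligns and concatenates segments; a causal filter would shift the waveform in time, while filtfilt leaves segment boundaries where the rest of the code expects them.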