NeoPy committed on
Commit
5c3584e
·
verified ·
1 Parent(s): d5176a4

Update rvc_inferpy/infer.py

Browse files
Files changed (1) hide show
  1. rvc_inferpy/infer.py +222 -268
rvc_inferpy/infer.py CHANGED
@@ -19,82 +19,63 @@ from rvc_inferpy.split_audio import (
19
  combine_silence_nonsilent,
20
  )
21
 
22
- # Configure logging
23
- logging.basicConfig(level=logging.INFO)
 
 
 
 
 
 
 
24
  logger = logging.getLogger(__name__)
25
 
26
- # Download URL and model filenames
27
  RVC_DOWNLOAD_LINK = "https://huggingface.co/NeoPy/rvc-base/resolve/main"
28
- RVC_MODELS = [
29
- "hubert_base.pt",
30
- "rmvpe.pt",
31
- "fcpe.pt",
32
- ]
33
-
34
  BASE_DIR = Path(".").resolve()
35
  MODELS_DIR = BASE_DIR / "models"
36
  MODELS_DIR.mkdir(exist_ok=True)
37
 
38
-
39
- def download_manager(url: str, dest_path: Path) -> None:
40
- """
41
- Download a file from the given URL to the destination path.
42
- """
43
- if dest_path.exists():
44
- logger.info(f"File {dest_path} already exists. Skipping download.")
45
- return
46
-
47
- logger.info(f"Downloading {url} to {dest_path}...")
48
- response = requests.get(url, stream=True)
49
- if response.status_code == 200:
50
- with open(dest_path, "wb") as f:
51
- shutil.copyfileobj(response.raw, f)
52
- logger.info(f"Downloaded {dest_path}.")
53
- else:
54
- logger.error(f"Failed to download {url}. Status code: {response.status_code}")
55
- raise Exception(f"Download failed for {url}")
56
-
57
-
58
  class Configs:
59
- """
60
- Configuration for device and inference parameters.
61
- """
62
  def __init__(self, device: str, is_half: bool):
63
- self.device: str = device
64
- self.is_half: bool = is_half
65
- self.n_cpu: int = cpu_count()
66
- self.gpu_name: str | None = None
67
- self.gpu_mem: int | None = None
68
- self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
69
-
70
- def device_config(self) -> tuple:
71
- # Determine the proper device based on available hardware
72
- if torch.cuda.is_available():
73
- i_device = int(self.device.split(":")[-1])
74
- self.gpu_name = torch.cuda.get_device_name(i_device)
75
- elif torch.backends.mps.is_available():
76
- logger.info("No supported NVIDIA GPU found, using MPS for inference")
77
- self.device = "mps"
78
- else:
79
- logger.info("No supported GPU found, using CPU for inference")
80
- self.device = "cpu"
81
-
82
- # Memory configuration based on is_half flag
 
 
 
 
 
83
  if self.is_half:
84
  x_pad, x_query, x_center, x_max = 3, 10, 60, 65 # 6G memory configuration
85
  else:
86
  x_pad, x_query, x_center, x_max = 1, 6, 38, 41 # 5G memory configuration
87
-
88
- if self.gpu_mem is not None and self.gpu_mem <= 4:
89
- x_pad, x_query, x_center, x_max = 1, 5, 30, 32
90
-
91
- return x_pad, x_query, x_center, x_max
92
-
93
 
94
  class RVCConverter:
95
- """
96
- Handles model setup and voice conversion inference.
97
- """
98
  def __init__(
99
  self,
100
  device: str = "cuda:0",
@@ -102,63 +83,49 @@ class RVCConverter:
102
  models_dir: Path = MODELS_DIR,
103
  download_if_missing: bool = True,
104
  ):
105
- self.device: str = device
106
- self.is_half: bool = is_half
107
- self.models_dir: Path = models_dir
108
- self.download_if_missing: bool = download_if_missing
109
-
110
- # Retrieve model paths from environment or download defaults if missing.
111
- self.hubert_model_path = self.get_or_download_model(
112
- env_var="hubert_model_path", filename="hubert_base.pt"
113
- )
114
- self.rmvpe_model_path = self.get_or_download_model(
115
- env_var="rmvpe_model_path", filename="rmvpe.pt"
116
- )
117
- self.fcpe_model_path = self.get_or_download_model(
118
- env_var="fcpe_model_path", filename="fcpe.pt"
119
- )
120
-
121
  self.configs = Configs(self.device, self.is_half)
122
- self.vc = VC(self.configs)
 
 
 
 
 
 
 
123
 
124
- def get_or_download_model(self, env_var: str, filename: str) -> str:
125
- model_path = os.environ.get(env_var)
126
- if model_path:
127
- logger.info(f"Using {env_var} from environment: {model_path}")
128
- else:
129
- model_path = str(self.models_dir / filename)
130
- if self.download_if_missing and not Path(model_path).exists():
131
- download_manager(f"{RVC_DOWNLOAD_LINK}/{filename}", Path(model_path))
132
- return model_path
133
-
134
- @staticmethod
135
- def get_model(voice_model: str) -> tuple:
136
- """
137
- Return the pth and index file paths for the given voice model.
138
- Expects the voice model files to reside in:
139
- {current_working_dir}/models/{voice_model}/
140
- """
141
- model_dir = Path(os.getcwd()) / "models" / voice_model
142
- if not model_dir.exists():
143
- logger.error(f"Model directory {model_dir} does not exist.")
144
- return None, None
145
-
146
- model_filename = None
147
- index_filename = None
148
- for file in os.listdir(model_dir):
149
- ext = Path(file).suffix
150
- if ext == ".pth":
151
- model_filename = file
152
- elif ext == ".index":
153
- index_filename = file
154
-
155
- if not model_filename:
156
- logger.error(f"No model file exists in {model_dir}.")
157
- return None, None
158
 
159
- pth_path = str(model_dir / model_filename)
160
- index_path = str(model_dir / index_filename) if index_filename else ""
161
- return pth_path, index_path
 
 
 
 
 
 
162
 
163
  def _run_inference(
164
  self,
@@ -180,162 +147,149 @@ class RVCConverter:
180
  max_pitch: str,
181
  f0_autotune: bool,
182
  ) -> tuple:
183
- """
184
- Helper function to run inference on a single audio segment.
185
- """
186
- inference_info, audio_data, output_path = self.vc.vc_single(
187
- 0,
188
- input_audio,
189
- f0_change,
190
- f0_method,
191
- index_path,
192
- index_path,
193
- index_rate,
194
- filter_radius,
195
- resample_sr,
196
- rms_mix_rate,
197
- protect,
198
- audio_format,
199
- crepe_hop_length,
200
- do_formant,
201
- quefrency,
202
- timbre,
203
- min_pitch,
204
- max_pitch,
205
- f0_autotune,
206
- self.hubert_model_path,
207
- )
208
- if inference_info[0] == "Success.":
209
- logger.info("Inference ran successfully.")
210
- logger.info(inference_info[1])
211
- times = inference_info[2]
212
- logger.info(
213
- f"Times:\nnpy: {times[0]:.2f}s f0: {times[1]:.2f}s infer: {times[2]:.2f}s\nTotal time: {sum(times):.2f}s"
214
- )
215
- else:
216
- logger.error(f"An error occurred: {inference_info[0]}")
217
- return inference_info, audio_data, output_path
 
 
 
 
 
 
 
 
218
 
219
- def infer_audio(
220
- self,
221
- voice_model: str,
222
- audio_path: str,
223
- f0_change: int = 0,
224
- f0_method: str = "rmvpe+",
225
- min_pitch: str = "50",
226
- max_pitch: str = "1100",
227
- crepe_hop_length: int = 128,
228
- index_rate: float = 0.75,
229
- filter_radius: int = 3,
230
- rms_mix_rate: float = 0.25,
231
- protect: float = 0.33,
232
- split_infer: bool = False,
233
- min_silence: int = 500,
234
- silence_threshold: int = -50,
235
- seek_step: int = 1,
236
- keep_silence: int = 100,
237
- do_formant: bool = False,
238
- quefrency: int = 0,
239
- timbre: int = 1,
240
- f0_autotune: bool = False,
241
- audio_format: str = "wav",
242
- resample_sr: int = 0,
243
- ) -> str:
244
- """
245
- Perform voice conversion inference on the provided audio file.
246
- If split_infer is True, the audio will first be segmented based on silence.
247
- """
248
- pth_path, index_path = self.get_model(voice_model)
249
- if pth_path is None:
250
- logger.error("Model loading failed.")
251
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
- # Preload the voice conversion engine
254
- self.vc.get_vc(pth_path, protect, 0.5)
 
 
 
 
 
 
 
 
255
 
256
- if split_infer:
257
- inferred_files = []
258
- temp_dir = Path(os.getcwd()) / "separate" / "temp"
259
- temp_dir.mkdir(parents=True, exist_ok=True)
260
- logger.info("Splitting audio into silence and nonsilent segments.")
261
- silence_files, nonsilent_files = split_silence_nonsilent(
262
- audio_path, min_silence, silence_threshold, seek_step, keep_silence
263
- )
264
- logger.info(
265
- f"Total silence segments: {len(silence_files)}. Total nonsilent segments: {len(nonsilent_files)}."
266
- )
267
- for i, segment in enumerate(nonsilent_files):
268
- logger.info(f"Inferring nonsilent audio segment {i+1}")
269
- inference_info, _, segment_output = self._run_inference(
270
- input_audio=segment,
271
  index_path=index_path,
272
- f0_change=f0_change,
273
- f0_method=f0_method,
274
- index_rate=index_rate,
275
- filter_radius=filter_radius,
276
- resample_sr=resample_sr,
277
- rms_mix_rate=rms_mix_rate,
278
- protect=protect,
279
- audio_format=audio_format,
280
- crepe_hop_length=crepe_hop_length,
281
- do_formant=do_formant,
282
- quefrency=quefrency,
283
- timbre=timbre,
284
- min_pitch=min_pitch,
285
- max_pitch=max_pitch,
286
- f0_autotune=f0_autotune,
287
  )
 
288
  if inference_info[0] != "Success.":
289
- return ""
290
- inferred_files.append(segment_output)
291
-
292
- logger.info("Adjusting inferred audio lengths.")
293
- adjusted_inferred_files = adjust_audio_lengths(nonsilent_files, inferred_files)
294
- logger.info("Combining silence and inferred audios.")
295
- output_dir = Path(os.getcwd()) / "output"
296
- output_dir.mkdir(exist_ok=True)
297
- output_count = 1
298
- while True:
299
- output_path = output_dir / f"{Path(audio_path).stem}_{voice_model}_{f0_method.capitalize()}_{output_count}.{audio_format}"
300
- if not output_path.exists():
301
- break
302
- output_count += 1
303
-
304
- output_path = combine_silence_nonsilent(
305
- silence_files, adjusted_inferred_files, keep_silence, str(output_path)
306
- )
307
- # Move temporary inferred files to temp directory and clean up
308
- for file in inferred_files:
309
- shutil.move(file, temp_dir)
310
- shutil.rmtree(temp_dir)
311
- else:
312
- inference_info, _, output_path = self._run_inference(
313
- input_audio=audio_path,
314
- index_path=index_path,
315
- f0_change=f0_change,
316
- f0_method=f0_method,
317
- index_rate=index_rate,
318
- filter_radius=filter_radius,
319
- resample_sr=resample_sr,
320
- rms_mix_rate=rms_mix_rate,
321
- protect=protect,
322
- audio_format=audio_format,
323
- crepe_hop_length=crepe_hop_length,
324
- do_formant=do_formant,
325
- quefrency=quefrency,
326
- timbre=timbre,
327
- min_pitch=min_pitch,
328
- max_pitch=max_pitch,
329
- f0_autotune=f0_autotune,
330
- )
331
- if inference_info[0] != "Success.":
332
- return ""
333
- return output_path
334
-
335
-
336
- if __name__ == "__main__":
337
- # Download base models if not provided via environment variables.
338
- for model_file in RVC_MODELS:
339
- model_path = BASE_DIR / model_file
340
- if not model_path.exists():
341
- download_manager(f"{RVC_DOWNLOAD_LINK}/{model_file}", model_path)
 
19
  combine_silence_nonsilent,
20
  )
21
 
22
+ # Configure logging to write to both a log file and the console
23
+ logging.basicConfig(
24
+ level=logging.INFO,
25
+ format='%(asctime)s - %(levelname)s - %(message)s',
26
+ handlers=[
27
+ logging.FileHandler('rvc_converter.log'),
28
+ logging.StreamHandler()
29
+ ]
30
+ )
31
  logger = logging.getLogger(__name__)
32
 
33
+ # Constants
34
  RVC_DOWNLOAD_LINK = "https://huggingface.co/NeoPy/rvc-base/resolve/main"
35
+ RVC_MODELS = ["hubert_base.pt", "rmvpe.pt", "fcpe.pt"]
 
 
 
 
 
36
  BASE_DIR = Path(".").resolve()
37
  MODELS_DIR = BASE_DIR / "models"
38
  MODELS_DIR.mkdir(exist_ok=True)
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  class Configs:
41
+ """Configuration for device and inference parameters."""
42
+
 
43
  def __init__(self, device: str, is_half: bool):
44
+ self.device = device
45
+ self.is_half = is_half
46
+ self.n_cpu = cpu_count()
47
+
48
+ # Initialize device settings with proper error handling
49
+ try:
50
+ if torch.cuda.is_available():
51
+ i_device = int(self.device.split(":")[-1])
52
+ self.gpu_name = torch.cuda.get_device_name(i_device)
53
+ torch.cuda.set_per_process_memory_fraction(0.8, i_device)
54
+ elif torch.backends.mps.is_available():
55
+ logger.info("No supported NVIDIA GPU found, using MPS for inference")
56
+ self.device = "mps"
57
+ else:
58
+ logger.info("No supported GPU found, using CPU for inference")
59
+ self.device = "cpu"
60
+
61
+ # Enable cuDNN benchmark for better performance
62
+ torch.backends.cudnn.benchmark = True
63
+
64
+ except Exception as e:
65
+ logger.error(f"Device initialization failed: {str(e)}")
66
+ raise
67
+
68
+ # Memory configuration
69
  if self.is_half:
70
  x_pad, x_query, x_center, x_max = 3, 10, 60, 65 # 6G memory configuration
71
  else:
72
  x_pad, x_query, x_center, x_max = 1, 6, 38, 41 # 5G memory configuration
73
+
74
+ self.x_pad, self.x_query, self.x_center, self.x_max = x_pad, x_query, x_center, x_max
 
 
 
 
75
 
76
  class RVCConverter:
77
+ """Handles model setup and voice conversion inference."""
78
+
 
79
  def __init__(
80
  self,
81
  device: str = "cuda:0",
 
83
  models_dir: Path = MODELS_DIR,
84
  download_if_missing: bool = True,
85
  ):
86
+ self.device = device
87
+ self.is_half = is_half
88
+ self.models_dir = models_dir
89
+ self.download_if_missing = download_if_missing
90
+
91
+ # Initialize configurations
 
 
 
 
 
 
 
 
 
 
92
  self.configs = Configs(self.device, self.is_half)
93
+
94
+ # Model paths initialization
95
+ self.model_paths = {}
96
+ self._initialize_model_paths()
97
+
98
+ # Initialize VC engine
99
+ self.vc = None
100
+ self._initialize_vc_engine()
101
 
102
+ def _initialize_model_paths(self):
103
+ """Initialize model paths with proper error handling."""
104
+ for filename in RVC_MODELS:
105
+ env_var = f"{filename.replace('.pt', '_model_path').lower()}"
106
+ model_path = os.environ.get(env_var)
107
+ if model_path:
108
+ logger.info(f"Using {env_var} from environment: {model_path}")
109
+ else:
110
+ model_path = str(self.models_dir / filename)
111
+
112
+ if self.download_if_missing and not Path(model_path).exists():
113
+ try:
114
+ download_manager(f"{RVC_DOWNLOAD_LINK}/{filename}", Path(model_path))
115
+ except Exception as e:
116
+ logger.error(f"Failed to initialize model path {filename}: {str(e)}")
117
+
118
+ self.model_paths[filename] = model_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
+ def _initialize_vc_engine(self):
121
+ """Initialize VC engine with proper cleanup."""
122
+ try:
123
+ self.vc = VC(self.configs)
124
+ gc.collect()
125
+ torch.cuda.empty_cache()
126
+ except Exception as e:
127
+ logger.error(f"Failed to initialize VC engine: {str(e)}")
128
+ raise
129
 
130
  def _run_inference(
131
  self,
 
147
  max_pitch: str,
148
  f0_autotune: bool,
149
  ) -> tuple:
150
+ """Helper function to run inference on a single audio segment."""
151
+
152
+ # Enable inference mode for better performance
153
+ with torch.inference_mode():
154
+ try:
155
+ inference_info, audio_data, output_path = self.vc.vc_single(
156
+ 0,
157
+ input_audio,
158
+ f0_change,
159
+ f0_method,
160
+ index_path,
161
+ index_path,
162
+ index_rate,
163
+ filter_radius,
164
+ resample_sr,
165
+ rms_mix_rate,
166
+ protect,
167
+ audio_format,
168
+ crepe_hop_length,
169
+ do_formant,
170
+ quefrency,
171
+ timbre,
172
+ min_pitch,
173
+ max_pitch,
174
+ f0_autotune,
175
+ self.model_paths["hubert_base.pt"]
176
+ )
177
+
178
+ if inference_info[0] == "Success.":
179
+ logger.info("Inference ran successfully.")
180
+ logger.info(inference_info[1])
181
+ times = inference_info[2]
182
+ logger.info(
183
+ f"Times:\nnpy: {times[0]:.2f}s f0: {times[1]:.2f}s infer: {times[2]:.2f}s\nTotal time: {sum(times):.2f}s"
184
+ )
185
+ return inference_info, audio_data, output_path
186
+
187
+ except Exception as e:
188
+ logger.error(f"Inference failed: {str(e)}")
189
+ raise
190
+ finally:
191
+ gc.collect()
192
+ torch.cuda.empty_cache()
193
 
194
+ def infer_audio(self, voice_model: str, audio_path: str, **kwargs) -> str:
195
+ """Perform voice conversion inference on the provided audio file."""
196
+
197
+ try:
198
+ pth_path, index_path = self.get_model(voice_model)
199
+ if pth_path is None:
200
+ logger.error("Model loading failed.")
201
+ return ""
202
+
203
+ # Preload the voice conversion engine
204
+ self.vc.get_vc(pth_path, kwargs.get('protect', 0.33), 0.5)
205
+
206
+ if kwargs.get('split_infer', False):
207
+ inferred_files = []
208
+ temp_dir = Path(os.getcwd()) / "separate" / "temp"
209
+ temp_dir.mkdir(parents=True, exist_ok=True)
210
+
211
+ silence_files, nonsilent_files = split_silence_nonsilent(
212
+ audio_path,
213
+ kwargs.get('min_silence', 500),
214
+ kwargs.get('silence_threshold', -50),
215
+ kwargs.get('seek_step', 1),
216
+ kwargs.get('keep_silence', 100)
217
+ )
218
+
219
+ for i, segment in enumerate(nonsilent_files):
220
+ logger.info(f"Inferring nonsilent audio segment {i+1}")
221
+ inference_info, _, segment_output = self._run_inference(
222
+ input_audio=segment,
223
+ index_path=index_path,
224
+ f0_change=kwargs.get('f0_change', 0),
225
+ f0_method=kwargs.get('f0_method', "rmvpe+"),
226
+ index_rate=kwargs.get('index_rate', 0.75),
227
+ filter_radius=kwargs.get('filter_radius', 3),
228
+ resample_sr=kwargs.get('resample_sr', 0),
229
+ rms_mix_rate=kwargs.get('rms_mix_rate', 0.25),
230
+ protect=kwargs.get('protect', 0.33),
231
+ audio_format=kwargs.get('audio_format', "wav"),
232
+ crepe_hop_length=kwargs.get('crepe_hop_length', 128),
233
+ do_formant=kwargs.get('do_formant', False),
234
+ quefrency=kwargs.get('quefrency', 0),
235
+ timbre=kwargs.get('timbre', 1),
236
+ min_pitch=kwargs.get('min_pitch', "50"),
237
+ max_pitch=kwargs.get('max_pitch', "1100"),
238
+ f0_autotune=kwargs.get('f0_autotune', False)
239
+ )
240
+
241
+ if inference_info[0] != "Success.":
242
+ raise RuntimeError("Inference failed")
243
+
244
+ inferred_files.append(segment_output)
245
 
246
+ adjusted_inferred_files = adjust_audio_lengths(nonsilent_files, inferred_files)
247
+
248
+ output_dir = Path(os.getcwd()) / "output"
249
+ output_dir.mkdir(exist_ok=True)
250
+ output_count = 1
251
+ while True:
252
+ output_path = output_dir / f"{Path(audio_path).stem}_{voice_model}_{kwargs.get('f0_method', 'rmvpe+').capitalize()}_{output_count}.{kwargs.get('audio_format', 'wav')}"
253
+ if not output_path.exists():
254
+ break
255
+ output_count += 1
256
 
257
+ output_path = combine_silence_nonsilent(
258
+ silence_files, adjusted_inferred_files,
259
+ kwargs.get('keep_silence', 100), str(output_path)
260
+ )
261
+
262
+ # Cleanup temporary files
263
+ for file in inferred_files:
264
+ shutil.move(file, temp_dir)
265
+ shutil.rmtree(temp_dir)
266
+
267
+ else:
268
+ inference_info, _, output_path = self._run_inference(
269
+ input_audio=audio_path,
 
 
270
  index_path=index_path,
271
+ f0_change=kwargs.get('f0_change', 0),
272
+ f0_method=kwargs.get('f0_method', "rmvpe+"),
273
+ index_rate=kwargs.get('index_rate', 0.75),
274
+ filter_radius=kwargs.get('filter_radius', 3),
275
+ resample_sr=kwargs.get('resample_sr', 0),
276
+ rms_mix_rate=kwargs.get('rms_mix_rate', 0.25),
277
+ protect=kwargs.get('protect', 0.33),
278
+ audio_format=kwargs.get('audio_format', "wav"),
279
+ crepe_hop_length=kwargs.get('crepe_hop_length', 128),
280
+ do_formant=kwargs.get('do_formant', False),
281
+ quefrency=kwargs.get('quefrency', 0),
282
+ timbre=kwargs.get('timbre', 1),
283
+ min_pitch=kwargs.get('min_pitch', "50"),
284
+ max_pitch=kwargs.get('max_pitch', "1100"),
285
+ f0_autotune=kwargs.get('f0_autotune', False)
286
  )
287
+
288
  if inference_info[0] != "Success.":
289
+ raise RuntimeError("Inference failed")
290
+
291
+ return str(output_path)
292
+
293
+ except Exception as e:
294
+ logger.error(f"Inference failed: {str(e)}")
295
+ raise