# Reference: https://huggingface.co/spaces/zomehwh/rvc-models/blob/main/app.py#L21  # noqa
in_hf_space = getenv('SYSTEM') == 'spaces'

# Intermediate audio format: True -> .wav, False -> .mp3
high_quality = True

# Read config.json (context manager so the handle is closed right away)
with open("config.json") as config_file:
    config_json = json.load(config_file)

# Load hubert model
hubert_model = util.load_hubert_model(device, 'hubert_base.pt')
hubert_model.eval()

# Load every voice model listed under "models" in config.json
loaded_models = []
for model_name in config_json.get('models'):
    print(f'Loading model: {model_name}')

    # Load model info
    with open(path.join('model', model_name, 'config.json'), 'r') as info_file:
        model_info = json.load(info_file)

    # Load RVC checkpoint.
    # NOTE(review): torch.load unpickles the file; only ship trusted checkpoints.
    cpt = torch.load(
        path.join('model', model_name, model_info['model']),
        map_location='cpu'
    )

    tgt_sr = cpt['config'][-1]
    cpt['config'][-3] = cpt['weight']['emb_g.weight'].shape[0]  # n_spk

    if_f0 = cpt.get('f0', 1)
    net_g: Union[SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono]
    if if_f0 == 1:
        net_g = SynthesizerTrnMs256NSFsid(
            *cpt['config'],
            is_half=util.is_half(device)
        )
    else:
        net_g = SynthesizerTrnMs256NSFsid_nono(*cpt['config'])

    del net_g.enc_q

    # According to original code, this thing seems necessary.
    print(net_g.load_state_dict(cpt['weight'], strict=False))

    net_g.eval().to(device)
    net_g = net_g.half() if util.is_half(device) else net_g.float()

    vc = VC(tgt_sr, device)

    loaded_models.append(dict(
        name=model_name,
        metadata=model_info,
        vc=vc,
        net_g=net_g,
        if_f0=if_f0,
        target_sr=tgt_sr
    ))

print(f'Models loaded: {len(loaded_models)}')


def make_bars_image(height_values, index, new_height):
    """Draw one frame of white audio bars on a transparent 512px-wide canvas.

    Args:
        height_values: Bar heights in pixels, one per bar, left to right.
        index: Frame number; used to build the output file name.
        new_height: Height of the canvas in pixels.

    Returns:
        The path of the PNG written, ``audio_bars_<index>.png``.
    """
    width = 512

    # Transparent canvas so the bars can later be pasted over a background
    image = Image.new('RGBA', (width, new_height), color=(0, 0, 0, 0))
    draw = ImageDraw.Draw(image)

    rect_width = 2
    spacing = 2
    num_bars = len(height_values)

    # Centre the whole row of bars horizontally
    total_width = num_bars * rect_width + (num_bars - 1) * spacing
    x = int((width - total_width) / 2)

    # Vertical offset (pixels) applied to every bar before the flip below
    buffer_size = 80

    for bar_index, bar_height in enumerate(height_values):
        draw.rectangle(
            [x, buffer_size, x + rect_width, bar_height + buffer_size],
            fill='white',
        )
        if bar_index < num_bars - 1:
            x += rect_width + spacing

    # 180-degree rotation followed by a left/right mirror
    image = image.rotate(180)
    image = image.transpose(Image.FLIP_LEFT_RIGHT)

    output_name = 'audio_bars_' + str(index) + '.png'
    image.save(output_name)
    return output_name


def db_to_height(db_value):
    """Map a dB value to a bar height: -80 dB -> 0, 0 dB -> 50."""
    # Scale the dB value so that [-80, 0] maps onto [0, 1]
    scaled_value = (db_value + 80) / 80
    return scaled_value * 50


def infer(title, audio_in, image_in):
    """Render a spectrum-bar video of ``audio_in`` over ``image_in``.

    Args:
        title: Text drawn near the bottom-left of every frame.
        audio_in: Path of the audio file to visualise and use as soundtrack.
        image_in: Path of the background image.

    Returns:
        The path of the final video, ``my_video_retimed.mp4``.

    Side effects:
        Writes ``resized_background.jpg``, one ``audio_bars_<i>.png`` and one
        ``audio_bars_with_bg<i>.jpg`` per STFT frame, ``my_video.mp4`` and
        ``my_video_retimed.mp4`` in the working directory.
    """
    # Load the audio file and keep the whole clip
    audio_data, sr = librosa.load(audio_in)
    duration = librosa.get_duration(y=audio_data, sr=sr)
    start_time = 0  # start time in seconds
    end_time = duration  # end time in seconds
    audio_data = audio_data[int(start_time * sr):int(end_time * sr)]

    # Short-time Fourier transform, converted to dB relative to the peak
    hop_length = 512
    stft = librosa.stft(audio_data, hop_length=hop_length)
    spectrogram = librosa.amplitude_to_db(np.abs(stft), ref=np.max)

    # NOTE(review): n_fft is given the number of STFT rows rather than the
    # FFT size, so `freqs` is likely shorter than the spectrogram's frequency
    # axis and only the lower rows get sampled - confirm this is intended.
    freqs = librosa.fft_frequencies(sr=sr, n_fft=stft.shape[0])

    # Pick 114 evenly spaced frequency rows to display as bars
    n_freqs = 114
    freq_indices = np.linspace(0, len(freqs) - 1, n_freqs, dtype=int)

    # (frequency, dB) pairs for every time frame
    db_values = [
        list(zip(freqs[freq_indices], spectrogram[freq_indices, i]))
        for i in range(spectrogram.shape[1])
    ]
    print(db_values[0])

    proportional_values = [
        [db_to_height(db) for _, db in frame] for frame in db_values
    ]
    print(proportional_values[0])
    print("AUDIO CHUNK: " + str(len(proportional_values)))

    # Open the background image. Convert to RGB because it is saved as JPEG
    # below, and JPEG cannot store alpha or palette modes.
    background_image = Image.open(image_in).convert('RGB')

    # Resize to 512px wide while keeping the aspect ratio
    bg_width, bg_height = background_image.size
    aspect_ratio = bg_width / bg_height
    new_width = 512
    new_height = int(new_width / aspect_ratio)
    resized_bg = background_image.resize((new_width, new_height))

    # Apply black cache for better visibility of the white text
    bg_cache = Image.open('black_cache.png')
    resized_bg.paste(
        bg_cache, (0, resized_bg.height - bg_cache.height), mask=bg_cache
    )

    # Draw the title in white
    draw = ImageDraw.Draw(resized_bg)
    font = ImageFont.truetype("Lato-Regular.ttf", 16)
    text_color = (255, 255, 255)
    draw.text((30, new_height - 70), title, fill=text_color, font=font)

    resized_bg.save('resized_background.jpg')

    # Compose one JPEG per frame: bars pasted over a fresh background copy
    generated_frames = []
    for i, frame in enumerate(proportional_values):
        bars_path = make_bars_image(frame, i, new_height)
        frame_path = 'audio_bars_with_bg' + str(i) + '.jpg'
        with Image.open(bars_path) as bars_img, \
                Image.open('resized_background.jpg') as fresh_bg:
            fresh_bg.paste(bars_img, (0, 0), mask=bars_img)
            fresh_bg.save(frame_path)
        generated_frames.append(frame_path)
    print(generated_frames)

    # One image per STFT frame, so fps is chosen to span the audio duration
    clip = ImageSequenceClip(
        generated_frames,
        fps=len(generated_frames) / (end_time - start_time),
    )
    audio_clip = AudioFileClip(audio_in)
    clip = clip.set_audio(audio_clip)

    codec = 'libx264'
    audio_codec = 'aac'
    clip.write_videofile("my_video.mp4", codec=codec, audio_codec=audio_codec)

    # Re-encode at 25 fps
    retimed_clip = VideoFileClip("my_video.mp4")
    new_fps = 25
    new_clip = retimed_clip.set_fps(new_fps)
    new_clip.write_videofile(
        "my_video_retimed.mp4", codec=codec, audio_codec=audio_codec
    )

    return "my_video_retimed.mp4"


# Command line test
def command_line_test():
    """Run ``df -h /home/user/app`` and return its stdout as a gr.HTML."""
    command = ["df", "-h", "/home/user/app"]
    process = subprocess.run(command, stdout=subprocess.PIPE)
    result = process.stdout.decode()
    return gr.HTML(value=result)


def mix(audio1, audio2):
    """Overlay ``audio2`` on ``audio1`` and export the mix to ``song.wav``.

    Returns:
        The output path, ``"song.wav"``.
    """
    sound1 = AudioSegment.from_file(audio1)
    sound2 = AudioSegment.from_file(audio2)
    length = len(sound1)
    mixed = sound1[:length].overlay(sound2)
    mixed.export("song.wav", format="wav")
    return "song.wav"


# Function YouTube Downloader Audio
def youtube_downloader(
    video_identifier,
    start_time,
    end_time,
    output_filename="track.wav",
    num_attempts=5,
    url_base="",
    quiet=False,
    force=True,
):
    """Download a section of a video's audio as WAV using ``yt-dlp``.

    Args:
        video_identifier: URL or ID appended to ``url_base``.
        start_time: Section start passed to ``--download-sections``.
        end_time: Section end passed to ``--download-sections``.
        output_filename: Output template handed to ``yt-dlp -o``.
        num_attempts: Maximum number of tries (at least one is made).
        url_base: Prefix prepended to ``video_identifier``.
        quiet: Pass ``--quiet --no-warnings`` to yt-dlp.
        force: When the output already exists, delete and re-download it;
            otherwise return the existing file.

    Returns:
        ``Path`` of the downloaded file, or ``None`` if every attempt failed
        or the file was not produced.
    """
    output_path = Path(output_filename)
    if output_path.exists():
        if not force:
            return output_path
        output_path.unlink()

    # Build an argv list and run without a shell: the URL is user input, and
    # interpolating it into a shell string would allow command injection.
    command = ["yt-dlp"]
    if quiet:
        command += ["--quiet", "--no-warnings"]
    command += [
        "-x", "--audio-format", "wav",
        "-f", "bestaudio",
        "-o", str(output_filename),
        "--download-sections", f"*{start_time}-{end_time}",
        # "--" ends option parsing so an identifier starting with "-" is
        # still treated as the URL.
        "--", f"{url_base}{video_identifier}",
    ]

    for _ in range(max(num_attempts, 1)):
        try:
            subprocess.check_output(command, stderr=subprocess.STDOUT)
        except (subprocess.CalledProcessError, OSError):
            # Non-zero exit, or yt-dlp could not be started: try again
            continue
        break
    else:
        return None

    return output_path if output_path.exists() else None


# Function Audio Separated
def audio_separated(audio_input, progress=gr.Progress()):
    """Split audio into vocals and accompaniment with demucs.

    Args:
        audio_input: ``(sample_rate, samples)`` pair, or ``None``.
        progress: Gradio progress tracker.

    Returns:
        ``(vocals_path, no_vocals_path, message)``; both paths are ``None``
        when no audio was given or demucs failed.
    """
    # start progress
    progress(progress=0, desc="Starting...")
    time.sleep(0.1)

    # check file input
    if audio_input is None:
        for _ in progress.tqdm(range(100), desc="Please wait..."):
            time.sleep(0.01)
        return (None, None, 'Please input audio.')

    # create filename
    filename = (
        str(random.randint(10000, 99999))
        + datetime.now().strftime("%d%m%Y%H%M%S")
    )
    extension = ".wav" if high_quality else ".mp3"
    input_file = filename + extension
    progress(progress=0.10, desc="Please wait...")

    # make dir output
    os.makedirs("output", exist_ok=True)
    progress(progress=0.20, desc="Please wait...")

    # write the uploaded samples to a temporary file for demucs
    # NOTE(review): the same writer is used for the ".mp3" name - confirm it
    # really produces MP3 data when high_quality is False.
    write(input_file, audio_input[0], audio_input[1])
    progress(progress=0.50, desc="Please wait...")

    # demucs process (argv list, no shell)
    command_demucs = ["python3", "-m", "demucs", "--two-stems=vocals"]
    if not high_quality:
        command_demucs += ["--mp3", "--mp3-bitrate", "128"]
    command_demucs += ["-d", "cpu", input_file, "-o", "output"]
    try:
        returncode = subprocess.run(command_demucs).returncode
    except OSError:
        returncode = -1
    finally:
        # remove the temporary input file whether or not demucs succeeded
        if os.path.exists(input_file):
            os.remove(input_file)
    progress(progress=0.70, desc="Please wait...")

    if returncode != 0:
        return (None, None, 'Audio separation failed.')
    progress(progress=0.80, desc="Please wait...")

    for _ in progress.tqdm(range(80, 100), desc="Please wait..."):
        time.sleep(0.1)

    stem_dir = "./output/htdemucs/" + filename
    return (
        stem_dir + "/vocals" + extension,
        stem_dir + "/no_vocals" + extension,
        "Successfully...",
    )


# Function Voice Changer
def voice_changer(audio_input, model_index, pitch_adjust, f0_method, feat_ratio,
                  progress=gr.Progress()):
    """Convert the input audio with the selected RVC model.

    Args:
        audio_input: ``(sample_rate, samples)`` pair, or ``None``.
        model_index: Index into ``loaded_models``, or ``None``.
        pitch_adjust: Pitch shift; converted with ``int()`` before use.
        f0_method: Passed through to the VC pipeline.
        feat_ratio: Passed through to the VC pipeline.
        progress: Gradio progress tracker.

    Returns:
        ``((target_sr, audio), message)`` on success, ``(None, message)``
        when validation fails.
    """
    # start progress
    progress(progress=0, desc="Starting...")
    time.sleep(1)

    # check file input
    if audio_input is None:
        for _ in progress.tqdm(range(100), desc="Please wait..."):
            time.sleep(0.1)
        return (None, 'Please input audio.')

    # check model input
    if model_index is None:
        for _ in progress.tqdm(range(100), desc="Please wait..."):
            time.sleep(0.1)
        return (None, 'Please select a model.')

    model = loaded_models[model_index]

    # Reference: so-vits
    (audio_samp, audio_npy) = audio_input
    progress(progress=0.10, desc="Please wait...")

    # https://huggingface.co/spaces/zomehwh/rvc-models/blob/main/app.py#L49
    if (audio_npy.shape[0] / audio_samp) > 60 and in_hf_space:
        for _ in progress.tqdm(range(10, 100), desc="Please wait..."):
            time.sleep(0.1)
        return (None, 'Input audio is longer than 60 secs.')

    # Bloody hell: https://stackoverflow.com/questions/26921836/
    if audio_npy.dtype != np.float32:
        if np.issubdtype(audio_npy.dtype, np.integer):
            # Integer PCM: scale by the dtype's maximum value
            audio_npy = (
                audio_npy / np.iinfo(audio_npy.dtype).max
            ).astype(np.float32)
        else:
            # Other float widths: np.iinfo would raise, so only cast
            audio_npy = audio_npy.astype(np.float32)
    progress(progress=0.30, desc="Please wait...")

    if len(audio_npy.shape) > 1:
        audio_npy = librosa.to_mono(audio_npy.transpose(1, 0))
    progress(progress=0.40, desc="Please wait...")

    if audio_samp != 16000:
        audio_npy = librosa.resample(
            audio_npy,
            orig_sr=audio_samp,
            target_sr=16000
        )
    progress(progress=0.50, desc="Please wait...")

    pitch_int = int(pitch_adjust)

    # Filled in by the pipeline; printed below as npy / f0 / infer timings
    times = [0, 0, 0]
    output_audio = model['vc'].pipeline(
        hubert_model,
        model['net_g'],
        model['metadata'].get('speaker_id', 0),
        audio_npy,
        times,
        pitch_int,
        f0_method,
        path.join('model', model['name'], model['metadata']['feat_index']),
        path.join('model', model['name'], model['metadata']['feat_npy']),
        feat_ratio,
        model['if_f0']
    )
    progress(progress=0.80, desc="Please wait...")

    print(f'npy: {times[0]}s, f0: {times[1]}s, infer: {times[2]}s')

    for _ in progress.tqdm(range(80, 100), desc="Please wait..."):
        time.sleep(0.1)

    return ((model['target_sr'], output_audio), 'Successfully...')


# Function Text to Voice
def text_to_voice(text_input, model_index, progress=gr.Progress()):
    """Placeholder for text-to-voice; validates input and returns a notice.

    Args:
        text_input: Text to synthesise, or ``None``.
        model_index: Selected model index, or ``None``.
        progress: Gradio progress tracker. It has a default so existing
            two-argument callers keep working.

    Returns:
        ``(None, message)`` in every case; the feature is not implemented.
    """
    # start progress
    progress(progress=0, desc="Starting...")
    time.sleep(1)

    # check text input
    if text_input is None:
        for _ in progress.tqdm(range(2, 100), desc="Please wait..."):
            time.sleep(0.1)
        return (None, 'Please write text.')

    # check model input
    if model_index is None:
        for _ in progress.tqdm(range(2, 100), desc="Please wait..."):
            time.sleep(0.1)
        return (None, 'Please select a model.')

    for _ in progress.tqdm(range(2, 100), desc="Please wait..."):
        time.sleep(0.1)

    return None, "Sorry, you can't use it yet because this program is being developed!"
# Themes
theme = gr.themes.Base()

# CSS: hide the default Gradio footer
css = "footer {visibility: hidden}"

# Blocks
with gr.Blocks(theme=theme, css=css) as App:

    # Header
    gr.HTML(
        "<center>"
        "<h1>🥳🎶🎡 - AI歌手,RVC歌声转换</h1>"
        "</center>"
    )
    gr.Markdown(
        "### <center>🦄 - 能够自动提取视频中的声音,并去除背景音;Powered by [RVC-Project](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI)</center>"
    )
    gr.Markdown(
        "### <center>更多精彩应用,敬请关注[滔滔AI](http://www.talktalkai.com);滔滔AI,为爱滔滔!💕</center>"
    )

    # Information: one collapsible panel per bundled voice
    with gr.Accordion("🎙️您的AI歌手Stefanie:"):
        gr.Markdown(title_markdown)
    with gr.Accordion("🎤您的AI歌手Vae:"):
        gr.Markdown(title_markdown2)

    # Tab: download audio from a video URL, then split vocals / accompaniment
    with gr.Tab("🤗 - B站视频提取声音"):
        with gr.Row():
            # Left column: URL, time range and the two action buttons
            with gr.Column():
                ydl_url_input = gr.Textbox(
                    label="B站视频网址(请填写相应的BV号)",
                    value="https://www.bilibili.com/video/BV...",
                )
                start = gr.Number(value=0, label="起始时间 (秒)")
                end = gr.Number(value=15, label="结束时间 (秒)")
                ydl_url_submit = gr.Button("提取声音文件吧", variant="primary")
                as_audio_submit = gr.Button("去除背景音吧", variant="primary")
            # Right column: downloaded audio and the separated stems
            with gr.Column():
                ydl_audio_output = gr.Audio(label="Audio from Bilibili")
                # The downloaded audio doubles as the separator's input
                as_audio_input = ydl_audio_output
                as_audio_vocals = gr.Audio(label="Vocal only")
                as_audio_no_vocals = gr.Audio(label="Music only", type="filepath")
                as_audio_message = gr.Textbox(label="Message", visible=False)

        ydl_url_submit.click(
            fn=youtube_downloader,
            inputs=[ydl_url_input, start, end],
            outputs=[ydl_audio_output],
        )
        as_audio_submit.click(
            fn=audio_separated,
            inputs=[as_audio_input],
            outputs=[as_audio_vocals, as_audio_no_vocals, as_audio_message],
            show_progress=True,
            queue=True,
        )

    # Usage disclaimer, currently disabled:
    # gr.Markdown("### <center>注意❗:请不要生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及个人娱乐使用。</center>")

# Launch
App.queue(concurrency_count=1, max_size=20).launch(
    server_name="",
    server_port=7860,
    show_error=True,
)

# Enjoy