import os
import shutil
import warnings

import librosa
import soundfile as sf
import gradio as gr
import pytube as pt
from pytube.exceptions import VideoUnavailable

from inference.style_transfer import *
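# The wildcard import above provides the repo's set_up(), Mixing_Style_Transfer_Inference,
# and trim_audio() from inference/style_transfer.py.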
yt_video_dir = "./yt_dir/0"
os.makedirs(yt_video_dir, exist_ok=True)
def get_audio_from_yt_video(yt_link: str, start_point_in_second=0, duration_in_second=30, filename="input.wav"):
    """Download the audio stream of a YouTube video and trim it to the requested segment."""
    try:
        yt = pt.YouTube(yt_link)
        t = yt.streams.filter(only_audio=True)
        file_path = os.path.join(yt_video_dir, filename)
        t[0].download(filename=file_path)
    except VideoUnavailable as e:
        warnings.warn(f"Video Not Found at {yt_link} ({e})")
        return None, None
    # trim audio length - due to computation time on the Hugging Face environment
    trim_audio(target_file_path=file_path, start_point_in_second=start_point_in_second, duration_in_second=duration_in_second)
    return file_path, file_path


def get_audio_from_yt_video_input(yt_link: str, start_point_in_second=0, duration_in_second=30):
    return get_audio_from_yt_video(yt_link, start_point_in_second, duration_in_second, filename="input.wav")


def get_audio_from_yt_video_ref(yt_link: str, start_point_in_second=0, duration_in_second=30):
    return get_audio_from_yt_video(yt_link, start_point_in_second, duration_in_second, filename="reference.wav")
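# For reference, a minimal sketch of what trim_audio() might do -- the real
# implementation lives in inference/style_transfer.py, so this hypothetical
# version is illustrative only:
#
#     def trim_audio(target_file_path, start_point_in_second=0, duration_in_second=30):
#         if target_file_path is None:
#             return
#         # load only the requested segment, then overwrite the file in place
#         data, sr = librosa.load(target_file_path, sr=None, offset=start_point_in_second,
#                                 duration=duration_in_second)
#         sf.write(target_file_path, data, sr)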
def inference(file_uploaded_in, file_uploaded_ref):
    # clear out previously separated results
    shutil.rmtree(os.path.join(yt_video_dir, "separated"), ignore_errors=True)
    # write the uploaded (sample_rate, data) tuples to fixed paths for the model to read
    sample_rate, data = file_uploaded_in
    sf.write(f"{yt_video_dir}/input.wav", data, sample_rate)
    sample_rate, data = file_uploaded_ref
    sf.write(f"{yt_video_dir}/reference.wav", data, sample_rate)
    # perform music mixing style transfer
    args = set_up()
    inference_style_transfer = Mixing_Style_Transfer_Inference(args)
    output_wav_path, fin_data_out_mix = inference_style_transfer.inference(f"{yt_video_dir}/input.wav", f"{yt_video_dir}/reference.wav")
    # Gradio's numpy audio format is a (sample_rate, data) tuple; the model outputs 44.1 kHz audio
    return (44100, fin_data_out_mix)
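# A minimal smoke test that bypasses the UI (assumes two short WAV files at the
# hypothetical paths below; note sf.read returns (data, sample_rate) while
# inference() expects Gradio's (sample_rate, data) order):
#
#     in_data, in_sr = sf.read("examples/input.wav")
#     ref_data, ref_sr = sf.read("examples/reference.wav")
#     sr, mix = inference((in_sr, in_data), (ref_sr, ref_data))
#     sf.write("examples/output.wav", mix, sr)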
with gr.Blocks() as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 700px; margin: 0 auto;">
            <div
                style="
                    display: inline-flex;
                    align-items: center;
                    gap: 0.8rem;
                    font-size: 1.75rem;
                "
            >
                <h1 style="font-weight: 900; margin-bottom: 7px;">
                    Music Mixing Style Transfer
                </h1>
            </div>
        </div>
        """
    )
    gr.Markdown(
        """
        This page is a Hugging Face interactive demo of the paper ["Music Mixing Style Transfer: A Contrastive Learning Approach to Disentangle Audio Effects"](https://huggingface.co/papers/2211.02247) (ICASSP 2023).
        - [project page](https://jhtonykoo.github.io/MixingStyleTransfer/)
        - [GitHub](https://github.com/jhtonyKoo/music_mixing_style_transfer)
        - [supplementary](https://pale-cicada-946.notion.site/Music-Mixing-Style-Transfer-A-Contrastive-Learning-Approach-to-Disentangle-Audio-Effects-Supplemen-e6eccd9a431a4a8fa4fdd5adb2d3f219)
        """
    )
    with gr.Group():
        with gr.Column():
            with gr.Blocks():
                with gr.Tab("Input Music"):
                    file_uploaded_in = gr.Audio(label="Input track (mix) to be mixing style transferred")
                with gr.Tab("YouTube url"):
                    with gr.Row():
                        yt_link_in = gr.Textbox(
                            label="Enter YouTube Link of the Video", autofocus=True, lines=3
                        )
                        yt_in_start_sec = gr.Number(
                            value=0,
                            label="starting point of the song (in seconds)"
                        )
                        yt_in_duration_sec = gr.Number(
                            value=30,
                            label="duration of the song (in seconds)"
                        )
                    yt_btn_in = gr.Button("Download Audio from YouTube Link", size="lg")
                    yt_audio_path_in = gr.Audio(
                        label="Input Audio Extracted from the YouTube Video", interactive=False
                    )
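                    # each download button feeds two outputs: the preview player above
                    # and the corresponding upload component, so a YouTube download
                    # also populates the track that is used for inference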
                    yt_btn_in.click(
                        get_audio_from_yt_video_input,
                        inputs=[yt_link_in, yt_in_start_sec, yt_in_duration_sec],
                        outputs=[yt_audio_path_in, file_uploaded_in],
                    )
            with gr.Blocks():
                with gr.Tab("Reference Music"):
                    file_uploaded_ref = gr.Audio(label="Reference track (mix) to copy mixing style")
                with gr.Tab("YouTube url"):
                    with gr.Row():
                        yt_link_ref = gr.Textbox(
                            label="Enter YouTube Link of the Video", autofocus=True, lines=3
                        )
                        yt_ref_start_sec = gr.Number(
                            value=0,
                            label="starting point of the song (in seconds)"
                        )
                        yt_ref_duration_sec = gr.Number(
                            value=30,
                            label="duration of the song (in seconds)"
                        )
                    yt_btn_ref = gr.Button("Download Audio from YouTube Link", size="lg")
                    yt_audio_path_ref = gr.Audio(
                        label="Reference Audio Extracted from the YouTube Video", interactive=False
                    )
                    yt_btn_ref.click(
                        get_audio_from_yt_video_ref,
                        inputs=[yt_link_ref, yt_ref_start_sec, yt_ref_duration_sec],
                        outputs=[yt_audio_path_ref, file_uploaded_ref],
                    )
    with gr.Group():
        gr.HTML(
            """
            <div> <h3> <center> Mixing Style Transfer. Performs stem-wise audio-effects style conversion by first source-separating the input mix. Inference time grows with the duration of the input samples, so please be patient. </center> </h3> </div>
            """
        )
        with gr.Column():
            inference_btn = gr.Button("Run Mixing Style Transfer")
            with gr.Row():
                output_mix = gr.Audio(label="mixing style transferred music track", type='numpy')
            inference_btn.click(
                inference,
                inputs=[file_uploaded_in, file_uploaded_ref],
                outputs=[output_mix],
            )
if __name__ == "__main__":
    demo.launch(debug=True)
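# NOTE (assumption, not in the original script): long-running inference on Spaces
# can hit request timeouts; enabling Gradio's request queue, e.g.
# demo.queue().launch(debug=True), is a common mitigation.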