import os
import subprocess
import uuid
import torch
import torchaudio
import torchaudio.transforms as T
import soundfile as sf
import gradio as gr
import spaces
from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip
import look2hear.models

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load models
dnr_model = look2hear.models.TIGERDNR.from_pretrained("JusperLee/TIGER-DnR", cache_dir="cache").to(device).eval()
sep_model = look2hear.models.TIGER.from_pretrained("JusperLee/TIGER-speech", cache_dir="cache").to(device).eval()

TARGET_SR = 16000
MAX_SPEAKERS = 4

def extract_audio_from_video(video_path, freq):
    video = VideoFileClip(video_path)
    session_id = uuid.uuid4().hex[:8]
    audio_path = f"temp_audio/{session_id}.wav"
    os.makedirs("temp_audio", exist_ok=True)
    video.audio.write_audiofile(audio_path, fps=freq, verbose=False, logger=None)
    return audio_path, video

def attach_audio_to_video(original_video, audio_path, out_path):
    new_audio = AudioFileClip(audio_path)
    new_video = original_video.set_audio(new_audio)
    new_video.write_videofile(out_path, audio_codec='aac', verbose=False, logger=None)
    return out_path


def separate_speakers_core(audio_path):
    waveform, original_sr = torchaudio.load(audio_path)
    if original_sr != TARGET_SR:
        waveform = T.Resample(orig_freq=original_sr, new_freq=TARGET_SR)(waveform)

    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)  # Ensure shape is (1, samples)
    audio_input = waveform.unsqueeze(0).to(device)  # Shape: (1, 1, samples)

    with torch.no_grad():
        ests_speech = sep_model(audio_input).squeeze(0)  # Shape: (num_speakers, samples)

    session_id = uuid.uuid4().hex[:8]
    output_dir = os.path.join("output_sep", session_id)
    os.makedirs(output_dir, exist_ok=True)

    output_files = []
    for i in range(ests_speech.shape[0]):
        path = os.path.join(output_dir, f"speaker_{i+1}.wav")
        speaker_waveform = ests_speech[i].cpu()

        if speaker_waveform.dim() == 1:
            speaker_waveform = speaker_waveform.unsqueeze(0)  # (1, samples)

        # Ensure correct dtype and save in a widely compatible format
        speaker_waveform = speaker_waveform.to(torch.float32)
        torchaudio.save(path, speaker_waveform, TARGET_SR, format="wav", encoding="PCM_S", bits_per_sample=16)
        output_files.append(path)

    print(output_files)

    return output_files


@spaces.GPU()
def separate_dnr(audio_file):
    """
    Perform Dialog, Effects, and Music (DnR) separation on an uploaded audio file.

    Args:
        audio_file (str): File path to the input WAV audio file. 
            This should be a mixed audio track containing dialog, background music, and sound effects.

    Returns:
        Tuple[str, str, str]: Paths to the separated audio files:
            - Dialog-only audio (dialog.wav)
            - Sound effects-only audio (effect.wav)
            - Background music-only audio (music.wav)

    This function uses a pretrained DnR model (TIGER-DnR) to isolate the components in the audio.
    It is intended for tasks such as improving intelligibility or remixing.
    """

    audio, sr = torchaudio.load(audio_file)
    audio = audio.to(device)

    with torch.no_grad():
        dialog, effect, music = dnr_model(audio[None])

    session_id = uuid.uuid4().hex[:8]
    output_dir = os.path.join("output_dnr", session_id)
    os.makedirs(output_dir, exist_ok=True)

    paths = {
        "dialog": os.path.join(output_dir, "dialog.wav"),
        "effect": os.path.join(output_dir, "effect.wav"),
        "music": os.path.join(output_dir, "music.wav"),
    }

    torchaudio.save(paths["dialog"], dialog.cpu(), sr)
    torchaudio.save(paths["effect"], effect.cpu(), sr)
    torchaudio.save(paths["music"], music.cpu(), sr)

    return paths["dialog"], paths["effect"], paths["music"]

@spaces.GPU()
def separate_speakers(audio_path):
    """
    Perform speaker separation on a mixed audio file containing multiple speakers.

    Args:
        audio_path (str): File path to the audio WAV file containing overlapping speech from multiple people.

    Returns:
        List[gr.update]: A list of Gradio update objects, each containing:
            - A separate audio file for each identified speaker (up to MAX_SPEAKERS)
            - Visibility and label updates for the UI

    This function internally calls a pretrained speech separation model (TIGER-speech) 
    and isolates individual speaker tracks from the input audio.
    """

    output_files = separate_speakers_core(audio_path)
    updates = []
    for i in range(MAX_SPEAKERS):
        if i < len(output_files):
            updates.append(gr.update(value=output_files[i], visible=True, label=f"Speaker {i+1}"))
        else:
            updates.append(gr.update(value=None, visible=False))
    return updates

@spaces.GPU()
def separate_dnr_video(video_path):
    """
    Separate dialog, effects, and music from the audio of an uploaded video file and reattach them to the original video.

    Args:
        video_path (str): File path to the input video file (e.g., MP4 or MOV).
            The video should contain a composite audio track with dialog, effects, and music.

    Returns:
        Tuple[str, str, str]: Paths to the output videos with:
            - Only dialog audio track (dialog_video.mp4)
            - Only effects audio track (effect_video.mp4)
            - Only music audio track (music_video.mp4)

    The audio is extracted from the video, separated using the DnR model, and then reattached to the original video visuals.
    """

    audio_path, video = extract_audio_from_video(video_path, 44100)
    dialog_path, effect_path, music_path = separate_dnr(audio_path)

    session_id = uuid.uuid4().hex[:8]
    output_dir = os.path.join("output_dnr_video", session_id)
    os.makedirs(output_dir, exist_ok=True)

    dialog_video = attach_audio_to_video(video, dialog_path, os.path.join(output_dir, "dialog_video.mp4"))
    effect_video = attach_audio_to_video(video, effect_path, os.path.join(output_dir, "effect_video.mp4"))
    music_video = attach_audio_to_video(video, music_path, os.path.join(output_dir, "music_video.mp4"))

    return dialog_video, effect_video, music_video


@spaces.GPU()
def separate_speakers_video(video_path):
    """
    Separate individual speakers from the audio track of a video and reattach each speaker’s voice to a copy of the original video.

    Args:
        video_path (str): File path to a video file with overlapping speech from multiple speakers.

    Returns:
        List[gr.update]: A list of Gradio update objects each containing:
            - A new video file where the audio consists of only one speaker's voice
            - Visibility and label information for UI display

    The function extracts audio from the video, separates individual speakers using a pretrained model,
    and generates one video per speaker by replacing the audio in the original video.
    """

    audio_path, video = extract_audio_from_video(video_path, 16000)
    output_files = separate_speakers_core(audio_path)

    session_id = uuid.uuid4().hex[:8]
    output_dir = os.path.join("output_sep_video", session_id)
    os.makedirs(output_dir, exist_ok=True)

    output_videos = []
    for i, audio_file in enumerate(output_files):
        speaker_video_path = os.path.join(output_dir, f"speaker_{i+1}_video.mp4")
        video_with_sep_audio = attach_audio_to_video(video, audio_file, speaker_video_path)
        output_videos.append(video_with_sep_audio)

    updates = []
    for i in range(MAX_SPEAKERS):
        if i < len(output_videos):
            updates.append(gr.update(value=output_videos[i], visible=True, label=f"Speaker {i+1}"))
        else:
            updates.append(gr.update(value=None, visible=False))
    return updates


# --- Gradio UI ---
with gr.Blocks() as demo:
    gr.Markdown("# TIGER: Time-frequency Interleaved Gain Extraction and Reconstruction for Efficient Speech Separation")
    gr.Markdown("TIGER is a lightweight model for speech separation which effectively extracts key acoustic features through frequency band-split, multi-scale and full-frequency-frame modeling.")

    gr.HTML("""
            <div style="display:flex;column-gap:4px;">
                <a href="https://cslikai.cn/TIGER/">
                    <img src='https://img.shields.io/badge/Project-Page-green'>
                </a> 
    			
                <a href="https://huggingface.co/spaces/fffiloni/TIGER-audio-extraction?duplicate=true">
    				<img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
    			</a>	
            </div>
        """)

    with gr.Tabs():
        with gr.Tab("Audio DnR"):
            dnr_input = gr.Audio(type="filepath", label="Upload Audio")
            dnr_btn = gr.Button("Separate")
            gr.Examples(
                examples = ["./test/test_mixture_466.wav"],
                inputs = dnr_input
            )
            dnr_output = [gr.Audio(label=l) for l in ["Dialog", "Effects", "Music"]]
            dnr_btn.click(separate_dnr, inputs=dnr_input, outputs=dnr_output)

        with gr.Tab("Audio Speaker Separation"):
            sep_input = gr.Audio(type="filepath", label="Upload Speech Audio")
            sep_btn = gr.Button("Separate Speakers")
            gr.Examples(
                examples = ["./test/mix.wav"],
                inputs = sep_input
            )
            sep_outputs = [gr.Audio(label=f"Speaker {i+1}", visible=(i==0)) for i in range(MAX_SPEAKERS)]
            sep_btn.click(separate_speakers, inputs=sep_input, outputs=sep_outputs)

        with gr.Tab("Video DnR"):
            vdnr_input = gr.Video(label="Upload Video")
            vdnr_btn = gr.Button("Separate Audio Tracks")
            vdnr_output = [gr.Video(label=l) for l in ["Dialog Video", "Effects Video", "Music Video"]]
            vdnr_btn.click(separate_dnr_video, inputs=vdnr_input, outputs=vdnr_output)

        with gr.Tab("Video Speaker Separation"):
            vsep_input = gr.Video(label="Upload Video")
            vsep_btn = gr.Button("Separate Speakers")
            vsep_outputs = [gr.Video(label=f"Speaker {i+1}", visible=(i==0)) for i in range(MAX_SPEAKERS)]
            vsep_btn.click(separate_speakers_video, inputs=vsep_input, outputs=vsep_outputs)

if __name__ == "__main__":
    demo.launch(ssr_mode=False, mcp_server=True)