import spaces
import gradio as gr
import os
import numpy as np
from pydub import AudioSegment
import hashlib
from sonic import Sonic
from PIL import Image
import torch  # 필요 시 사용

# ------------------------------------------------------------------
# 모델 초기화
# ------------------------------------------------------------------
# Startup provisioning: install the Hugging Face CLI and download the model
# checkpoints into ./checkpoints before the pipeline object is created.
# The steps are chained with ';', so each one runs even if an earlier one
# failed, and the exit status returned by os.system is not inspected.
_SETUP_STEPS = (
    'python3 -m pip install "huggingface_hub[cli]"',
    'huggingface-cli download LeonJoe13/Sonic --local-dir  checkpoints',
    'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir  checkpoints/stable-video-diffusion-img2vid-xt',
    'huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny',
)
cmd = "; ".join(_SETUP_STEPS) + ";"
os.system(cmd)

# Single pipeline instance shared by every request handled by this process.
pipe = Sonic()

# ------------------------------------------------------------------
# 유틸
# ------------------------------------------------------------------
def get_md5(content):
    """Return the hexadecimal MD5 digest of a bytes-like object or array.

    ``content`` is handed to ``hashlib`` as-is, so it must expose the buffer
    protocol (``bytes``, ``bytearray``, a NumPy array, ...). The digest is
    used here only as a cache key, not for anything security-sensitive.
    """
    hasher = hashlib.md5()
    hasher.update(content)
    return hasher.hexdigest()

# ------------------------------------------------------------------
# 비디오 생성
# ------------------------------------------------------------------
@spaces.GPU(duration=300)  # keep the GPU session alive for up to 5 minutes
def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
    """Render an animated video from a portrait image and an audio file.

    Args:
        img_path: Path of the portrait image on disk.
        audio_path: Path of the driving audio file (read with pydub).
        res_video_path: Path the rendered video is written to. Its parent
            directory is created when missing.
        dynamic_scale: Motion-intensity factor forwarded to ``pipe.process``.

    Returns:
        ``res_video_path``, unchanged.
    """
    expand_ratio = 0.0  # intended to keep the face from being cropped
    min_resolution = 512

    # Derive the step count from the audio length at 25 fps, capped at
    # 60 seconds' worth (1500) and never below 1.
    # NOTE(review): only the step count is capped here; the audio file itself
    # is not trimmed. Confirm pipe.process treats `inference_steps` as intended.
    fps = 25
    max_steps = fps * 60
    audio = AudioSegment.from_file(audio_path)
    duration = len(audio) / 1000.0  # len() of an AudioSegment is in milliseconds
    inference_steps = max(1, min(int(duration * fps), max_steps))
    print(f"Audio duration: {duration:.2f}s → inference_steps: {inference_steps}")

    # Face detection is informational only: a missing face is logged and the
    # full image is still processed.
    face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
    print(f"Face detection info: {face_info}")
    if face_info["face_num"] == 0:
        print("Warning: face not detected – proceeding with full image.")

    # Make sure the output directory exists. A bare file name has an empty
    # dirname, and os.makedirs("") raises FileNotFoundError, so skip it then.
    out_dir = os.path.dirname(res_video_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    pipe.process(
        img_path,
        audio_path,
        res_video_path,
        min_resolution=min_resolution,
        inference_steps=inference_steps,
        dynamic_scale=dynamic_scale,
    )
    return res_video_path

# ------------------------------------------------------------------
# 캐시·경로 설정
# ------------------------------------------------------------------
# Working directories, relative to the current working directory:
#   tmp_path - uploaded inputs saved under content-hash file names
#   res_path - rendered videos, reused when the same request comes in again
tmp_path = "./tmp_path/"
res_path = "./res_path/"
for _cache_dir in (tmp_path, res_path):
    os.makedirs(_cache_dir, exist_ok=True)

# ------------------------------------------------------------------
# Gradio 콜백
# ------------------------------------------------------------------
def process_sonic(image, audio, dynamic_scale):
    """Gradio callback: turn a portrait and an audio clip into an animation.

    Inputs are saved under ``tmp_path`` and the rendered video under
    ``res_path``. File names are derived from content hashes, so repeating a
    request reuses the files that are already on disk.

    Args:
        image: PIL image from the image component, or ``None``.
        audio: ``(sample_rate, samples)`` tuple from the audio component, or
            ``None``. ``samples`` is a NumPy array, either 1-D or shaped
            ``(frames, channels)``.
        dynamic_scale: Animation intensity chosen on the slider.

    Returns:
        Absolute path of the resulting ``.mp4`` file.

    Raises:
        gr.Error: If the image or audio is missing, or the audio holds no
            samples.
    """
    # Input validation
    if image is None:
        raise gr.Error("Please upload an image")
    if audio is None:
        raise gr.Error("Please upload an audio file")

    # Normalise the samples to (frames, channels) before hashing or encoding.
    sampling_rate, arr = audio[:2]
    if arr.ndim == 1:
        arr = arr[:, None]
    if arr.size == 0:
        raise gr.Error("The uploaded audio is empty")
    pcm_bytes = arr.tobytes()

    img_md5 = get_md5(np.array(image))
    # Hash the sample format together with the raw bytes: identical bytes at
    # a different sample rate, dtype or channel count are a different clip
    # and must not share a cache entry.
    audio_header = f"{sampling_rate}:{arr.dtype.str}:{arr.shape[1]}:".encode()
    audio_md5 = get_md5(audio_header + pcm_bytes)
    print(f"Processing (img={img_md5}, audio={audio_md5})")

    # Paths
    image_path = os.path.abspath(os.path.join(tmp_path, f"{img_md5}.png"))
    audio_path = os.path.abspath(os.path.join(tmp_path, f"{audio_md5}.wav"))
    res_video_path = os.path.abspath(
        os.path.join(res_path, f"{img_md5}_{audio_md5}_{dynamic_scale}.mp4")
    )

    # A finished video for this exact request needs no further work.
    if os.path.exists(res_video_path):
        print(f"Using cached result: {res_video_path}")
        return res_video_path

    # Save the inputs unless an earlier request already did.
    if not os.path.exists(image_path):
        image.save(image_path)
    if not os.path.exists(audio_path):
        # Build the AudioSegment only when the WAV actually has to be written.
        audio_segment = AudioSegment(
            pcm_bytes,
            frame_rate=sampling_rate,
            sample_width=arr.dtype.itemsize,
            channels=arr.shape[1],
        )
        audio_segment.export(audio_path, format="wav")

    print(f"Generating new video (dynamic_scale={dynamic_scale})")
    return get_video_res(image_path, audio_path, res_video_path, dynamic_scale)

# ------------------------------------------------------------------
# Gradio UI
# ------------------------------------------------------------------
def get_example():
    """Return the example rows for the demo; none are defined yet.

    A new empty list is returned on every call. Add rows here when example
    data becomes available.
    """
    examples = []
    return examples

# Custom stylesheet passed to gr.Blocks(css=...). The "main-header" class is
# used by the header HTML of the page; "parameter-section" and
# "example-section" are not referenced elsewhere in this file.
css = """
.gradio-container { font-family: 'Arial', sans-serif; }
.main-header { text-align: center; color: #2a2a2a; margin-bottom: 2em; }
.parameter-section { background-color: #f5f5f5; padding: 1em; border-radius: 8px; margin: 1em 0; }
.example-section { margin-top: 2em; }
"""

# Page layout: a header, then a row with the inputs on the left and the
# rendered video on the right, wired to process_sonic.
# NOTE(review): "apriel" is not among the theme names built into Gradio that
# I know of - confirm that it resolves to the intended theme.
with gr.Blocks(css=css, theme="apriel") as demo:
    # Header banner, styled through the "main-header" rule in `css`.
    _header_html = """
        <div class="main-header">
            <h1>🎭 Longer Sonic: Advanced Portrait Animation</h1>
            <p>Transform still images into dynamic videos synchronized with audio(Demo max 60sec)</p>
        </div>
        """
    gr.HTML(_header_html)

    with gr.Row():
        # Left column: inputs and the trigger button.
        with gr.Column():
            image_input = gr.Image(
                label="Portrait Image",
                type="pil",
                elem_id="image_input",
            )
            audio_input = gr.Audio(
                label="Voice/Audio Input",
                type="numpy",
                elem_id="audio_input",
            )
            dynamic_scale = gr.Slider(
                label="Animation Intensity",
                info="Adjust to control movement intensity (0.5: subtle, 2.0: dramatic)",
                minimum=0.5,
                maximum=2.0,
                step=0.1,
                value=1.0,
            )
            process_btn = gr.Button(
                "Generate Animation",
                elem_id="process_btn",
                variant="primary",
            )

        # Right column: the generated video.
        with gr.Column():
            video_output = gr.Video(elem_id="video_output", label="Generated Animation")

    # The button and the examples widget feed the same components to the
    # same callback; each gets its own copy of the input list.
    _callback_inputs = [image_input, audio_input, dynamic_scale]

    process_btn.click(
        fn=process_sonic,
        inputs=list(_callback_inputs),
        outputs=video_output,
        api_name="animate",
    )

    gr.Examples(
        examples=get_example(),
        fn=process_sonic,
        inputs=list(_callback_inputs),
        outputs=video_output,
        cache_examples=False,
    )

# ------------------------------------------------------------------
# Launch
# ------------------------------------------------------------------
# Start the Gradio server at import/run time. share=True additionally asks
# Gradio for a public share link.
# NOTE(review): confirm share=True is wanted in the hosting environment.
demo.launch(share=True)