import gradio as gr
print("Gradio version:", gr.__version__)
import os, time, re
import numpy as np
import joblib
import librosa
from huggingface_hub import hf_hub_download
from deepface import DeepFace
from transformers import pipeline
# 如果不手动用 AutoTokenizer/AutoModel，就不必 import AutoTokenizer, AutoModelForSequenceClassification

# --- 1. Load the SVM speech-emotion model ---
# The serialized classifier is downloaded from the Hugging Face Hub and
# loaded with joblib when this module is executed.
# NOTE(review): joblib.load unpickles the file, which can run arbitrary code;
# this is only safe while the "GCLing/emotion-svm-model" repo is trusted.
print("Downloading SVM model from Hugging Face Hub...")
model_path = hf_hub_download(repo_id="GCLing/emotion-svm-model", filename="svm_emotion_model.joblib")
print(f"SVM model downloaded to: {model_path}")
svm_model = joblib.load(model_path)
print("SVM model loaded.")

# --- 2. Text emotion analysis: keyword rules + zero-shot ---
# Zero-shot classifier; predict_text_mixed falls back to it when the keyword
# rules produce no match.
zero_shot = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli")
# English labels handed to the zero-shot classifier.
candidate_labels = ["joy", "sadness", "anger", "fear", "surprise", "disgust"]
# Chinese display names for the zero-shot labels above.
label_map_en2cn = {
    "joy": "高興", "sadness": "悲傷", "anger": "憤怒",
    "fear": "恐懼", "surprise": "驚訝", "disgust": "厭惡"
}
# Keyword lists per emotion; keyword_emotion matches them as plain substrings.
emo_keywords = {
    "happy": ["開心","快樂","愉快","喜悦","喜悅","歡喜","興奮","高興"],
    "angry": ["生氣","憤怒","不爽","發火","火大","氣憤"],
    "sad": ["傷心","難過","哭","難受","心酸","憂","悲","哀","痛苦","慘","愁"],
    "surprise": ["驚訝","意外","嚇","驚詫","詫異","訝異","好奇"],
    "fear": ["怕","恐懼","緊張","懼","膽怯","畏"]
}
# Negation words; keyword_emotion ignores a keyword hit that is immediately
# preceded by one of these.
negations = ["不","沒","沒有","別","勿","非"]

def keyword_emotion(text: str, *, keywords=None, negation_words=None):
    """Score emotions in *text* by counting non-negated keyword hits.

    A keyword counts once for its emotion when at least one of its
    occurrences in *text* is not immediately preceded by a negation word.
    Every occurrence is examined, so a negated first mention does not hide a
    later affirmed one.

    Args:
        text: Text to scan.
        keywords: Optional ``{emotion: [keyword, ...]}`` table; defaults to
            the module-level ``emo_keywords``.
        negation_words: Optional iterable of negation words; defaults to the
            module-level ``negations``.

    Returns:
        ``{emotion: share}`` for every emotion in the table, with shares
        summing to 1, or ``None`` when no keyword is matched (including
        empty text).
    """
    if keywords is None:
        keywords = emo_keywords
    if negation_words is None:
        negation_words = negations
    if not text:
        return None

    # str.endswith accepts a tuple, so one call tests every negation word.
    neg_suffixes = tuple(negation_words)

    def is_affirmed(word):
        """Return True if some occurrence of *word* is not negated."""
        idx = text.find(word)
        while idx != -1:
            # Negated when the text right before the hit ends with a negation.
            if not text.endswith(neg_suffixes, 0, idx):
                return True
            idx = text.find(word, idx + 1)
        return False

    counts = {
        emo: sum(1 for word in words if is_affirmed(word))
        for emo, words in keywords.items()
    }
    total = sum(counts.values())
    if total == 0:
        return None
    return {emo: hits / total for emo, hits in counts.items()}

def predict_text_mixed(text: str):
    """Classify the emotion of *text* with keyword rules, then zero-shot.

    The keyword rules are tried first; when they match, only the
    top-scoring emotion is returned. Otherwise the zero-shot classifier
    scores every candidate label.

    Args:
        text: User-entered text; may be empty or ``None``.

    Returns:
        A ``{label: score}`` dict with Traditional Chinese labels: ``{}``
        for blank input, a single entry from the keyword rules, the full
        zero-shot distribution, or ``{"中性": 1.0}`` if the zero-shot call
        fails.
    """
    if not text or not text.strip():
        return {}

    res = keyword_emotion(text)
    if res:
        top_emo = max(res, key=res.get)
        # Traditional Chinese, so both code paths emit the same label
        # spelling as label_map_en2cn and the rest of the UI.
        mapping = {
            "happy": "高興",
            "angry": "憤怒",
            "sad": "悲傷",
            "surprise": "驚訝",
            "fear": "恐懼",
        }
        return {mapping.get(top_emo, top_emo): res[top_emo]}

    try:
        out = zero_shot(text, candidate_labels=candidate_labels,
                        hypothesis_template="这句话表达了{}情绪")
        return {
            label_map_en2cn.get(lab.lower(), lab): float(sc)
            for lab, sc in zip(out["labels"], out["scores"])
        }
    except Exception as e:
        # Best-effort fallback: keep the UI responsive if the model call fails.
        print("zero-shot error:", e)
        return {"中性": 1.0}

# --- 3. 语音情绪预测函数 ---
def extract_feature(signal: np.ndarray, sr: int) -> np.ndarray:
    """Build the feature vector for an audio signal from its MFCCs.

    Computes 13 MFCCs with librosa and returns the mean of each coefficient
    along axis 1 followed by the variance of each coefficient along axis 1,
    joined into one array.
    """
    coeffs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13)
    coeff_means = np.mean(coeffs, axis=1)
    coeff_vars = np.var(coeffs, axis=1)
    return np.concatenate((coeff_means, coeff_vars))

def predict_voice(audio_path: str):
    """Predict the emotion distribution of a recorded audio file.

    Args:
        audio_path: Path to the recording; falsy when nothing was recorded.

    Returns:
        A ``{label: probability}`` dict with plain ``str`` keys and ``float``
        values, or ``{}`` when there is no recording or any step fails.
    """
    if not audio_path:
        print("predict_voice: 无 audio_path，跳过")
        return {}
    try:
        # sr=None keeps the file's own sampling rate.
        signal, sr = librosa.load(audio_path, sr=None)
        feat = extract_feature(signal, sr)
        probs = svm_model.predict_proba([feat])[0]
        # Cast keys and values to built-in types so the result serialises
        # cleanly whatever dtype the model's class labels have.
        return {
            str(label): float(prob)
            for label, prob in zip(svm_model.classes_, probs)
        }
    except Exception as e:
        # Best-effort: log the failure and show an empty result.
        print("predict_voice error:", e)
        return {}

# --- 4. 人脸情绪预测函数 ---
def predict_face(img: np.ndarray):
    """Estimate the emotion distribution of the face in a webcam frame.

    Args:
        img: Image array from the webcam component, or ``None`` when no
            frame is available.

    Returns:
        A ``{emotion: confidence}`` dict of plain floats normalised to sum
        to 1, or ``{}`` when there is no frame, DeepFace reports no emotion
        scores, or the analysis raises.
    """
    print("predict_face called, img is None?", img is None)
    if img is None:
        return {}
    try:
        res = DeepFace.analyze(img, actions=["emotion"], detector_backend="opencv")
        # The result may be a list of per-face dicts or a single dict;
        # use the first face when a list is returned.
        if isinstance(res, list):
            res = res[0] if res else {}
        emo = res.get("emotion", {}) if isinstance(res, dict) else {}
        # Plain floats keep the result JSON-serialisable.
        scores = {name: float(value) for name, value in emo.items()}
        # DeepFace reports percentages (0-100); rescale to 0-1 confidences
        # so the Label component does not display values like "3765%".
        total = sum(scores.values())
        if total > 0:
            scores = {name: value / total for name, value in scores.items()}
        print("predict_face result:", scores)
        return scores
    except Exception as e:
        # Best-effort: a failed frame yields an empty result instead of
        # breaking the stream.
        print("DeepFace.analyze error:", e)
        return {}

# --- 5. Gradio interface ---
# Three tabs, one per modality; each wires an input component to its
# prediction function and shows the result in a Label.
# NOTE(review): the `source=` keyword on gr.Image / gr.Audio is the Gradio 3.x
# spelling (Gradio 4 uses `sources=[...]`) — confirm against the installed version.
with gr.Blocks() as demo:
    gr.Markdown("## 多模態情緒分析示例")
    with gr.Tabs():
        # Facial-emotion tab: frames are streamed from the webcam to predict_face.
        with gr.TabItem("臉部情緒"):
            gr.Markdown("### 臉部情緒 (即時 Webcam Streaming 分析)")
            with gr.Row():
                webcam = gr.Image(source="webcam", streaming=True, type="numpy", label="攝像頭畫面")
                face_out = gr.Label(label="情緒分布")
            webcam.stream(fn=predict_face, inputs=webcam, outputs=face_out)
        # Voice-emotion tab: predict_voice runs whenever the recording changes.
        with gr.TabItem("語音情緒"):
            gr.Markdown("### 語音情緒 分析")
            with gr.Row():
                audio = gr.Audio(source="microphone", streaming=False, type="filepath", label="錄音")
                voice_out = gr.Label(label="語音情緒結果")
            audio.change(fn=predict_voice, inputs=audio, outputs=voice_out)
        # Text-emotion tab: predict_text_mixed runs when the textbox is submitted.
        with gr.TabItem("文字情緒"):
            gr.Markdown("### 文字情緒 分析 (规则+zero-shot)")
            with gr.Row():
                text = gr.Textbox(lines=3, placeholder="請輸入中文文字…")
                text_out = gr.Label(label="文字情緒結果")
            text.submit(fn=predict_text_mixed, inputs=text, outputs=text_out)

# Start the app only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()