# my-emotion-app / app.py
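"""Multimodal emotion analysis demo (Gradio app).

Three tabs: Chinese text (keyword matching), voice (mean-MFCC features fed to a
RandomForest classifier), and facial expression (Hugging Face image-classification
pipeline).
"""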
import os

import gradio as gr
import joblib
import librosa
import numpy as np
from PIL import Image  # convert webcam numpy frames to PIL for the HF pipeline
from sklearn.ensemble import RandomForestClassifier
from transformers import pipeline
# -- 1) Train / load the voice emotion model --
BASE_VOICE_PATH = r"C:\情緒"  # local training data: one sub-folder of .wav files per emotion label
VOICE_MODEL_FILE = "voice_model.joblib"

def train_voice_model():
    labels = ["angry", "happy", "sad", "fear", "surprise"]
    X, y = [], []
    for lbl in labels:
        folder = os.path.join(BASE_VOICE_PATH, lbl)
        if not os.path.isdir(folder):
            raise FileNotFoundError(f"Folder not found: {folder}")
        for fname in os.listdir(folder):
            if fname.lower().endswith(".wav"):
                path = os.path.join(folder, fname)
                audio, sr = librosa.load(path, sr=None)
                mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
                # Average the 13 MFCCs over time to get a fixed-length feature vector.
                mfcc_mean = np.mean(mfccs.T, axis=0)
                X.append(mfcc_mean)
                y.append(lbl)
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X, y)
    joblib.dump(clf, VOICE_MODEL_FILE)
    return clf

if os.path.exists(VOICE_MODEL_FILE):
    voice_clf = joblib.load(VOICE_MODEL_FILE)
else:
    voice_clf = train_voice_model()
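
# Note: the training path above is a local Windows folder; in a deployed Space the
# pre-trained voice_model.joblib presumably ships with the repo, so train_voice_model()
# never has to run there.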

def analyze_audio(path):
    # Extract the same 13-dim mean-MFCC feature used at training time, then predict.
    audio, sr = librosa.load(path, sr=None)
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    mfcc_mean = np.mean(mfccs.T, axis=0).reshape(1, -1)
    return voice_clf.predict(mfcc_mean)[0]
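
# Example (hypothetical file): analyze_audio("clip.wav") returns one of the five
# trained labels, e.g. "happy".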

# -- 2) Chinese text emotion analysis (simple keyword matching) --
def analyze_text(text):
    if any(w in text for w in ["開心", "快樂"]): return "happy"
    if any(w in text for w in ["生氣", "憤怒"]): return "angry"
    if any(w in text for w in ["傷心", "難過", "哭"]): return "sad"
    if any(w in text for w in ["驚", "意外"]): return "surprise"
    if any(w in text for w in ["怕", "恐懼"]): return "fear"
    return "neutral"

# -- 3) Real-time facial emotion analysis (Hugging Face ferplus model) --
face_classifier = pipeline(
    "image-classification",
    model="nateraw/ferplus",
    device=-1,  # CPU
)

def analyze_face(img):
    # img arrives from Gradio as a numpy array (or a PIL image); the pipeline
    # expects a PIL image, so convert if necessary.
    if isinstance(img, np.ndarray):
        img = Image.fromarray(img)
    result = face_classifier(img, top_k=1)[0]  # [{"label": ..., "score": ...}]
    return result["label"]

# -- 4) Build the multi-tab Gradio interface --
with gr.Blocks() as demo:
    gr.Markdown("# Multimodal Emotion Analysis Demo")
    with gr.Tab("📝 Text"):
        txt = gr.Textbox(placeholder="Enter Chinese text…")
        btn_txt = gr.Button("Analyze text")
        out_txt = gr.Textbox()
        btn_txt.click(analyze_text, inputs=txt, outputs=out_txt)
    with gr.Tab("🎤 Voice"):
        aud = gr.Audio(type="filepath")
        btn_aud = gr.Button("Analyze voice")
        out_aud = gr.Textbox()
        btn_aud.click(analyze_audio, inputs=aud, outputs=out_aud)
    with gr.Tab("📷 Face"):
        img_cam = gr.Image(source="webcam")  # Gradio 3.x API; Gradio 4+ uses sources=["webcam"]
        btn_img = gr.Button("Analyze expression")
        out_img = gr.Textbox()
        btn_img.click(analyze_face, inputs=img_cam, outputs=out_img)

demo.launch()