import os
import numpy as np
from transformers import pipeline
import speech_recognition as sr
import gradio as gr
import cv2
from PIL import Image
import moviepy.editor as mp
from gtts import gTTS
from groq import Groq
import re

# Read the Groq API key from the environment rather than hard-coding it.
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

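# Pretrained classifiers for the three modalities: facial expression from the
# video frames, vocal emotion from the audio track, and text emotion from the
# speech transcript.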
image_pipeline = pipeline("image-classification", model="trpakov/vit-face-expression", top_k=1)
audio_pipeline = pipeline("audio-classification", model="audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim")
text_pipeline = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions", top_k=2)

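# Illustrative outputs (scores made up): a single frame passed to image_pipeline
# might yield [{'label': 'happy', 'score': 0.97}], and text_pipeline returns the
# top two GoEmotions labels for the transcript in the same label/score format.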
conversation_history = []

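# Each processed turn appends a dict of the form {"user_input": ..., "image_features": ...,
# "audio_emotion": ..., "text_emotions": ...} to the history; the model's reply is then
# stored as a separate {"ai_response": ...} entry.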
def process_input(video_stream, conversation_history):
    # Gradio passes the recorded video as a file path string; fall back to a
    # .name attribute for file-like objects.
    if isinstance(video_stream, str):
        video_file_path = video_stream
    else:
        video_file_path = getattr(video_stream, "name", video_stream)

    image_features_list = []
    audio_emotion = ""
    text_input = ""
    text_emotions = ""

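    # Run facial-expression classification on every frame of the recorded video.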
    cap = cv2.VideoCapture(video_file_path)
    frame_count = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # OpenCV yields BGR frames; the image pipeline expects an RGB PIL image.
        pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        try:
            image_analysis = image_pipeline(pil_image)
            if image_analysis:
                image_features_list.append(image_analysis[0]['label'])
        except Exception as e:
            print(f"Error processing image data: {e}")

        frame_count += 1

    cap.release()

    image_features = ', '.join(image_features_list)
    print("Image features:", image_features)

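    # Extract the audio track from the video, classify its vocal emotion, and
    # transcribe the speech with the Google Web Speech API via speech_recognition.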
    try:
        video_clip = mp.VideoFileClip(video_file_path)
        audio_file_path = os.path.join("/tmp", "audio.wav")
        video_clip.audio.write_audiofile(audio_file_path)

        recognizer = sr.Recognizer()
        with sr.AudioFile(audio_file_path) as source:
            audio = recognizer.record(source)

        # Convert the 16-bit PCM samples to float32 in [-1, 1] and pass the
        # sampling rate along, as the wav2vec2-based pipeline expects.
        audio_data = np.frombuffer(audio.frame_data, dtype=np.int16)
        audio_data = audio_data.astype(np.float32) / 32768.0

        audio_emotions = audio_pipeline({"raw": audio_data, "sampling_rate": audio.sample_rate})
        if audio_emotions:
            audio_emotion = audio_emotions[0]['label']
            print("Audio emotion:", audio_emotion)

        text_input = recognizer.recognize_google(audio)
        print("User said:", text_input)
    except Exception as e:
        print(f"Error processing audio data: {e}")

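    # Classify the emotions expressed in the transcript with the GoEmotions text model.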
text_emotions = "" |
|
try: |
|
|
|
if not text_input: |
|
text_input = "" |
|
|
|
text_analysis = text_pipeline(text_input) |
|
print("text analysis:", text_analysis) |
|
|
|
if isinstance(text_analysis, list): |
|
|
|
text_analysis = [item for sublist in text_analysis for item in sublist] |
|
|
|
|
|
text_emotions_list = [] |
|
|
|
|
|
for item in text_analysis: |
|
|
|
if isinstance(item, dict) and 'label' in item: |
|
|
|
text_emotions_list.append(item['label']) |
|
|
|
|
|
if text_emotions_list: |
|
|
|
text_emotions = ', '.join(text_emotions_list) |
|
print("Text emotions:", text_emotions) |
|
else: |
|
text_emotions = "No significant emotions detected in the text." |
|
|
|
except Exception as e: |
|
print(f"Error processing text data: {e}") |
|
|
|
|
|
    if conversation_history is not None:
        conversation_history.append({
            "user_input": text_input,
            "image_features": image_features,
            "audio_emotion": audio_emotion,
            "text_emotions": text_emotions
        })
    else:
        conversation_history = [{
            "user_input": text_input,
            "image_features": image_features,
            "audio_emotion": audio_emotion,
            "text_emotions": text_emotions
        }]

prompt = "User said: " + text_input |
|
if image_features: |
|
prompt += "\nImage features: " + ', '.join(image_features) |
|
if audio_emotion: |
|
prompt += "\nAudio emotion: " + audio_emotion |
|
if text_emotions: |
|
prompt += "\nText emotions: " + text_emotions |
|
|
|
|
|
|
|
    history_text = display_history(conversation_history)

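    # The prompt passed below looks roughly like this (illustrative values):
    #   User said: I have been feeling overwhelmed lately
    #   Image features: sad, neutral, sad
    #   Audio emotion: <top label from the audio classifier>
    #   Text emotions: sadness, nervousness
    # with the formatted conversation history appended after it.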
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system",
             "content": "As a mental health therapist, you're speaking to a user who is seeking guidance and support. They may be experiencing various challenges and are looking for solutions to improve their mental well-being. Your responses should be empathetic, supportive, and offer practical advice tailored to the user's specific issues. Remember to maintain a positive and non-judgmental tone throughout the interaction."
             },
            {"role": "user",
             # The prior history is already included in the user content, so no
             # separate assistant message is needed.
             "content": prompt + history_text
             }
        ],
        model="llama3-70b-8192",
        temperature=0.5,
        max_tokens=1024,
        top_p=1,
        stop=None,
        stream=False,
    )

    ai_response = chat_completion.choices[0].message.content
    conversation_history.append({"ai_response": ai_response})
    print(ai_response)

    # gTTS writes MP3 data, so save the spoken reply with an .mp3 extension.
    tts = gTTS(text=ai_response, lang='en')
    audio_file_path = "/tmp/ai_response.mp3"
    tts.save(audio_file_path)

    return ai_response, audio_file_path, conversation_history

def display_history(conversation_history):
    history_str = ""
    for turn in conversation_history:
        if "user_input" in turn:
            history_str += f"User: {turn['user_input']}\n"
        if "ai_response" in turn:
            # Strip Markdown bold markers before showing the reply as plain text.
            ai_response = re.sub(r'\*\*', '', turn['ai_response'])
            history_str += f"Therapist: {ai_response}\n\n"
    return history_str

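# Quick local check without the UI (illustrative; the file path is hypothetical):
#   reply, audio_path, history = process_input("sample_video.mp4", [])
#   print(display_history(history))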
input_video = gr.Video(label="Your Video", include_audio=True)
output_text = gr.Textbox(label="Therapist Response")
output_audio = gr.Audio(autoplay=True, visible=False)
# Session state that carries conversation_history between turns; it is wired in
# as both an input and an output of process_input below.
history_state = gr.State([])

custom_css = """ |
|
gr.Interface .gradio-title{ |
|
text-align: center; |
|
font-size: 24px; |
|
font-weight: bold; |
|
margin-left:123px; |
|
} |
|
|
|
gr.Interface .gradio-description { |
|
text-align: center; |
|
font-size: 16px; |
|
margin-top: 10px; |
|
} |
|
""" |
|
|
|
description = """ |
|
Speak to the AI through video input and get personalized responses from our mental health therapist. Whether you need guidance, support, or just someone to talk to, our AI is here to help you navigate life's challenges with empathy and understanding. |
|
""" |
|
iface = gr.Interface(
    fn=process_input,
    inputs=[input_video, history_state],
    outputs=[output_text, output_audio, history_state],
    title="MindWave: Real-Time Mental Health Therapist through GenAI and Multimodal Interaction",
    description=description,
    theme=gr.themes.Default(primary_hue="teal", secondary_hue="cyan"),
    allow_flagging="never",
    css=custom_css,
)

iface.launch()