Hasti11's picture
Update app.py
389c6e6 verified
import os
import numpy as np
from transformers import pipeline
import speech_recognition as sr
import gradio as gr
import cv2
from PIL import Image
import moviepy.editor as mp
from gtts import gTTS
from groq import Groq
import re
client = Groq(
api_key="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",
)
# Initialize pipelines
image_pipeline = pipeline("image-classification", model="trpakov/vit-face-expression", top_k=1)
audio_pipeline = pipeline("audio-classification", model="audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim")
text_pipeline = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions", top_k=2)
conversation_history = []
# max_history_length = 3
def process_input(video_stream, conversation_history):
if isinstance(video_stream, str):
video_file_path = video_stream
# Process video frames
image_features_list = []
audio_emotion = ""
text_input = ""
text_emotions = ""
cap = cv2.VideoCapture(video_file_path)
frame_count = 0
while True:
ret, frame = cap.read()
if not ret:
break
# Convert frame to PIL image
pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
# Analyze the image
try:
image_analysis = image_pipeline(pil_image)
if image_analysis:
image_features_list.append(image_analysis[0]['label'])
except Exception as e:
print(f"Error processing image data: {e}")
# Increment frame count
frame_count += 1
# Combine image features into a single string
image_features = ', '.join(image_features_list)
print("Image features:", image_features)
# Process audio data and get the emotion label
try:
# Extract audio from the video file
video_clip = mp.VideoFileClip(video_file_path)
audio_file_path = os.path.join("/tmp", "audio.wav")
video_clip.audio.write_audiofile(audio_file_path)
recognizer = sr.Recognizer()
with sr.AudioFile(audio_file_path) as source:
audio = recognizer.record(source)
# Convert audio data to numpy array
audio_data = np.frombuffer(audio.frame_data, dtype=np.int16)
audio_data = audio_data.astype(np.float32) # Convert to float32
audio_emotions = audio_pipeline(audio_data)
if audio_emotions:
audio_emotion = audio_emotions[0]['label']
print("Audio emotion:", audio_emotion)
# Recognize audio
text_input = recognizer.recognize_google(audio)
print("User said:", text_input)
except Exception as e:
print(f"Error processing audio data: {e}")
# Process text data and get the emotion label
text_emotions = ""
try:
# Initialize text_input in case it's not set
if not text_input:
text_input = ""
text_analysis = text_pipeline(text_input)
print("text analysis:", text_analysis)
if isinstance(text_analysis, list):
# Flatten the list of lists
text_analysis = [item for sublist in text_analysis for item in sublist]
# Initialize an empty list to store the text emotions
text_emotions_list = []
# Iterate through each item in the flattened list
for item in text_analysis:
# Ensure each item is a dictionary and contains the 'label' key
if isinstance(item, dict) and 'label' in item:
# Append the 'label' value to the text_emotions_list
text_emotions_list.append(item['label'])
# Check if text_emotions_list is empty
if text_emotions_list:
# Convert the text_emotions_list to a comma-separated string
text_emotions = ', '.join(text_emotions_list)
print("Text emotions:", text_emotions)
else:
text_emotions = "No significant emotions detected in the text."
except Exception as e:
print(f"Error processing text data: {e}")
if conversation_history is not None:
# conversation_history = conversation_history[-max_history_length:] # Keep most recent entries
conversation_history.append({
"user_input": text_input,
"image_features": image_features,
"audio_emotion": audio_emotion,
"text_emotions": text_emotions
})
else:
conversation_history = [{
"user_input": text_input,
"image_features": image_features,
"audio_emotion": audio_emotion,
"text_emotions": text_emotions
}]
prompt = "User said: " + text_input
if image_features:
prompt += "\nImage features: " + ', '.join(image_features)
if audio_emotion:
prompt += "\nAudio emotion: " + audio_emotion
if text_emotions:
prompt += "\nText emotions: " + text_emotions
# Get conversation history text
history_text = display_history(conversation_history)
chat_completion = client.chat.completions.create(
messages=[
{"role": "system",
"content": "As a mental health therapist, you're speaking to a user who is seeking guidance and support. They may be experiencing various challenges and are looking for solutions to improve their mental well-being. Your responses should be empathetic, supportive, and offer practical advice tailored to the user's specific issues. Remember to maintain a positive and non-judgmental tone throughout the interaction."
},
{"role": "user",
"content": prompt+history_text
},
{"role": "assistant",
"content": history_text
}
],
model="llama3-70b-8192",
temperature=0.5,
max_tokens=1024,
top_p=1,
stop=None,
stream=False,
)
ai_response = chat_completion.choices[0].message.content
conversation_history.append({"ai_response": ai_response})
print(ai_response)
# Convert AI response to audio
tts = gTTS(text=ai_response, lang='en')
audio_file_path = "/tmp/ai_response.wav"
tts.save(audio_file_path)
return ai_response,audio_file_path,conversation_history
def display_history(conversation_history):
history_str = ""
for i, turn in enumerate(conversation_history):
if "user_input" in turn:
history_str += f"User: {turn['user_input']}\n"
if "ai_response" in turn:
ai_response = turn['ai_response']
ai_response = re.sub(r'\*\*', '', ai_response)
history_str += f"Therapist: {turn['ai_response']}\n\n"
return history_str
# Create the Gradio interface
input_video = gr.Video( label="Your Video", include_audio=True)
output_text = gr.Textbox(label="Therapist Response")
output_audio = gr.Audio(autoplay=True, visible=False)
custom_css = """
gr.Interface .gradio-title{
text-align: center;
font-size: 24px;
font-weight: bold;
margin-left:123px;
}
gr.Interface .gradio-description {
text-align: center;
font-size: 16px;
margin-top: 10px;
}
"""
description = """
Speak to the AI through video input and get personalized responses from our mental health therapist. Whether you need guidance, support, or just someone to talk to, our AI is here to help you navigate life's challenges with empathy and understanding.
"""
iface = gr.Interface(fn=process_input, inputs=input_video, outputs=[output_text, output_audio], title="MindWave: Real-Time Mental Health Therapist through GenAI and Multimodal Interaction", description=description, theme=gr.themes.Default(primary_hue="teal", secondary_hue="cyan"), allow_flagging=False,css=custom_css)
iface.launch()