Spaces:

reab5555
/

Multimodal-Behavioral-Anomalies-Detection

Runtime error

App Files Files Community

reab5555 committed on Jul 22, 2024

Commit

16178c1

verified ·

1 Parent(s): 5b70288

Update app.py

Browse files

Files changed (1) hide show

app.py +357 -267

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import os
 import cv2
 import numpy as np
@@ -5,38 +6,35 @@ import torch
 import torch.nn as nn
 import torch.optim as optim
 from facenet_pytorch import InceptionResnetV1, MTCNN
 import mediapipe as mp
 from fer import FER
 from sklearn.cluster import DBSCAN
-from sklearn.preprocessing import MinMaxScaler
-from sklearn.decomposition import PCA
-import umap
 import pandas as pd
 import matplotlib
 import matplotlib.pyplot as plt
 from moviepy.editor import VideoFileClip
 from PIL import Image
 import gradio as gr
 import tempfile
 import shutil
-import tensorflow as tf
-print(torch.__version__)
-print(torch.version.cuda)
 matplotlib.rcParams['figure.dpi'] = 500
 matplotlib.rcParams['savefig.dpi'] = 500
 # Initialize models and other global variables
-device = 'cuda'
-mtcnn = MTCNN(keep_all=False, device=device, thresholds=[0.985, 0.985, 0.985], min_face_size=80)
 model = InceptionResnetV1(pretrained='vggface2').eval().to(device)
 mp_face_mesh = mp.solutions.face_mesh
-face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1, min_detection_confidence=0.8)
 emotion_detector = FER(mtcnn=False)
 def frame_to_timecode(frame_num, total_frames, duration):
     total_seconds = (frame_num / total_frames) * duration
     hours = int(total_seconds // 3600)
@@ -45,6 +43,15 @@ def frame_to_timecode(frame_num, total_frames, duration):
     milliseconds = int((total_seconds - int(total_seconds)) * 1000)
     return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
 def get_face_embedding_and_emotion(face_img):
     face_tensor = torch.tensor(face_img).permute(2, 0, 1).unsqueeze(0).float() / 255
@@ -57,11 +64,10 @@ def get_face_embedding_and_emotion(face_img):
     if emotions:
         emotion_dict = emotions[0]['emotions']
     else:
-        emotion_dict = {e: 0 for e in ['angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral']}
     return embedding.cpu().numpy().flatten(), emotion_dict
 def alignFace(img):
     img_raw = img.copy()
     results = face_mesh.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
@@ -87,7 +93,6 @@ def alignFace(img):
     new_img = cv2.warpAffine(img_raw, rotation_matrix, (width, height))
     return new_img
 def extract_frames(video_path, output_folder, desired_fps, progress_callback=None):
     os.makedirs(output_folder, exist_ok=True)
     clip = VideoFileClip(video_path)
@@ -111,6 +116,19 @@ def extract_frames(video_path, output_folder, desired_fps, progress_callback=Non
     clip.close()
     return frame_count, original_fps
 def process_frames(frames_folder, aligned_faces_folder, frame_count, progress, batch_size):
     embeddings_by_frame = {}
@@ -140,29 +158,29 @@ def process_frames(frames_folder, aligned_faces_folder, frame_count, progress, b
                     x1, y1, x2, y2 = [int(b) for b in boxes[0]]
                     face = frame[y1:y2, x1:x2]
                     if face.size > 0:
-                        aligned_face = alignFace(face)
-                        if aligned_face is not None:
-                            aligned_face_resized = cv2.resize(aligned_face, (160, 160))
-                            output_path = os.path.join(aligned_faces_folder, f"frame_{frame_num}_face.jpg")
-                            cv2.imwrite(output_path, aligned_face_resized)
-                            aligned_face_paths.append(output_path)
-                            embedding, emotion = get_face_embedding_and_emotion(aligned_face_resized)
-                            embeddings_by_frame[frame_num] = embedding
-                            emotions_by_frame[frame_num] = emotion
-        progress((i + len(batch_files)) / frame_count,
-                 f"Processing frames {i + 1} to {min(i + len(batch_files), frame_count)} of {frame_count}")
     return embeddings_by_frame, emotions_by_frame, aligned_face_paths
 def cluster_faces(embeddings):
     if len(embeddings) < 2:
         print("Not enough faces for clustering. Assigning all to one cluster.")
         return np.zeros(len(embeddings), dtype=int)
     X = np.stack(embeddings)
     dbscan = DBSCAN(eps=0.5, min_samples=5, metric='cosine')
     clusters = dbscan.fit_predict(X)
@@ -172,7 +190,6 @@ def cluster_faces(embeddings):
     return clusters
 def organize_faces_by_person(embeddings_by_frame, clusters, aligned_faces_folder, organized_faces_folder):
     for (frame_num, embedding), cluster in zip(embeddings_by_frame.items(), clusters):
         person_folder = os.path.join(organized_faces_folder, f"person_{cluster}")
@@ -181,39 +198,11 @@ def organize_faces_by_person(embeddings_by_frame, clusters, aligned_faces_folder
         dst = os.path.join(person_folder, f"frame_{frame_num}_face.jpg")
         shutil.copy(src, dst)
-def find_optimal_components(embeddings, max_components=20):
-    pca = PCA(n_components=max_components)
-    pca.fit(embeddings)
-    explained_variance_ratio = pca.explained_variance_ratio_
-    cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
-    # Plot explained variance ratio
-    plt.figure(figsize=(10, 6))
-    plt.plot(range(1, max_components + 1), cumulative_variance_ratio, 'bo-')
-    plt.xlabel('Number of Components')
-    plt.ylabel('Cumulative Explained Variance Ratio')
-    plt.title('Explained Variance Ratio vs. Number of Components')
-    plt.grid(True)
-    # Find elbow point
-    differences = np.diff(cumulative_variance_ratio)
-    elbow_point = np.argmin(differences) + 1
-    plt.axvline(x=elbow_point, color='r', linestyle='--', label=f'Elbow point: {elbow_point}')
-    plt.legend()
-    return elbow_point, plt
-def save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, desired_fps, original_fps, output_folder,
-                            video_duration):
-    emotions = ['angry', 'disgust', 'fear', 'happy', 'sad', 'surprise', 'neutral']
     person_data = {}
-    for (frame_num, embedding), (_, emotion_dict), cluster in zip(embeddings_by_frame.items(),
-                                                                  emotions_by_frame.items(), clusters):
         if cluster not in person_data:
             person_data[cluster] = []
         person_data[cluster].append((frame_num, embedding, {e: emotion_dict[e] for e in emotions}))
@@ -227,33 +216,18 @@ def save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, de
     embeddings_array = np.array(embeddings)
     np.save(os.path.join(output_folder, 'face_embeddings.npy'), embeddings_array)
-    # Find optimal number of components
-    optimal_components, _ = find_optimal_components(embeddings_array)
-    reducer = umap.UMAP(n_components=optimal_components, random_state=1)
-    embeddings_reduced = reducer.fit_transform(embeddings)
-    scaler = MinMaxScaler(feature_range=(0, 1))
-    embeddings_reduced_normalized = scaler.fit_transform(embeddings_reduced)
     total_frames = max(frames)
     timecodes = [frame_to_timecode(frame, total_frames, video_duration) for frame in frames]
-    times_in_minutes = [frame / total_frames * video_duration / 60 for frame in frames]
     df_data = {
         'Frame': frames,
         'Timecode': timecodes,
-        'Time (Minutes)': times_in_minutes,
         'Embedding_Index': range(len(embeddings))
     }
-    # Add raw embeddings
     for i in range(len(embeddings[0])):
         df_data[f'Raw_Embedding_{i}'] = [embedding[i] for embedding in embeddings]
-    for i in range(optimal_components):
-        df_data[f'Comp {i + 1}'] = embeddings_reduced_normalized[:, i]
     for emotion in emotions:
         df_data[emotion] = [e[emotion] for e in emotions_data]
@@ -261,123 +235,139 @@ def save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, de
     return df, largest_cluster
-class LSTMAutoencoder(nn.Module):
-    def __init__(self, input_size, hidden_size=128, num_layers=2):
-        super(LSTMAutoencoder, self).__init__()
-        self.input_size = input_size
-        self.hidden_size = hidden_size
-        self.num_layers = num_layers
-        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
-        self.fc = nn.Linear(hidden_size, input_size)
     def forward(self, x):
-        outputs, (hidden, _) = self.lstm(x)
-        out = self.fc(outputs)
-        return out
-def lstm_anomaly_detection(X, feature_columns, raw_embedding_columns, epochs=100, batch_size=8):
-    device = 'cuda'
-    X = torch.FloatTensor(X).to(device)
-    if X.dim() == 2:
-        X = X.unsqueeze(0)
-    elif X.dim() == 1:
-        X = X.unsqueeze(0).unsqueeze(2)
-    print(f"X shape after reshaping: {X.shape}")
-    model = LSTMAutoencoder(input_size=X.shape[2]).to(device)
     criterion = nn.MSELoss()
-    optimizer = optim.Adam(model.parameters())
     for epoch in range(epochs):
-        model.train()
-        optimizer.zero_grad()
-        output = model(X)
-        loss = criterion(output, X)
-        loss.backward()
-        optimizer.step()
-        if epoch % 10 == 0:
-            print(f"Epoch [{epoch}/{epochs}], Loss: {loss.item():.4f}")
-    model.eval()
     with torch.no_grad():
-        reconstructed = model(X).squeeze(0).cpu().numpy()
-    mse_all = np.mean(np.power(X.squeeze(0).cpu().numpy() - reconstructed, 2), axis=1)
-    component_columns = [col for col in feature_columns if col.startswith('Comp')]
-    component_indices = [feature_columns.index(col) for col in component_columns]
-    if len(component_indices) > 0:
-        mse_comp = np.mean(
-            np.power(X.squeeze(0).cpu().numpy()[:, component_indices] - reconstructed[:, component_indices], 2), axis=1)
-    else:
-        mse_comp = mse_all
-    raw_embedding_indices = [feature_columns.index(col) for col in raw_embedding_columns]
-    mse_raw = np.mean(np.power(X.squeeze(0).cpu().numpy()[:, raw_embedding_indices] - reconstructed[:, raw_embedding_indices], 2), axis=1)
-    return mse_all, mse_comp, mse_raw
-def embedding_anomaly_detection(embeddings, epochs=100, batch_size=8):
-    device = 'cuda'
-    X = torch.FloatTensor(embeddings).to(device)
-    if X.dim() == 2:
-        X = X.unsqueeze(0)
-    elif X.dim() == 1:
-        X = X.unsqueeze(0).unsqueeze(2)
-    model = LSTMAutoencoder(input_size=X.shape[2]).to(device)
-    criterion = nn.MSELoss()
-    optimizer = optim.Adam(model.parameters())
-    for epoch in range(epochs):
-        model.train()
-        optimizer.zero_grad()
-        output = model(X)
-        loss = criterion(output, X)
-        loss.backward()
-        optimizer.step()
-    model.eval()
-    with torch.no_grad():
-        reconstructed = model(X).squeeze(0).cpu().numpy()
-    mse = np.mean(np.power(X.squeeze(0).cpu().numpy() - reconstructed, 2), axis=1)
-    return mse
-def determine_anomalies(mse_values, threshold=5):
-    mean = np.mean(mse_values)
-    std = np.std(mse_values)
-    anomalies = mse_values > (mean + threshold * std)
-    return anomalies
-def plot_mse(df, mse_values, title, color='blue', time_threshold=1, hide_first_n=2):
-    plt.figure(figsize=(16, 8), dpi=300)
-    fig, ax = plt.subplots(figsize=(16, 8))
-    df['Seconds'] = df['Timecode'].apply(
-        lambda x: sum(float(t) * 60 ** i for i, t in enumerate(reversed(x.split(':')))))
-    # Plot all points
-    ax.scatter(df['Seconds'], mse_values, color=color, alpha=0.7, s=10)
-    # Determine anomalies
-    anomalies = determine_anomalies(mse_values)
-    # Hide the first n anomalies
-    visible_anomalies = np.where(anomalies)[0][hide_first_n:]
-    ax.scatter(df['Seconds'].iloc[visible_anomalies], mse_values[visible_anomalies], color='red', s=50, zorder=5)
-    # Group closely occurring anomalies and annotate only the highest MSE
-    anomaly_data = list(zip(df['Timecode'].iloc[visible_anomalies],
-                            df['Seconds'].iloc[visible_anomalies],
-                            mse_values[visible_anomalies]))
-    anomaly_data.sort(key=lambda x: x[1])  # Sort by seconds
     grouped_anomalies = []
     current_group = []
@@ -390,38 +380,116 @@ def plot_mse(df, mse_values, title, color='blue', time_threshold=1, hide_first_n
     if current_group:
         grouped_anomalies.append(current_group)
     for group in grouped_anomalies:
         highest_mse_anomaly = max(group, key=lambda x: x[2])
         timecode, sec, mse = highest_mse_anomaly
         ax.annotate(timecode, (sec, mse), textcoords="offset points", xytext=(0, 10),
-                    ha='center', fontsize=8, color='red')
-    # Add baseline (mean MSE) line
-    mean_mse = np.mean(mse_values)
-    ax.axhline(y=mean_mse, color='black', linestyle='--', linewidth=1)
-    ax.text(df['Seconds'].max(), mean_mse, f'Baseline ({mean_mse:.6f})',
-            verticalalignment='bottom', horizontalalignment='right', color='black', fontsize=8)
-    # Set x-axis labels to timecodes
     max_seconds = df['Seconds'].max()
     num_ticks = 100
     tick_locations = np.linspace(0, max_seconds, num_ticks)
-    tick_labels = [frame_to_timecode(int(s * df['Frame'].max() / max_seconds), df['Frame'].max(), max_seconds)
-                   for s in tick_locations]
     ax.set_xticks(tick_locations)
     ax.set_xticklabels(tick_labels, rotation=90, ha='center', fontsize=6)
-    ax.set_xlabel('Time')
     ax.set_ylabel('Mean Squared Error')
     ax.set_title(title)
     ax.grid(True, linestyle='--', alpha=0.7)
     plt.tight_layout()
     plt.close()
     return fig
-def get_all_face_samples(organized_faces_folder, output_folder, largest_cluster):
     face_samples = {"most_frequent": [], "others": []}
     for cluster_folder in sorted(os.listdir(organized_faces_folder)):
         if cluster_folder.startswith("person_"):
@@ -430,7 +498,7 @@ def get_all_face_samples(organized_faces_folder, output_folder, largest_cluster)
             if face_files:
                 cluster_id = int(cluster_folder.split('_')[1])
                 if cluster_id == largest_cluster:
-                    for i, sample in enumerate(face_files):
                         face_path = os.path.join(person_folder, sample)
                         output_path = os.path.join(output_folder, f"face_sample_most_frequent_{i:04d}.jpg")
                         face_img = cv2.imread(face_path)
@@ -438,27 +506,28 @@ def get_all_face_samples(organized_faces_folder, output_folder, largest_cluster)
                             small_face = cv2.resize(face_img, (160, 160))
                             cv2.imwrite(output_path, small_face)
                             face_samples["most_frequent"].append(output_path)
                 else:
-                    for i, sample in enumerate(face_files):
-                        face_path = os.path.join(person_folder, sample)
-                        output_path = os.path.join(output_folder, f"face_sample_other_{cluster_id:02d}_{i:04d}.jpg")
-                        face_img = cv2.imread(face_path)
-                        if face_img is not None:
-                            small_face = cv2.resize(face_img, (160, 160))
-                            cv2.imwrite(output_path, small_face)
-                            face_samples["others"].append(output_path)
     return face_samples
-def process_video(video_path, desired_fps, batch_size, progress=gr.Progress()):
     output_folder = "output"
     os.makedirs(output_folder, exist_ok=True)
-    # Initialize plot variables
-    mse_plot_all = None
-    mse_plot_comp = None
-    mse_plot_raw = None
-    emotion_plots = [None] * 6  # For the 6 emotions
-    face_samples = {"most_frequent": [], "others": []}
     with tempfile.TemporaryDirectory() as temp_dir:
         aligned_faces_folder = os.path.join(temp_dir, 'aligned_faces')
@@ -485,13 +554,12 @@ def process_video(video_path, desired_fps, batch_size, progress=gr.Progress()):
                                                                                     progress, batch_size)
         if not aligned_face_paths:
-            return ("No faces were extracted from the video.",
-                    None, None, None, None, None, None, None, None, None, [], [])
         progress(0.6, "Clustering faces")
         embeddings = [embedding for _, embedding in embeddings_by_frame.items()]
         clusters = cluster_faces(embeddings)
-        num_clusters = len(set(clusters))  # Get the number of unique clusters
         progress(0.7, "Organizing faces")
         organize_faces_by_person(embeddings_by_frame, clusters, aligned_faces_folder, organized_faces_folder)
@@ -500,35 +568,42 @@ def process_video(video_path, desired_fps, batch_size, progress=gr.Progress()):
         df, largest_cluster = save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, desired_fps,
                                                       original_fps, temp_dir, video_duration)
         progress(0.85, "Getting face samples")
         face_samples = get_all_face_samples(organized_faces_folder, output_folder, largest_cluster)
         progress(0.9, "Performing anomaly detection")
-        feature_columns = [col for col in df.columns if
-                           col not in ['Frame', 'Timecode', 'Time (Minutes)', 'Embedding_Index']]
-        raw_embedding_columns = [col for col in df.columns if col.startswith('Raw_Embedding_')]
-        X = df[feature_columns].values
         try:
-            mse_all, mse_comp, mse_raw = lstm_anomaly_detection(
-                X, feature_columns, raw_embedding_columns, batch_size=batch_size)
             progress(0.95, "Generating plots")
-            mse_plot_all = plot_mse(df, mse_all, "Facial Features + Emotions", color='blue', hide_first_n=2)
-            mse_plot_comp = plot_mse(df, mse_comp, "Facial Features", color='deepskyblue', hide_first_n=2)
-            mse_plot_raw = plot_mse(df, mse_raw, "Facial Embeddings", color='steelblue', hide_first_n=2)
-            emotion_plots = [
-                plot_mse(df, embedding_anomaly_detection(df[emotion].values.reshape(-1, 1)),
-                         f"MSE: {emotion.capitalize()}", color=color, hide_first_n=5)
-                for emotion, color in zip(['fear', 'sad', 'angry', 'happy', 'surprise', 'neutral'],
-                                          ['purple', 'green', 'orange', 'darkblue', 'gold', 'grey'])
-            ]
         except Exception as e:
             print(f"Error details: {str(e)}")
-            return (f"Error in anomaly detection: {str(e)}",
-                    None, None, None, None, None, None, None, None, None, [], [])
         progress(1.0, "Preparing results")
         results = f"Number of persons/clusters detected: {num_clusters}\n\n"
@@ -536,58 +611,73 @@ def process_video(video_path, desired_fps, batch_size, progress=gr.Progress()):
         for cluster_id in range(num_clusters):
             results += f"Person/Cluster {cluster_id + 1}: {len([c for c in clusters if c == cluster_id])} frames\n"
         return (
             results,
-            mse_plot_all,
-            mse_plot_comp,
-            mse_plot_raw,
             *emotion_plots,
             face_samples["most_frequent"],
-            face_samples["others"]
         )
-# Define gallery outputs
-gallery_outputs = [
-    gr.Gallery(label="Most Frequent Person Random Samples", columns=10, rows=2, height="auto"),
-    gr.Gallery(label="Other Persons Random Samples", columns=10, rows=1, height="auto")
-]
-# Update the Gradio interface
-iface = gr.Interface(
-    fn=process_video,
-    inputs=[
-        gr.Video(),
-        gr.Slider(minimum=1, maximum=20, step=1, value=10, label="Desired FPS"),
-        gr.Slider(minimum=1, maximum=32, step=1, value=10, label="Batch Size")
-    ],
-    outputs=[
-        gr.Textbox(label="Anomaly Detection Results"),
-        gr.Plot(label="MSE: Facial Features + Emotions"),
-        gr.Plot(label="MSE: Facial Features"),
-        gr.Plot(label="MSE: Facial Embeddings"),
-        gr.Plot(label="MSE: Fear"),
-        gr.Plot(label="MSE: Sad"),
-        gr.Plot(label="MSE: Angry"),
-        gr.Plot(label="MSE: Happy"),
-        gr.Plot(label="MSE: Surprise"),
-        gr.Plot(label="MSE: Neutral"),
-    ] + gallery_outputs,
-    title="Facial Expressions Anomaly Detection",
-    description="""
-        This application detects anomalies in facial expressions and emotions from a video input.
-        It identifies distinct persons in the video and provides sample faces for each, with multiple samples for the most frequent person.
-        The graphs show Mean Squared Error (MSE) values for different aspects of facial expressions and emotions over time.
-        Each point represents a frame, with red points indicating detected anomalies.
-        Anomalies are annotated with their corresponding timecodes.
-        Higher MSE values indicate more unusual or anomalous expressions or emotions at that point in the video.
-        Adjust the parameters as needed:
-        - Desired FPS: Frames per second to analyze (lower for faster processing)
-        - Batch Size: Affects processing speed and GPU memory usage
-        """,
-    allow_flagging="never"
-)
-# Launch the interface
-iface.launch()

+import math
 import os
 import cv2
 import numpy as np
 import torch.nn as nn
 import torch.optim as optim
 from facenet_pytorch import InceptionResnetV1, MTCNN
+import tensorflow as tf
 import mediapipe as mp
 from fer import FER
 from sklearn.cluster import DBSCAN
+from sklearn.preprocessing import StandardScaler, MinMaxScaler
 import pandas as pd
 import matplotlib
 import matplotlib.pyplot as plt
+from matplotlib.patches import Rectangle
 from moviepy.editor import VideoFileClip
 from PIL import Image
 import gradio as gr
 import tempfile
 import shutil
+import copy
+import time
 matplotlib.rcParams['figure.dpi'] = 500
 matplotlib.rcParams['savefig.dpi'] = 500
 # Initialize models and other global variables
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+mtcnn = MTCNN(keep_all=False, device=device, thresholds=[0.95, 0.95, 0.95], min_face_size=80)
 model = InceptionResnetV1(pretrained='vggface2').eval().to(device)
 mp_face_mesh = mp.solutions.face_mesh
+face_mesh = mp_face_mesh.FaceMesh(static_image_mode=False, max_num_faces=1, min_detection_confidence=0.5)
 emotion_detector = FER(mtcnn=False)
 def frame_to_timecode(frame_num, total_frames, duration):
     total_seconds = (frame_num / total_frames) * duration
     hours = int(total_seconds // 3600)
     milliseconds = int((total_seconds - int(total_seconds)) * 1000)
     return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{milliseconds:03d}"
def seconds_to_timecode(seconds):
    """Format a duration given in seconds as an ``HH:MM:SS`` string.

    Args:
        seconds: Duration in seconds (int or float).

    Returns:
        A string with zero-padded hours, minutes and seconds. Any fractional
        part of a second is truncated, not rounded.
    """
    # divmod keeps the input untouched instead of rebinding the parameter
    # halfway through the computation.
    hours, remainder = divmod(seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{int(hours):02d}:{int(minutes):02d}:{int(secs):02d}"
def timecode_to_seconds(timecode):
    """Convert an ``HH:MM:SS`` or ``HH:MM:SS.mmm`` timecode to seconds.

    Args:
        timecode: String with exactly three ``:``-separated fields.

    Returns:
        An ``int`` when the seconds field is whole, otherwise a ``float``
        that includes the fractional part.

    Raises:
        ValueError: If the string does not have three fields or a field is
            not numeric.
    """
    hours, minutes, seconds = timecode.split(':')
    whole = int(hours) * 3600 + int(minutes) * 60
    # frame_to_timecode emits millisecond timecodes ("HH:MM:SS.mmm"); parsing
    # the last field with int() would raise on those.
    if '.' in seconds:
        return whole + float(seconds)
    return whole + int(seconds)
 def get_face_embedding_and_emotion(face_img):
     face_tensor = torch.tensor(face_img).permute(2, 0, 1).unsqueeze(0).float() / 255
     if emotions:
         emotion_dict = emotions[0]['emotions']
     else:
+        emotion_dict = {e: 0 for e in ['angry', 'disgust', 'fear', 'sad', 'happy']}
     return embedding.cpu().numpy().flatten(), emotion_dict
 def alignFace(img):
     img_raw = img.copy()
     results = face_mesh.process(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
     new_img = cv2.warpAffine(img_raw, rotation_matrix, (width, height))
     return new_img
 def extract_frames(video_path, output_folder, desired_fps, progress_callback=None):
     os.makedirs(output_folder, exist_ok=True)
     clip = VideoFileClip(video_path)
     clip.close()
     return frame_count, original_fps
def is_frontal_face(landmarks, threshold=40):
    """Decide whether a face looks frontal from three 2-D landmarks.

    The angle at landmark 4 between the directions to landmarks 234 and 454
    is measured in the x/y plane; the face counts as frontal when that angle
    is within ``threshold`` degrees of 180.

    Args:
        landmarks: Indexable collection whose items expose ``.x`` and ``.y``;
            indices 4, 234 and 454 are read. (Presumably MediaPipe FaceMesh
            landmarks, as passed by the caller -- confirm index meaning.)
        threshold: Allowed deviation from a straight angle, in degrees.

    Returns:
        ``True`` if the angle deviates from 180 degrees by less than
        ``threshold``; ``False`` otherwise, including the degenerate case
        where either side landmark coincides with landmark 4.
    """
    nose_tip = landmarks[4]
    left_side = landmarks[234]
    right_side = landmarks[454]

    to_left = (left_side.x - nose_tip.x, left_side.y - nose_tip.y)
    to_right = (right_side.x - nose_tip.x, right_side.y - nose_tip.y)

    norm_product = math.hypot(*to_left) * math.hypot(*to_right)
    if norm_product == 0:
        # A zero-length vector has no direction; avoid dividing by zero.
        return False

    cos_angle = (to_left[0] * to_right[0] + to_left[1] * to_right[1]) / norm_product
    # Rounding can push the ratio marginally outside [-1, 1], which would make
    # math.acos raise ValueError for (near-)collinear points.
    cos_angle = max(-1.0, min(1.0, cos_angle))

    angle_degrees = math.degrees(math.acos(cos_angle))
    return abs(180 - angle_degrees) < threshold
 def process_frames(frames_folder, aligned_faces_folder, frame_count, progress, batch_size):
     embeddings_by_frame = {}
                     x1, y1, x2, y2 = [int(b) for b in boxes[0]]
                     face = frame[y1:y2, x1:x2]
                     if face.size > 0:
+                        results = face_mesh.process(cv2.cvtColor(face, cv2.COLOR_BGR2RGB))
+                        if results.multi_face_landmarks and is_frontal_face(results.multi_face_landmarks[0].landmark):
+                            aligned_face = alignFace(face)
+                            if aligned_face is not None:
+                                aligned_face_resized = cv2.resize(aligned_face, (160, 160))
+                                output_path = os.path.join(aligned_faces_folder, f"frame_{frame_num}_face.jpg")
+                                cv2.imwrite(output_path, aligned_face_resized)
+                                aligned_face_paths.append(output_path)
+                                embedding, emotion = get_face_embedding_and_emotion(aligned_face_resized)
+                                embeddings_by_frame[frame_num] = embedding
+                                emotions_by_frame[frame_num] = emotion
+        progress((i + len(batch_files)) / len(frame_files),
+                 f"Processing frames {i + 1} to {min(i + len(batch_files), len(frame_files))} of {len(frame_files)}")
     return embeddings_by_frame, emotions_by_frame, aligned_face_paths
 def cluster_faces(embeddings):
     if len(embeddings) < 2:
         print("Not enough faces for clustering. Assigning all to one cluster.")
         return np.zeros(len(embeddings), dtype=int)
     X = np.stack(embeddings)
     dbscan = DBSCAN(eps=0.5, min_samples=5, metric='cosine')
     clusters = dbscan.fit_predict(X)
     return clusters
 def organize_faces_by_person(embeddings_by_frame, clusters, aligned_faces_folder, organized_faces_folder):
     for (frame_num, embedding), cluster in zip(embeddings_by_frame.items(), clusters):
         person_folder = os.path.join(organized_faces_folder, f"person_{cluster}")
         dst = os.path.join(person_folder, f"frame_{frame_num}_face.jpg")
         shutil.copy(src, dst)
+def save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, desired_fps, original_fps, output_folder, video_duration):
+    emotions = ['angry', 'disgust', 'fear', 'sad', 'happy']
     person_data = {}
+    for (frame_num, embedding), (_, emotion_dict), cluster in zip(embeddings_by_frame.items(), emotions_by_frame.items(), clusters):
         if cluster not in person_data:
             person_data[cluster] = []
         person_data[cluster].append((frame_num, embedding, {e: emotion_dict[e] for e in emotions}))
     embeddings_array = np.array(embeddings)
     np.save(os.path.join(output_folder, 'face_embeddings.npy'), embeddings_array)
     total_frames = max(frames)
     timecodes = [frame_to_timecode(frame, total_frames, video_duration) for frame in frames]
     df_data = {
         'Frame': frames,
         'Timecode': timecodes,
         'Embedding_Index': range(len(embeddings))
     }
     for i in range(len(embeddings[0])):
         df_data[f'Raw_Embedding_{i}'] = [embedding[i] for embedding in embeddings]
     for emotion in emotions:
         df_data[emotion] = [e[emotion] for e in emotions_data]
     return df, largest_cluster
class Autoencoder(nn.Module):
    """Fully connected autoencoder applied to every time step independently.

    Layer widths are ``input_size -> 512 -> 256 -> 128 -> 64`` for the encoder
    and the mirror image for the decoder, with ReLU between linear layers and
    no activation on either final layer.
    """

    def __init__(self, input_size):
        """Build the encoder/decoder stacks for ``input_size`` features."""
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
        )
        self.decoder = nn.Sequential(
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, input_size),
        )

    def forward(self, x):
        """Reconstruct ``x``.

        Args:
            x: 3-D tensor; the first two axes are flattened together before
                the linear layers and restored afterwards.

        Returns:
            Tensor with the same first two dimensions as ``x``.
        """
        batch_size, seq_len, _ = x.size()
        # reshape (not view) so non-contiguous inputs, e.g. transposed or
        # sliced tensors, are accepted instead of raising RuntimeError.
        flat = x.reshape(batch_size * seq_len, -1)
        decoded = self.decoder(self.encoder(flat))
        return decoded.view(batch_size, seq_len, -1)
def determine_anomalies(mse_values, threshold):
    """Flag values lying more than ``threshold`` standard deviations above the mean.

    Args:
        mse_values: Array-like of reconstruction errors.
        threshold: Number of standard deviations above the mean that a value
            must exceed to be flagged.

    Returns:
        Boolean NumPy array with the same shape as ``mse_values``; ``True``
        marks an anomaly. Empty input yields an empty boolean array.
    """
    values = np.asarray(mse_values)
    if values.size == 0:
        # mean/std of an empty array emit RuntimeWarnings and give a NaN
        # threshold; there is nothing to flag anyway.
        return np.zeros(values.shape, dtype=bool)
    cutoff = np.mean(values) + threshold * np.std(values)
    return values > cutoff
def _autoencoder_reconstruction_mse(features, epochs, device):
    """Fit a fresh Autoencoder on ``features`` and return its per-row MSE."""
    data = torch.FloatTensor(features).to(device)
    if data.dim() == 2:
        # Autoencoder.forward unpacks three dimensions; add a leading one.
        data = data.unsqueeze(0)

    net = Autoencoder(input_size=data.shape[2]).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(net.parameters())

    # Full-batch training: every epoch is one optimizer step on all rows.
    net.train()
    for _ in range(epochs):
        optimizer.zero_grad()
        loss = criterion(net(data), data)
        loss.backward()
        optimizer.step()

    net.eval()
    with torch.no_grad():
        reconstructed = net(data).cpu().numpy()
    errors = np.mean(np.power(data.cpu().numpy() - reconstructed, 2), axis=2)
    # A bare squeeze() turns a single-row result into a 0-d array, which
    # breaks len()/slicing in plot_mse; keep at least one dimension.
    return np.atleast_1d(errors.squeeze())


def anomaly_detection(X_emotions, X_embeddings, epochs=200, batch_size=8, patience=3):
    """Compute autoencoder reconstruction errors for emotions and embeddings.

    Two independent Autoencoder models are trained, one on the min-max scaled
    emotion features and one on the raw embeddings, and the mean squared
    reconstruction error of each row is returned.

    Args:
        X_emotions: Emotion features; passed to ``MinMaxScaler.fit_transform``
            before training.
        X_embeddings: Embedding features; used without scaling.
        epochs: Number of full-batch training steps per model.
        batch_size: Currently unused; kept for interface compatibility.
        patience: Currently unused; kept for interface compatibility.

    Returns:
        Tuple ``(mse_emotions, mse_embeddings)`` of NumPy arrays, each with at
        least one dimension.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Only the emotion features are normalised.
    X_emotions_scaled = MinMaxScaler().fit_transform(X_emotions)

    # Emotions first, then embeddings, matching the original model
    # construction order (and therefore the random initialisation order).
    mse_emotions = _autoencoder_reconstruction_mse(X_emotions_scaled, epochs, device)
    mse_embeddings = _autoencoder_reconstruction_mse(X_embeddings, epochs, device)
    return mse_emotions, mse_embeddings
+def plot_mse(df, mse_values, title, color='blue', time_threshold=3, anomaly_threshold=4):
     # Plot per-sample reconstruction error (MSE) against time and mark anomalous samples.
     #
     # Parameters:
     #   df                - DataFrame; the visible code reads its 'Timecode', 'Seconds' and 'Frame' columns.
     #   mse_values        - array of MSE values, indexed positionally in step with df's rows.
     #   title             - axes title.
     #   color             - colour of the scatter points, rolling-mean line and std band.
     #   time_threshold    - not referenced in the lines visible here; presumably used by the
     #                       anomaly-grouping loop that is missing from this view - TODO confirm.
     #   anomaly_threshold - multiplier on the standard deviation for the red threshold line; also
     #                       passed to determine_anomalies (defined elsewhere in the file).
     #
     # Returns (fig, anomaly_frames): the figure and the df['Frame'] values at the anomaly positions.
     #
     # NOTE(review): plt.figure() below opens one figure and plt.subplots() opens a second one; the
     # final plt.close() only closes the current (second) figure, so the first is never released.
+    plt.figure(figsize=(16, 8), dpi=500)
+    fig, ax = plt.subplots(figsize=(16, 8))
     # Derive elapsed seconds from the colon-separated timecode: the rightmost field is weighted
     # by 60**0, the next by 60**1, and so on. This writes a new column into the caller's df.
+    if 'Seconds' not in df.columns:
+        df['Seconds'] = df['Timecode'].apply(
+            lambda x: sum(float(t) * 60 ** i for i, t in enumerate(reversed(x.split(':')))))
+    # Ensure df and mse_values have the same length and remove NaN values
+    min_length = min(len(df), len(mse_values))
+    df = df.iloc[:min_length]
+    mse_values = mse_values[:min_length]
+    # Remove NaN values
+    mask = ~np.isnan(mse_values)
+    df = df[mask]
+    mse_values = mse_values[mask]
     # Rolling statistics over a 10-sample window; the first 9 entries are NaN and are not drawn.
+    mean = pd.Series(mse_values).rolling(window=10).mean()
+    std = pd.Series(mse_values).rolling(window=10).std()
+    median = np.median(mse_values)
+    ax.scatter(df['Seconds'], mse_values, color=color, alpha=0.3, s=5)
+    ax.plot(df['Seconds'], mean, color=color, linewidth=2)
+    ax.fill_between(df['Seconds'], mean - std, mean + std, color=color, alpha=0.2)
+    # Add median line
+    ax.axhline(y=median, color='black', linestyle='--', label='Baseline')
+    ax.text(ax.get_xlim()[1], median, 'Baseline', verticalalignment='center', horizontalalignment='left', color='black')
+    # Add threshold line
     # The line is drawn at mean + anomaly_threshold * std of the MSE values, but its label shows
     # the multiplier (anomaly_threshold), not the resulting MSE value.
+    threshold = np.mean(mse_values) + anomaly_threshold * np.std(mse_values)
+    ax.axhline(y=threshold, color='red', linestyle='--', label=f'Threshold: {anomaly_threshold:.1f}')
+    ax.text(ax.get_xlim()[1], threshold, f'Threshold: {anomaly_threshold:.1f}', verticalalignment='center', horizontalalignment='left', color='red')
     # `anomalies` is used below as a positional indexer into both df (via .iloc) and mse_values.
+    anomalies = determine_anomalies(mse_values, anomaly_threshold)
+    anomaly_frames = df['Frame'].iloc[anomalies].tolist()
+    ax.scatter(df['Seconds'].iloc[anomalies], mse_values[anomalies], color='red', s=25, zorder=5)
     # (timecode, seconds, mse) triples for every anomaly, ordered by time.
+    anomaly_data = list(zip(df['Timecode'].iloc[anomalies],
+                            df['Seconds'].iloc[anomalies],
+                            mse_values[anomalies]))
+    anomaly_data.sort(key=lambda x: x[1])
     # NOTE(review): nothing visible here fills current_group from anomaly_data, so as shown
     # grouped_anomalies stays empty and the two loops below do nothing. The grouping loop appears
     # to be missing from this view - confirm against the full file.
     grouped_anomalies = []
     current_group = []
     if current_group:
         grouped_anomalies.append(current_group)
     # Shade the time span of each anomaly group over the full current y-range.
+    for group in grouped_anomalies:
+        start_sec = group[0][1]
+        end_sec = group[-1][1]
+        rect = Rectangle((start_sec, ax.get_ylim()[0]), end_sec - start_sec, ax.get_ylim()[1] - ax.get_ylim()[0],
+                         facecolor='red', alpha=0.3, zorder=1)
+        ax.add_patch(rect)
     # Label each group once, at its highest-MSE member, with that member's timecode.
     for group in grouped_anomalies:
         highest_mse_anomaly = max(group, key=lambda x: x[2])
         timecode, sec, mse = highest_mse_anomaly
         ax.annotate(timecode, (sec, mse), textcoords="offset points", xytext=(0, 10),
+                    ha='center', fontsize=6, color='red')
     # 100 evenly spaced x ticks from 0 to the last sample, labelled via seconds_to_timecode
     # (defined elsewhere in the file).
     max_seconds = df['Seconds'].max()
     num_ticks = 100
     tick_locations = np.linspace(0, max_seconds, num_ticks)
+    tick_labels = [seconds_to_timecode(int(s)) for s in tick_locations]
     ax.set_xticks(tick_locations)
     ax.set_xticklabels(tick_labels, rotation=90, ha='center', fontsize=6)
+    ax.set_xlabel('Timecode')
     ax.set_ylabel('Mean Squared Error')
     ax.set_title(title)
     ax.grid(True, linestyle='--', alpha=0.7)
+    ax.legend()
+    plt.tight_layout()
+    plt.close()
+    return fig, anomaly_frames
+def plot_mse_histogram(mse_values, title, anomaly_threshold, color='blue'):
+    plt.figure(figsize=(16, 8), dpi=500)
+    fig, ax = plt.subplots(figsize=(16, 8))
+    ax.hist(mse_values, bins=100, edgecolor='black', color=color, alpha=0.7)
+    ax.set_xlabel('Mean Squared Error')
+    ax.set_ylabel('Number of Samples')
+    ax.set_title(title)
+    mean = np.mean(mse_values)
+    std = np.std(mse_values)
+    threshold = mean + anomaly_threshold * std
+    ax.axvline(x=threshold, color='red', linestyle='--', linewidth=2)
+    # Move annotation to the bottom and away from the line
+    ax.annotate(f'Threshold: {anomaly_threshold:.1f}',
+                xy=(threshold, ax.get_ylim()[0]),
+                xytext=(0, -20),
+                textcoords='offset points',
+                ha='center', va='top',
+                bbox=dict(boxstyle='round,pad=0.5', fc='white', ec='none', alpha=0.7),
+                color='red')
+    plt.tight_layout()
+    plt.close()
+    return fig
+def plot_emotion(df, emotion, color, anomaly_threshold):
+    plt.figure(figsize=(16, 8), dpi=500)
+    fig, ax = plt.subplots(figsize=(16, 8))
+    df['Seconds'] = df['Timecode'].apply(
+        lambda x: sum(float(t) * 60 ** i for i, t in enumerate(reversed(x.split(':')))))
+    mean = df[emotion].rolling(window=10).mean()
+    std = df[emotion].rolling(window=10).std()
+    median = df[emotion].median()
+    ax.scatter(df['Seconds'], df[emotion], color=color, alpha=0.3, s=5)
+    ax.plot(df['Seconds'], mean, color=color, linewidth=2)
+    ax.fill_between(df['Seconds'], mean - std, mean + std, color=color, alpha=0.2)
+    # Add median line
+    ax.axhline(y=median, color='black', linestyle='--', label='Baseline')
+    ax.text(ax.get_xlim()[1], median, 'Baseline', verticalalignment='center', horizontalalignment='left', color='black')
+    # Convert anomaly threshold to probability
+    probability_threshold = (anomaly_threshold - 1) / 6  # Convert 1-7 scale to 0-1 probability
+    # Add threshold line and detect anomalies
+    ax.axhline(y=probability_threshold, color='red', linestyle='--', label=f'Threshold: {probability_threshold:.2f}')
+    ax.text(ax.get_xlim()[1], probability_threshold, f'Threshold: {probability_threshold:.2f}',
+            verticalalignment='center', horizontalalignment='left', color='red')
+    # Detect and highlight anomalies
+    anomalies = df[emotion] >= probability_threshold
+    ax.scatter(df['Seconds'][anomalies], df[emotion][anomalies], color='red', s=25, zorder=5)
+    max_seconds = df['Seconds'].max()
+    num_ticks = 100
+    tick_locations = np.linspace(0, max_seconds, num_ticks)
+    tick_labels = [seconds_to_timecode(int(s)) for s in tick_locations]
+    ax.set_xticks(tick_locations)
+    ax.set_xticklabels(tick_labels, rotation=90, ha='center', fontsize=6)
+    ax.set_xlabel('Timecode')
+    ax.set_ylabel('Emotion Probability')
+    ax.set_title(f"{emotion.capitalize()} Over Time")
+    ax.grid(True, linestyle='--', alpha=0.7)
+    ax.legend()
     plt.tight_layout()
     plt.close()
     return fig
+def get_all_face_samples(organized_faces_folder, output_folder, largest_cluster, max_samples=500):
     # Copy 160x160 face thumbnails out of the per-person folders into output_folder.
     #
     # Parameters:
     #   organized_faces_folder - directory scanned for sub-folders whose names start with "person_".
     #   output_folder          - directory the resized JPEGs are written to.
     #   largest_cluster        - cluster id whose faces go into the "most_frequent" list.
     #   max_samples            - cap on each of the two result lists.
     #
     # Returns a dict with keys "most_frequent" and "others", each a list of written file paths.
     #
     # NOTE(review): `person_folder` and `face_files` are used below but never assigned in the
     # lines visible here; their assignments appear to be missing from this view.
     face_samples = {"most_frequent": [], "others": []}
     for cluster_folder in sorted(os.listdir(organized_faces_folder)):
         if cluster_folder.startswith("person_"):
             if face_files:
                 # The cluster id is the integer after the first underscore in the folder name.
                 cluster_id = int(cluster_folder.split('_')[1])
                 if cluster_id == largest_cluster:
+                    for i, sample in enumerate(face_files[:max_samples]):
                         face_path = os.path.join(person_folder, sample)
                         output_path = os.path.join(output_folder, f"face_sample_most_frequent_{i:04d}.jpg")
                         face_img = cv2.imread(face_path)
                         # NOTE(review): the next three lines are indented one level deeper with no
                         # visible header; a guard such as the `if face_img is not None:` used in the
                         # "others" branch is presumably missing from this view - confirm.
                             small_face = cv2.resize(face_img, (160, 160))
                             cv2.imwrite(output_path, small_face)
                             face_samples["most_frequent"].append(output_path)
+                        if len(face_samples["most_frequent"]) >= max_samples:
+                            break
                 else:
                     # The "others" cap is shared by all non-target clusters, so only the
                     # remaining budget is taken from this cluster.
+                    remaining_samples = max_samples - len(face_samples["others"])
+                    if remaining_samples > 0:
+                        for i, sample in enumerate(face_files[:remaining_samples]):
+                            face_path = os.path.join(person_folder, sample)
+                            output_path = os.path.join(output_folder, f"face_sample_other_{cluster_id:02d}_{i:04d}.jpg")
+                            face_img = cv2.imread(face_path)
+                            if face_img is not None:
+                                small_face = cv2.resize(face_img, (160, 160))
+                                cv2.imwrite(output_path, small_face)
+                                face_samples["others"].append(output_path)
+                            if len(face_samples["others"]) >= max_samples:
+                                break
     return face_samples
+def process_video(video_path, anomaly_threshold, desired_fps, progress=gr.Progress()):
     # Gradio callback: run the pipeline on one video and return the tuple of values that is
     # bound positionally to the `outputs` list of process_btn.click.
     #
     # Visible steps: cluster the face embeddings, organise faces per person, build the per-frame
     # DataFrame, collect face samples, run anomaly_detection, render the plots and load the face
     # crops of the anomalous frames.
     #
     # On success the returned tuple has 16 items: execution_time, results, df, mse_embeddings,
     # mse_emotions, the embeddings MSE plot and histogram, five emotion plots, the two face-sample
     # lists, the anomaly face images and aligned_faces_folder.
     #
     # NOTE(review): some lines of this function are missing from this view (the code that assigns
     # aligned_face_paths, embeddings_by_frame, emotions_by_frame, organized_faces_folder,
     # original_fps and video_duration), so those names are not documented here.
+    start_time = time.time()
     output_folder = "output"
     os.makedirs(output_folder, exist_ok=True)
+    batch_size = 16
     with tempfile.TemporaryDirectory() as temp_dir:
         aligned_faces_folder = os.path.join(temp_dir, 'aligned_faces')
                                                                                     progress, batch_size)
         if not aligned_face_paths:
             # NOTE(review): this returns 11 values, while the success path returns 16 and the
             # click handler lists 16 output components. The message string also lands in the first
             # slot, which is bound to the execution-time gr.Number.
+            return ("No faces were extracted from the video.",) + (None,) * 10
         progress(0.6, "Clustering faces")
         embeddings = [embedding for _, embedding in embeddings_by_frame.items()]
         clusters = cluster_faces(embeddings)
         # Number of distinct labels returned by cluster_faces (defined elsewhere).
         # NOTE(review): the summary loop further down assumes labels are 0..num_clusters-1; if
         # cluster_faces can emit a noise label such as -1, the per-cluster counts would be off - confirm.
+        num_clusters = len(set(clusters))
         progress(0.7, "Organizing faces")
         organize_faces_by_person(embeddings_by_frame, clusters, aligned_faces_folder, organized_faces_folder)
         df, largest_cluster = save_person_data_to_csv(embeddings_by_frame, emotions_by_frame, clusters, desired_fps,
                                                        original_fps, temp_dir, video_duration)
+        # Add 'Seconds' column to df
         # The colon-separated timecode is folded right-to-left with weights 60**0, 60**1, ...
+        df['Seconds'] = df['Timecode'].apply(
+            lambda x: sum(float(t) * 60 ** i for i, t in enumerate(reversed(x.split(':')))))
         progress(0.85, "Getting face samples")
         face_samples = get_all_face_samples(organized_faces_folder, output_folder, largest_cluster)
         progress(0.9, "Performing anomaly detection")
         # Model inputs: five emotion columns and every column whose name starts with 'Raw_Embedding_'.
+        emotion_columns = ['angry', 'disgust', 'fear', 'sad', 'happy']
+        embedding_columns = [col for col in df.columns if col.startswith('Raw_Embedding_')]
+        X_emotions = df[emotion_columns].values
+        X_embeddings = df[embedding_columns].values
         try:
+            mse_emotions, mse_embeddings = anomaly_detection(X_emotions, X_embeddings, batch_size=batch_size)
             progress(0.95, "Generating plots")
+            mse_plot_embeddings, anomaly_frames_embeddings = plot_mse(df, mse_embeddings, "Facial Embeddings",
+                                                                      color='green',
+                                                                      anomaly_threshold=anomaly_threshold)
+            mse_histogram_embeddings = plot_mse_histogram(mse_embeddings, "MSE Distribution: Facial Embeddings",
+                                                          anomaly_threshold, color='green')
+            # Add emotion plots
             # One plot per emotion column, paired with a fixed colour; yields five figures.
+            emotion_plots = []
+            for emotion, color in zip(emotion_columns, ['purple', 'brown', 'green', 'orange', 'darkblue']):
+                emotion_plot = plot_emotion(df, emotion, color, anomaly_threshold)
+                emotion_plots.append(emotion_plot)
             # NOTE(review): these two variances are not read anywhere in the lines visible here.
+            mse_var_emotions = np.var(mse_emotions)
+            mse_var_embeddings = np.var(mse_embeddings)
         except Exception as e:
             print(f"Error details: {str(e)}")
             # NOTE(review): 16 values, matching the output count, but the error text is in the first
             # slot (bound to the execution-time gr.Number) rather than the results Textbox slot.
+            return (f"Error in anomaly detection: {str(e)}",) + (None,) * 15
         progress(1.0, "Preparing results")
         results = f"Number of persons/clusters detected: {num_clusters}\n\n"
         for cluster_id in range(num_clusters):
             results += f"Person/Cluster {cluster_id + 1}: {len([c for c in clusters if c == cluster_id])} frames\n"
+        end_time = time.time()
+        execution_time = end_time - start_time
+        # Load anomaly frames as images
         # Frames without a saved crop are skipped; OpenCV loads BGR, so crops are converted to RGB.
+        anomaly_faces_embeddings = [
+            cv2.imread(os.path.join(aligned_faces_folder, f"frame_{frame}_face.jpg"))
+            for frame in anomaly_frames_embeddings
+            if os.path.exists(os.path.join(aligned_faces_folder, f"frame_{frame}_face.jpg"))
+        ]
+        anomaly_faces_embeddings = [cv2.cvtColor(face, cv2.COLOR_BGR2RGB) for face in anomaly_faces_embeddings if face is not None]
         # NOTE(review): aligned_faces_folder lives under temp_dir, which TemporaryDirectory removes
         # when this `with` block exits, so the returned path no longer exists for the caller.
         return (
+            execution_time,
             results,
+            df,
+            mse_embeddings,
+            mse_emotions,
+            mse_plot_embeddings,
+            mse_histogram_embeddings,
             *emotion_plots,
             face_samples["most_frequent"],
+            face_samples["others"],
+            anomaly_faces_embeddings,
+            aligned_faces_folder
         )
 # Gradio UI definition. Components are rendered in the order they are created inside the Blocks
 # context, so the statement order below is the page layout.
+with gr.Blocks() as iface:
+    gr.Markdown("# Facial Expressions Anomaly Detection")
     # Inputs: the video plus the two tuning sliders, laid out side by side.
+    with gr.Row():
+        video_input = gr.Video()
+        anomaly_threshold = gr.Slider(minimum=1, maximum=7, step=0.1, value=4.5, label="Anomaly Detection Threshold")
+        fps_slider = gr.Slider(minimum=10, maximum=20, step=5, value=20, label="Frames Per Second")
+    process_btn = gr.Button("Process Video")
     # Visible outputs.
+    execution_time = gr.Number(label="Execution Time (seconds)")
+    results_text = gr.Textbox(label="Anomaly Detection Results")
+    anomaly_frames_embeddings = gr.Gallery(label="Anomaly Frames (Facial Embeddings)", columns=6, rows=2, height="auto")
+    mse_embeddings_plot = gr.Plot(label="MSE: Facial Embeddings")
+    mse_embeddings_hist = gr.Plot(label="MSE Distribution: Facial Embeddings")
+    # Add emotion plots
     # One gr.Plot per emotion, in the same order as process_video's emotion_columns list.
+    emotion_plots = [gr.Plot(label=f"{emotion.capitalize()} Over Time") for emotion in ['angry', 'disgust', 'fear', 'sad', 'happy']]
+    face_samples_most_frequent = gr.Gallery(label="Most Frequent Person Samples (Target)", columns=6, rows=2, height="auto")
+    face_samples_others = gr.Gallery(label="Other Persons Samples", columns=6, rows=1, height="auto")
+    # Hidden components to store intermediate results
+    df_store = gr.State()
+    mse_emotions_store = gr.State()
+    mse_embeddings_store = gr.State()
+    aligned_faces_folder_store = gr.State()
     # The 16 entries of `outputs` are filled positionally from process_video's return tuple, so
     # their order must match the order of that function's return statement.
+    process_btn.click(
+        process_video,
+        inputs=[video_input, anomaly_threshold, fps_slider],
+        outputs=[
+            execution_time, results_text, df_store, mse_embeddings_store, mse_emotions_store,
+            mse_embeddings_plot, mse_embeddings_hist,
+            *emotion_plots,
+            face_samples_most_frequent, face_samples_others, anomaly_frames_embeddings,
+            aligned_faces_folder_store
+        ]
+    )
 # Start the Gradio server only when this file is executed as a script.
+if __name__ == "__main__":
+    iface.launch()