import re
import glob
import pickle
import os
import torch
import numpy as np
from utils.audio import load_spectrograms
from utils.compute_args import compute_args
from utils.tokenize import tokenize, create_dict, sent_to_ix, cmumosei_2, cmumosei_7, pad_feature
from model_LA import Model_LA
import gradio as gr
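
# Gradio demo for multimodal emotion recognition: given a video clip and its
# transcript, this script tokenizes the text, extracts mel spectrograms from
# the clip's audio track, and scores six emotions (happy, sad, angry, fear,
# disgust, surprise) with a pretrained Model_LA checkpoint.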

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load model
ckpts_path = 'ckpt'
model_name = "Model_LA_e"
# List saved 'best*' checkpoints (reverse-sorted; the first entry is used below)
ckpts = sorted(glob.glob(os.path.join(ckpts_path, model_name, 'best*')), reverse=True)

# Load the checkpoint once and pull the original training args and weights from it
ckpt = torch.load(ckpts[0], map_location=device)
args = compute_args(ckpt['args'])
pretrained_emb = np.load("train_glove.npy")               # pretrained GloVe embedding matrix
token_to_ix = pickle.load(open("token_to_ix.pkl", "rb"))  # token -> index vocabulary
state_dict = ckpt['state_dict']

net = Model_LA(args, len(token_to_ix), pretrained_emb).to(device)
net.load_state_dict(state_dict)
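
# Assets referenced above, expected in the working directory: a checkpoint
# matching ckpt/Model_LA_e/best*, train_glove.npy (GloVe embedding matrix),
# and token_to_ix.pkl (vocabulary).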

def inference(video_path, text):
    # Data preprocessing
    # Text: lowercase, strip punctuation, split hyphenated/slashed words
    def clean(w):
        return re.sub(r"([.,'!?\"()*#:;])", '', w.lower()).replace('-', ' ').replace('/', ' ')

    s = [clean(w) for w in text.split() if clean(w) != '']

    # Audio: mel/magnitude spectrograms extracted from the clip's audio track
    _, mel, mag = load_spectrograms(video_path)

    # Pad/trim each modality to the sequence lengths the model was trained with
    l_max_len = args.lang_seq_len
    a_max_len = args.audio_seq_len
    v_max_len = args.video_seq_len
    L = sent_to_ix(s, token_to_ix, max_token=l_max_len)
    A = pad_feature(mel, a_max_len)
    # NOTE: the demo feeds the mel spectrogram to the video branch as well
    V = pad_feature(mel, v_max_len)
    # Print shapes
    print(f"Processed text shape from {len(s)} to {L.shape}")
    print(f"Processed audio shape from {mel.shape} to {A.shape}")
    print(f"Processed video shape from {mel.shape} to {V.shape}")

    net.train(False)  # evaluation mode
    # Add a batch dimension and run the forward pass
    x = np.expand_dims(L, axis=0)
    y = np.expand_dims(A, axis=0)
    z = np.expand_dims(V, axis=0)
    x, y, z = torch.from_numpy(x).to(device), torch.from_numpy(y).to(device), torch.from_numpy(z).float().to(device)
    pred = net(x, y, z).cpu().data.numpy()[0]
    pred = np.exp(pred) / np.sum(np.exp(pred))  # softmax over the emotion logits
    label_to_ix = ['happy', 'sad', 'angry', 'fear', 'disgust', 'surprise']
    result_dict = {label_to_ix[i]: float(pred[i]) for i in range(len(label_to_ix))}
    return result_dict
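
# Sketch of calling the predictor directly, outside the Gradio UI, on one of the
# bundled example clips (exact scores depend on the loaded checkpoint):
#   scores = inference('examples/03bSnISJMiM_1.mp4', "IT WAS REALLY GOOD ")
#   print(max(scores, key=scores.get), scores)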


title="Emotion Recognition"
description=""
            
examples = [              
                ['examples/03bSnISJMiM_1.mp4', "IT WAS REALLY GOOD "],
                ['examples/03bSnISJMiM_5.mp4', "AND THEY SHOULDVE I GUESS "],
           ]
           
gr.Interface(inference,
             inputs=[gr.inputs.Video(type="avi", source="upload"), "text"],
             outputs=["label"],
             title=title,
             description=description,
             examples=examples
             ).launch(debug=True)