Spaces: Myapp (Runtime error)
Browse files
- app.py +125 -0
- requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,125 @@
import gradio as gr
import tensorflow as tf
from typing import List
import os
import cv2
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import legacy
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler


def convert_mp4_to_mpg(input_file, output_file):
    """
    Convert an MP4 video file to an MPG video file using OpenCV.

    Args:
        input_file (str): Path to the input MP4 file.
        output_file (str): Path to the output MPG file.

    Returns:
        None
    """
    cap = cv2.VideoCapture(input_file)

    # Check if the video file was opened successfully
    if not cap.isOpened():
        raise Exception(f"Could not open video file: {input_file}")

    # Define the codec and create a VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mpg2')  # Use 'mpg2' codec for MPG files
    out = cv2.VideoWriter(output_file, fourcc, 30.0, (int(cap.get(3)), int(cap.get(4))))

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        out.write(frame)

    # Release the video objects
    cap.release()
    out.release()


def load_video(video_path):
    """Read a video, crop each frame to the mouth region, and standardize."""
    cap = cv2.VideoCapture(video_path)
    frames = []
    for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
        ret, frame = cap.read()
        if not ret:  # stop early if a frame fails to decode
            break
        frame = tf.image.rgb_to_grayscale(frame)
        frames.append(frame[190:236, 80:220, :])  # crop to the mouth region
    cap.release()

    # Standardize the frames (zero mean, unit variance)
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std


def load_data(path: str):
    path = bytes.decode(path.numpy())
    # Normalize separators so the file name is extracted correctly on both
    # Windows ('\\') and the Linux machines Spaces run on ('/')
    file_name = path.replace('\\', '/').split('/')[-1].split('.')[0]
    video_path = os.path.join('data', 's1', f'{file_name}.mpg')
    alignment_path = os.path.join('data', 'alignments', 's1', f'{file_name}.align')
    frames = load_video(video_path)
    return frames


vocab = [x for x in "abcdefghijklmnopqrstuvwxyz'?!123456789 "]
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)

model = Sequential()
model.add(Conv3D(128, 3, input_shape=(75, 46, 140, 1), padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1, 2, 2)))

model.add(Conv3D(256, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1, 2, 2)))

model.add(Conv3D(75, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1, 2, 2)))

model.add(TimeDistributed(Flatten()))

model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))

model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))

model.add(Dense(char_to_num.vocabulary_size() + 1, kernel_initializer='he_normal', activation='softmax'))
model.summary()

optimizer = legacy.Adam(learning_rate=0.001)  # Replace legacy.Adam with the appropriate legacy optimizer used during training

# Loss and metrics are placeholders; the app only runs inference
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Load the trained weights
model.load_weights('model/checkpoint')


def Predict(Video):
    # convert_mp4_to_mpg(Video, 'output.mpg')
    sample = load_data(tf.convert_to_tensor(Video))
    yhat = model.predict(tf.expand_dims(sample, axis=0))
    # Greedy CTC decoding over the 75-frame sequence
    decoded = tf.keras.backend.ctc_decode(yhat, input_length=[75], greedy=True)[0][0].numpy()
    result = [tf.strings.reduce_join([num_to_char(word) for word in sentence]) for sentence in decoded]
    return result[0].numpy().decode('utf-8')


interface = gr.Interface(fn=Predict,
                         inputs="video",
                         outputs="text",
                         title='Video Lip Reading',
                         description="""Welcome! This app was developed by wissem karous with <3""")


interface.launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,2 @@
tensorflow
opencv-python