import os

import cv2
import gradio as gr
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import legacy
from tensorflow.keras.layers import (
    Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D,
    Activation, TimeDistributed, Flatten,
)


def convert_mp4_to_mpg(input_file, output_file):
    """
    Convert an MP4 video file to an MPG video file using OpenCV.

    Args:
        input_file (str): Path to the input MP4 file.
        output_file (str): Path to the output MPG file.

    Returns:
        None
    """
    cap = cv2.VideoCapture(input_file)
    # Check that the video file opened successfully.
    if not cap.isOpened():
        raise Exception(f"Could not open video file: {input_file}")

    # Define the codec and create a VideoWriter that keeps the source
    # resolution; fall back to 30 fps if the source rate is unavailable.
    fourcc = cv2.VideoWriter_fourcc(*'mpg2')  # 'mpg2' codec for MPG files
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    out = cv2.VideoWriter(output_file, fourcc, fps, (width, height))

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        out.write(frame)

    # Release the video objects.
    cap.release()
    out.release()
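
# Example use of the helper above (a hedged sketch: 'upload.mp4' is a
# hypothetical path, and 'mpg2' codec support varies by OpenCV build):
#   convert_mp4_to_mpg('upload.mp4', 'output.mpg')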


def load_video(video_path):
    """Read a video, crop the mouth region from each frame, and standardize."""
    cap = cv2.VideoCapture(video_path)
    frames = []
    for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
        ret, frame = cap.read()
        if not ret:
            break
        frame = tf.image.rgb_to_grayscale(frame)
        # Static crop around the speaker's mouth region.
        frames.append(frame[190:236, 80:220, :])
    cap.release()

    # Standardize frames with the per-video mean and standard deviation.
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std


def load_data(path: str):
    path = bytes.decode(path.numpy())
    # Split off the extension in an OS-independent way (the original
    # Windows-style '\\' split breaks on Linux hosts such as Spaces).
    file_name = os.path.splitext(os.path.basename(path))[0]
    # Look up the bundled clip that matches the uploaded file's name;
    # alignment files are only needed for training, not inference.
    video_path = os.path.join('data', 's1', f'{file_name}.mpg')
    frames = load_video(video_path)
    return frames


vocab = [x for x in "abcdefghijklmnopqrstuvwxyz'?!123456789 "]
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)
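
# Round-trip illustration of the lookup layers (index 0 is reserved for the
# OOV token, so 'a' maps to 1, 'b' to 2, and so on):
#   tokens = char_to_num(tf.strings.unicode_split("hello", input_encoding='UTF-8'))
#   -> [8, 5, 12, 12, 15]
#   tf.strings.reduce_join(num_to_char(tokens)).numpy() -> b'hello'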

# 3D convolutions extract spatiotemporal features from the 75-frame,
# 46x140 grayscale mouth crops; pooling shrinks only the spatial dims.
model = Sequential()
model.add(Conv3D(128, 3, input_shape=(75, 46, 140, 1), padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1, 2, 2)))

model.add(Conv3D(256, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1, 2, 2)))

model.add(Conv3D(75, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D((1, 2, 2)))

# Flatten each time step, then model the sequence with bidirectional LSTMs.
model.add(TimeDistributed(Flatten()))
model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))
model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(.5))

# One output class per vocabulary entry plus one extra class for CTC.
model.add(Dense(char_to_num.vocabulary_size() + 1, kernel_initializer='he_normal', activation='softmax'))
model.summary()
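
# Shape check: each (75, 46, 140, 1) input is mapped to a (75, 41) sequence
# of per-frame character distributions: 39 vocabulary characters + 1 OOV
# slot + 1 extra class used as the CTC blank.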

# Use the same legacy optimizer class that was used during training; the
# loss is a placeholder here, since this app only runs inference.
optimizer = legacy.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Load the pretrained weights.
model.load_weights('model/checkpoint')


def Predict(Video):
    # convert_mp4_to_mpg(Video, 'output.mpg')
    sample = load_data(tf.convert_to_tensor(Video))
    yhat = model.predict(tf.expand_dims(sample, axis=0))
    # Greedy CTC decoding collapses repeated characters and drops blanks;
    # input_length=[75] matches the model's output sequence length.
    decoded = tf.keras.backend.ctc_decode(yhat, input_length=[75], greedy=True)[0][0].numpy()
    result = [tf.strings.reduce_join([num_to_char(word) for word in sentence]) for sentence in decoded]
    return result[0].numpy().decode('utf-8')
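
# Local smoke test (hedged: 'bbaf2n.mpg' is a hypothetical clip name that
# would need to exist under data/s1/):
#   print(Predict(os.path.join('data', 's1', 'bbaf2n.mpg')))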


interface = gr.Interface(fn=Predict,
                         inputs="video",
                         outputs="text",
                         title='Video Lip Reading',
                         description="""Welcome! This model was developed by Wissem Karous with <3""")
interface.launch(debug=True)