# multimodal-deepfakes / pipeline.py
import os

# The oneDNN flag must be set in the environment *before* TensorFlow is
# imported, otherwise it has no effect.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import cv2
import torch
import zipfile
import librosa
import numpy as np
import tensorflow as tf
from facenet_pytorch import MTCNN
from rawnet import RawNet
# Unzip the EfficientNet-B0 classifier weights.
local_zip = "./efficientnet-b0.zip"
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall()
# Load the image/video classifier; the audio model is loaded on demand below.
model = tf.keras.models.load_model("efficientnet-b0/")
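# The classifier returns a 2-way softmax per input; the predict functions
# below read index 0 as the "real" score and index 1 as the "fake" score.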
class DetectionPipeline:
"""Pipeline class for detecting faces in the frames of a video file."""
    def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality='video'):
        """Constructor for DetectionPipeline class.

        Keyword Arguments:
            n_frames {int} -- Total number of frames to load. These will be evenly spaced
                throughout the video. If not specified (i.e., None), all frames will be
                loaded. (default: {None})
            batch_size {int} -- Batch size to use when processing frames. (default: {60})
            resize {float} -- Fraction by which to resize frames from the original prior
                to face detection. A value less than 1 results in downsampling and a
                value greater than 1 results in upsampling. (default: {None})
            input_modality {str} -- One of 'video', 'image', or 'audio'. (default: {'video'})
        """
self.n_frames = n_frames
self.batch_size = batch_size
self.resize = resize
self.input_modality = input_modality
    def __call__(self, filename):
        """Load the input for the configured modality.

        Arguments:
            filename -- Path to a video or audio file for the 'video' and
                'audio' modalities, or a decoded image array for 'image'.
        """
# Create video reader and find length
if self.input_modality == 'video':
print('Input modality is video.')
v_cap = cv2.VideoCapture(filename)
v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
# Pick 'n_frames' evenly spaced frames to sample
if self.n_frames is None:
sample = np.arange(0, v_len)
else:
sample = np.linspace(0, v_len - 1, self.n_frames).astype(int)
# Loop through frames
faces = []
frames = []
for j in range(v_len):
success = v_cap.grab()
if j in sample:
# Load frame
success, frame = v_cap.retrieve()
if not success:
continue
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    # Resize the frame by the requested fraction.
                    if self.resize is not None:
                        frame = cv2.resize(frame, None, fx=self.resize, fy=self.resize)
                    frames.append(frame)

                    # When the batch is full (or this is the last sampled frame),
                    # resize the current frame for the classifier and reset the batch.
                    if len(frames) % self.batch_size == 0 or j == sample[-1]:
                        faces.append(cv2.resize(frame, (224, 224)))
                        frames = []
v_cap.release()
return faces
        elif self.input_modality == 'image':
            print('Input modality is image.')
            # For the image modality, `filename` is an already-decoded BGR
            # array (e.g. from a Gradio input), not a path on disk.
            image = cv2.cvtColor(filename, cv2.COLOR_BGR2RGB)
            image = cv2.resize(image, (224, 224))
            return image

        elif self.input_modality == 'audio':
            print('Input modality is audio.')
            # Load the waveform and add a batch dimension for the model.
            x, sr = librosa.load(filename)
            x_pt = torch.Tensor(x)
            x_pt = torch.unsqueeze(x_pt, dim=0)
            return x_pt

        else:
            raise ValueError("Invalid input modality. Must be 'video', 'image', or 'audio'.")
detection_video_pipeline = DetectionPipeline(n_frames=5, batch_size=1, input_modality='video')
detection_image_pipeline = DetectionPipeline(batch_size=1, input_modality='image')
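# Minimal usage sketch for the pipelines above (the file paths are
# illustrative assumptions, not files shipped with this repo):
#
#     frames = detection_video_pipeline("clip.mp4")            # list of 224x224 RGB frames
#     face = detection_image_pipeline(cv2.imread("face.jpg"))  # single 224x224 RGB array
#
# Note that the image pipeline expects a decoded BGR array, not a path.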
def deepfakes_video_predict(input_video):
faces = detection_video_pipeline(input_video)
    real_res = []
    fake_res = []

    for face in faces:
        # Scale pixels to [0, 1] and classify a single-frame batch.
        face2 = face / 255
        pred = model.predict(np.expand_dims(face2, axis=0))[0]
        real, fake = pred[0], pred[1]
        real_res.append(real)
        fake_res.append(fake)

    # Average the per-frame scores over all sampled frames.
real_mean = np.mean(real_res)
fake_mean = np.mean(fake_res)
print(f"Real Faces: {real_mean}")
print(f"Fake Faces: {fake_mean}")
text = ""
if real_mean >= 0.5:
text = "The video is REAL. \n Deepfakes Confidence: " + str(round(100 - (real_mean*100), 3)) + "%"
else:
text = "The video is FAKE. \n Deepfakes Confidence: " + str(round(fake_mean*100, 3)) + "%"
return text
def deepfakes_image_predict(input_image):
    # The image pipeline returns a single preprocessed 224x224 RGB array.
    face = detection_image_pipeline(input_image)
    face2 = face / 255
    pred = model.predict(np.expand_dims(face2, axis=0))[0]
    real, fake = pred[0], pred[1]
    if real > 0.5:
        text2 = "The image is REAL. \n Deepfakes Confidence: " + str(round(100 - (real * 100), 3)) + "%"
    else:
        text2 = "The image is FAKE. \n Deepfakes Confidence: " + str(round(fake * 100, 3)) + "%"
    return text2
def load_audio_model():
    # RawNet2 hyperparameters matching the pretrained checkpoint.
    d_args = {
        "nb_samp": 64600,
        "first_conv": 1024,
        "in_channels": 1,
        "filts": [20, [20, 20], [20, 128], [128, 128]],
        "blocks": [2, 4],
        "nb_fc_node": 1024,
        "gru_node": 1024,
        "nb_gru_layer": 3,
        "nb_classes": 2,
    }
    model = RawNet(d_args=d_args, device='cpu')

    # Load the pretrained RawNet2 checkpoint and switch to eval mode.
    ckpt = torch.load('pre_trained_DF_RawNet2.pth', map_location=torch.device('cpu'))
    model.load_state_dict(ckpt)
    model.eval()
    return model
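# Optional sketch: deepfakes_audio_predict() below rebuilds RawNet on every
# call. A module-level cache (an illustrative addition, not used elsewhere
# in this file) would avoid reloading the checkpoint each time:
_cached_audio_model = None

def get_audio_model():
    """Return a cached RawNet instance, loading the checkpoint on first use."""
    global _cached_audio_model
    if _cached_audio_model is None:
        _cached_audio_model = load_audio_model()
    return _cached_audio_model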
audio_label_map = {
0: "Real audio",
1: "Fake audio"
}
def deepfakes_audio_predict(input_audio):
    # `input_audio` is a (waveform, sample_rate) tuple; add a batch dimension.
    x, sr = input_audio
    x_pt = torch.Tensor(x)
    x_pt = torch.unsqueeze(x_pt, dim=0)

    # Load the model and run a forward pass without tracking gradients.
    model = load_audio_model()
    with torch.no_grad():
        logits = model(x_pt)

    # Map the argmax class index to its label.
    result = np.argmax(logits.numpy())
    return audio_label_map[result]
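# Smoke-test sketch. The media files named here are assumptions for
# illustration; point them at real local files before running.
if __name__ == "__main__":
    print(deepfakes_video_predict("sample_video.mp4"))
    print(deepfakes_image_predict(cv2.imread("sample_face.jpg")))
    x, sr = librosa.load("sample_audio.wav")
    print(deepfakes_audio_predict((x, sr)))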