# multimodal-deepfakes / pipeline.py
import os

# The oneDNN flag must be set in the environment *before* TensorFlow is
# imported, otherwise it has no effect.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import cv2
import torch
import zipfile
import librosa
import numpy as np
import tensorflow as tf
from facenet_pytorch import MTCNN
from rawnet import RawNet
# Unzip the EfficientNet-B0 classifier weights.
local_zip = "./efficientnet-b0.zip"
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall()
# Load the image/video classifier; the audio model is loaded on demand below.
model = tf.keras.models.load_model("efficientnet-b0/")
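# The classifier returns a 2-way softmax per input; the predict functions
# below read index 0 as the "real" score and index 1 as the "fake" score.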
class DetectionPipeline:
"""Pipeline class for detecting faces in the frames of a video file."""
    def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality='video'):
        """Constructor for DetectionPipeline class.

        Keyword Arguments:
            n_frames {int} -- Total number of frames to load. These will be evenly spaced
                throughout the video. If not specified (i.e., None), all frames will be
                loaded. (default: {None})
            batch_size {int} -- Batch size to use when processing frames. (default: {60})
            resize {float} -- Fraction by which to resize frames from the original prior
                to face detection. A value less than 1 results in downsampling and a
                value greater than 1 results in upsampling. (default: {None})
            input_modality {str} -- One of 'video', 'image', or 'audio'. (default: {'video'})
        """
self.n_frames = n_frames
self.batch_size = batch_size
self.resize = resize
self.input_modality = input_modality
    def __call__(self, filename):
        """Load the input for the configured modality.

        Arguments:
            filename -- Path to a video or audio file for the 'video' and
                'audio' modalities, or a decoded image array for 'image'.
        """
# Create video reader and find length
if self.input_modality == 'video':
print('Input modality is video.')
v_cap = cv2.VideoCapture(filename)
v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
# Pick 'n_frames' evenly spaced frames to sample
if self.n_frames is None:
sample = np.arange(0, v_len)
else:
sample = np.linspace(0, v_len - 1, self.n_frames).astype(int)
# Loop through frames
faces = []
frames = []
for j in range(v_len):
success = v_cap.grab()
if j in sample:
# Load frame
success, frame = v_cap.retrieve()
if not success:
continue
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                    # Resize the frame by the requested fraction.
                    if self.resize is not None:
                        frame = cv2.resize(frame, None, fx=self.resize, fy=self.resize)
                    frames.append(frame)

                    # When the batch is full (or this is the last sampled frame),
                    # resize the current frame for the classifier and reset the batch.
                    if len(frames) % self.batch_size == 0 or j == sample[-1]:
                        faces.append(cv2.resize(frame, (224, 224)))
                        frames = []
v_cap.release()
return faces
        elif self.input_modality == 'image':
            print('Input modality is image.')
            # For the image modality, `filename` is an already-decoded BGR
            # array (e.g. from a Gradio input), not a path on disk.
            image = cv2.cvtColor(filename, cv2.COLOR_BGR2RGB)
            image = cv2.resize(image, (224, 224))
            return image

        elif self.input_modality == 'audio':
            print('Input modality is audio.')
            # Load the waveform and add a batch dimension for the model.
            x, sr = librosa.load(filename)
            x_pt = torch.Tensor(x)
            x_pt = torch.unsqueeze(x_pt, dim=0)
            return x_pt

        else:
            raise ValueError("Invalid input modality. Must be 'video', 'image', or 'audio'.")
detection_video_pipeline = DetectionPipeline(n_frames=5, batch_size=1, input_modality='video')
detection_image_pipeline = DetectionPipeline(batch_size=1, input_modality='image')
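# Minimal usage sketch for the pipelines above (the file paths are
# illustrative assumptions, not files shipped with this repo):
#
#     frames = detection_video_pipeline("clip.mp4")            # list of 224x224 RGB frames
#     face = detection_image_pipeline(cv2.imread("face.jpg"))  # single 224x224 RGB array
#
# Note that the image pipeline expects a decoded BGR array, not a path.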
def deepfakes_video_predict(input_video):
faces = detection_video_pipeline(input_video)
    real_res = []
    fake_res = []

    for face in faces:
        # Scale pixels to [0, 1] and classify a single-frame batch.
        face2 = face / 255
        pred = model.predict(np.expand_dims(face2, axis=0))[0]
        real, fake = pred[0], pred[1]
        real_res.append(real)
        fake_res.append(fake)

    # Average the per-frame scores over all sampled frames.
real_mean = np.mean(real_res)
fake_mean = np.mean(fake_res)
print(f"Real Faces: {real_mean}")
print(f"Fake Faces: {fake_mean}")
text = ""
if real_mean >= 0.5:
text = "The video is REAL. \n Deepfakes Confidence: " + str(round(100 - (real_mean*100), 3)) + "%"
else:
text = "The video is FAKE. \n Deepfakes Confidence: " + str(round(fake_mean*100, 3)) + "%"
return text
def deepfakes_image_predict(input_image):
    # The image pipeline returns a single preprocessed 224x224 RGB array.
    face = detection_image_pipeline(input_image)
    face2 = face / 255
    pred = model.predict(np.expand_dims(face2, axis=0))[0]
    real, fake = pred[0], pred[1]
    if real > 0.5:
        text2 = "The image is REAL. \n Deepfakes Confidence: " + str(round(100 - (real * 100), 3)) + "%"
    else:
        text2 = "The image is FAKE. \n Deepfakes Confidence: " + str(round(fake * 100, 3)) + "%"
    return text2
def load_audio_model():
    # RawNet2 hyperparameters matching the pretrained checkpoint.
    d_args = {
        "nb_samp": 64600,
        "first_conv": 1024,
        "in_channels": 1,
        "filts": [20, [20, 20], [20, 128], [128, 128]],
        "blocks": [2, 4],
        "nb_fc_node": 1024,
        "gru_node": 1024,
        "nb_gru_layer": 3,
        "nb_classes": 2,
    }
    model = RawNet(d_args=d_args, device='cpu')

    # Load the pretrained RawNet2 checkpoint and switch to eval mode.
    ckpt = torch.load('pre_trained_DF_RawNet2.pth', map_location=torch.device('cpu'))
    model.load_state_dict(ckpt)
    model.eval()
    return model
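# Optional sketch: deepfakes_audio_predict() below rebuilds RawNet on every
# call. A module-level cache (an illustrative addition, not used elsewhere
# in this file) would avoid reloading the checkpoint each time:
_cached_audio_model = None

def get_audio_model():
    """Return a cached RawNet instance, loading the checkpoint on first use."""
    global _cached_audio_model
    if _cached_audio_model is None:
        _cached_audio_model = load_audio_model()
    return _cached_audio_model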
audio_label_map = {
0: "Real audio",
1: "Fake audio"
}
def deepfakes_audio_predict(input_audio):
    # `input_audio` is a (waveform, sample_rate) tuple; add a batch dimension.
    x, sr = input_audio
    x_pt = torch.Tensor(x)
    x_pt = torch.unsqueeze(x_pt, dim=0)

    # Load the model and run a forward pass without tracking gradients.
    model = load_audio_model()
    with torch.no_grad():
        logits = model(x_pt)

    # Map the argmax class index to its label.
    result = np.argmax(logits.numpy())
    return audio_label_map[result]
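# Smoke-test sketch. The media files named here are assumptions for
# illustration; point them at real local files before running.
if __name__ == "__main__":
    print(deepfakes_video_predict("sample_video.mp4"))
    print(deepfakes_image_predict(cv2.imread("sample_face.jpg")))
    x, sr = librosa.load("sample_audio.wav")
    print(deepfakes_audio_predict((x, sr)))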