import os

# oneDNN ops must be disabled before TensorFlow is imported for the flag to take effect.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import cv2
import torch
import zipfile
import librosa
import numpy as np
import tensorflow as tf
from facenet_pytorch import MTCNN
from rawnet import RawNet
# Unpack the bundled EfficientNet-B0 classifier.
local_zip = "./efficientnet-b0.zip"
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall()
# Load models.
model = tf.keras.models.load_model("efficientnet-b0/")
class DetectionPipeline:
    """Pipeline class for sampling frames from a video file, preprocessing a single
    image, or loading an audio clip for classification."""

    def __init__(self, n_frames=None, batch_size=60, resize=None, input_modality='video'):
        """Constructor for DetectionPipeline class.

        Keyword Arguments:
            n_frames {int} -- Total number of frames to load. These will be evenly spaced
                throughout the video. If not specified (i.e., None), all frames will be loaded.
                (default: {None})
            batch_size {int} -- Batch size to use with the MTCNN face detector. (default: {60})
            resize {float} -- Fraction by which to resize frames from original prior to face
                detection. A value less than 1 results in downsampling and a value greater than
                1 results in upsampling. (default: {None})
            input_modality {str} -- One of 'video', 'image', or 'audio'. (default: {'video'})
        """
        self.n_frames = n_frames
        self.batch_size = batch_size
        self.resize = resize
        self.input_modality = input_modality
    def __call__(self, filename):
        """Load and preprocess the input for the selected modality.

        Arguments:
            filename -- Path to a video or audio file, or an image array for the
                'image' modality.
        """
        if self.input_modality == 'video':
            print('Input modality is video.')
            # Create video reader and find length.
            v_cap = cv2.VideoCapture(filename)
            v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))

            # Pick 'n_frames' evenly spaced frames to sample.
            if self.n_frames is None:
                sample = np.arange(0, v_len)
            else:
                sample = np.linspace(0, v_len - 1, self.n_frames).astype(int)

            # Loop through frames.
            faces = []
            frames = []
            for j in range(v_len):
                success = v_cap.grab()
                if j in sample:
                    # Load frame.
                    success, frame = v_cap.retrieve()
                    if not success:
                        continue
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                    # Resize frame by the requested fraction.
                    if self.resize is not None:
                        frame = cv2.resize(
                            frame,
                            (int(frame.shape[1] * self.resize), int(frame.shape[0] * self.resize)),
                        )
                    frames.append(frame)

                    # When the batch fills up or the last sampled frame is reached,
                    # keep a 224x224 copy of the current frame for classification.
                    if len(frames) % self.batch_size == 0 or j == sample[-1]:
                        face2 = cv2.resize(frame, (224, 224))
                        faces.append(face2)
            v_cap.release()
            return faces

        elif self.input_modality == 'image':
            print('Input modality is image.')
            # Preprocess a single image for the classifier.
            print('Reading image')
            image = cv2.cvtColor(filename, cv2.COLOR_BGR2RGB)
            image = cv2.resize(image, (224, 224))
            return image

        elif self.input_modality == 'audio':
            print('Input modality is audio.')
            # Load audio and add a batch dimension.
            x, sr = librosa.load(filename)
            x_pt = torch.Tensor(x)
            x_pt = torch.unsqueeze(x_pt, dim=0)
            return x_pt

        else:
            raise ValueError("Invalid input modality. Must be 'video', 'image', or 'audio'.")
detection_video_pipeline = DetectionPipeline(n_frames=5, batch_size=1, input_modality='video')
detection_image_pipeline = DetectionPipeline(batch_size=1, input_modality='image')
def deepfakes_video_predict(input_video):
    """Classify a video as real or fake from its sampled frames."""
    faces = detection_video_pipeline(input_video)
    real_res = []
    fake_res = []
    for face in faces:
        # Scale pixel values to [0, 1] and add a batch dimension.
        face2 = face / 255
        pred = model.predict(np.expand_dims(face2, axis=0))[0]
        real, fake = pred[0], pred[1]
        real_res.append(real)
        fake_res.append(fake)

    real_mean = np.mean(real_res)
    fake_mean = np.mean(fake_res)
    print(f"Real Faces: {real_mean}")
    print(f"Fake Faces: {fake_mean}")

    if real_mean >= 0.5:
        text = "The video is REAL. \n Deepfakes Confidence: " + str(round(100 - (real_mean * 100), 3)) + "%"
    else:
        text = "The video is FAKE. \n Deepfakes Confidence: " + str(round(fake_mean * 100, 3)) + "%"
    return text
def deepfakes_image_predict(input_image):
    """Classify a single image as real or fake."""
    face = detection_image_pipeline(input_image)
    face2 = face / 255
    pred = model.predict(np.expand_dims(face2, axis=0))[0]
    real, fake = pred[0], pred[1]
    if real > 0.5:
        text2 = "The image is REAL. \n Deepfakes Confidence: " + str(round(100 - (real * 100), 3)) + "%"
    else:
        text2 = "The image is FAKE. \n Deepfakes Confidence: " + str(round(fake * 100, 3)) + "%"
    return text2
def load_audio_model():
    """Build RawNet2 and load the pretrained anti-spoofing checkpoint."""
    d_args = {
        "nb_samp": 64600,
        "first_conv": 1024,
        "in_channels": 1,
        "filts": [20, [20, 20], [20, 128], [128, 128]],
        "blocks": [2, 4],
        "nb_fc_node": 1024,
        "gru_node": 1024,
        "nb_gru_layer": 3,
        "nb_classes": 2,
    }
    model = RawNet(d_args=d_args, device='cpu')
    # Load pretrained checkpoint.
    ckpt = torch.load('pre_trained_DF_RawNet2.pth', map_location=torch.device('cpu'))
    model.load_state_dict(ckpt)
    return model
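
# --- Illustrative sketch, not part of the original pipeline. ---
# The RawNet2 config above fixes "nb_samp" at 64600 samples (roughly 4 seconds
# at 16 kHz). If the pretrained checkpoint assumes fixed-length input, a
# waveform could be repeated or truncated to that length before inference.
# The helper name below is hypothetical.
def pad_or_trim_waveform(x_pt, nb_samp=64600):
    """Repeat or truncate a (1, T) waveform tensor to exactly nb_samp samples."""
    n = x_pt.shape[1]
    if n >= nb_samp:
        return x_pt[:, :nb_samp]
    # Tile the waveform enough times to cover nb_samp, then cut to length.
    n_repeats = nb_samp // n + 1
    return x_pt.repeat(1, n_repeats)[:, :nb_samp]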
audio_label_map = {
    0: "Real audio",
    1: "Fake audio",
}
def deepfakes_audio_predict(input_audio):
    """Classify an audio clip as real or fake.

    `input_audio` is expected to be a (waveform, sample_rate) tuple, e.g. the
    output of librosa.load.
    """
    # Prepare the waveform: tensor conversion plus a batch dimension.
    x, sr = input_audio
    x_pt = torch.Tensor(x)
    x_pt = torch.unsqueeze(x_pt, dim=0)

    # Load model.
    model = load_audio_model()

    # Perform inference and take the argmax over the two classes.
    grads = model(x_pt)
    grads_np = grads.detach().numpy()
    result = np.argmax(grads_np)
    return audio_label_map[result]
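
# --- Illustrative usage (assumption: the original Space wires these functions
# into a UI elsewhere; the file names below are placeholders for local media). ---
if __name__ == "__main__":
    print(deepfakes_image_predict(cv2.imread("test_image.jpg")))
    print(deepfakes_video_predict("test_video.mp4"))
    print(deepfakes_audio_predict(librosa.load("test_audio.wav")))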