multimodal-deepfakes

Runtime error

App Files Files Community

Kimata committed on Jul 2, 2023

Commit

e50136c

0 Parent(s):

add audio modality

Browse files

Files changed (20) hide show

.gitattributes +5 -0
README.md +37 -0
__pycache__/pipeline.cpython-39.pyc +0 -0
__pycache__/rawnet.cpython-39.pyc +0 -0
app.py +35 -0
audios/DF_E_2000027.flac +0 -0
audios/DF_E_2000028.flac +0 -0
audios/DF_E_2000031.flac +0 -0
audios/DF_E_2000032.flac +0 -0
efficientnet-b0.zip +3 -0
images/fake_image.jpg +0 -0
images/lady.jpg +0 -0
packages.txt +3 -0
pipeline.ipynb +790 -0
pipeline.py +223 -0
pre_trained_DF_RawNet2.pth +3 -0
rawnet.py +363 -0
requirements.txt +8 -0
videos/celeb_synthesis.mp4 +0 -0
videos/real-1.mp4 +0 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,5 @@

+efficientnet-b0/ filter=lfs diff=lfs merge=lfs -text
+efficientnet-b0.zip filter=lfs diff=lfs merge=lfs -text
+pre_trained_DF_RawNet2.pth filter=lfs diff=lfs merge=lfs -text
+efficientnet-b0/* filter=lfs diff=lfs merge=lfs -text
+efficientnet-b0/** filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,37 @@

+---
+title: Deepfakes_Video_Detector
+emoji: 🔥
+colorFrom: blue
+colorTo: gray
+sdk: gradio
+app_file: app.py
+pinned: false
+---
+# Configuration
+`title`: _string_
+Display title for the Space
+`emoji`: _string_
+Space emoji (only emoji characters are allowed)
+`colorFrom`: _string_
+Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+`colorTo`: _string_
+Color for Thumbnail gradient (red, yellow, green, blue, indigo, purple, pink, gray)
+`sdk`: _string_
+Can be either `gradio`, `streamlit`, or `static`
+`sdk_version` : _string_
+Only applicable for `streamlit` SDK.
+See [doc](https://hf.co/docs/hub/spaces) for more info on supported versions.
+`app_file`: _string_
+Path to your main application file (which contains either `gradio` or `streamlit` Python code, or `static` html code).
+Path is relative to the root of the repository.
+`pinned`: _boolean_
+Whether the Space stays on top of your list.

__pycache__/pipeline.cpython-39.pyc ADDED Viewed

Binary file (5.84 kB). View file

__pycache__/rawnet.cpython-39.pyc ADDED Viewed

Binary file (9.72 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,35 @@

+import gradio as gr
+import pipeline
+title="EfficientNetV2 Deepfakes Video Detector"
+description="EfficientNetV2 Deepfakes Image Detector by using frame-by-frame detection."
+video_interface = gr.Interface(pipeline.deepfakes_video_predict,
+                    gr.Video(),
+                    "text",
+                    examples = ["videos/celeb_synthesis.mp4", "videos/real-1.mp4"],
+                    cache_examples = False
+                    )
+image_interface = gr.Interface(pipeline.deepfakes_image_predict,
+                    gr.Image(),
+                    "text",
+                    examples = ["images/lady.jpg", "images/fake_image.jpg"],
+                    cache_examples=False
+                    )
+audio_interface = gr.Interface(pipeline.deepfakes_audio_predict,
+                               gr.Audio(),
+                               "text",
+                               examples = ["audios\DF_E_2000027.flac", "audios\DF_E_2000031.flac"],
+                               cache_examples = False)
+app = gr.TabbedInterface(interface_list= [image_interface, video_interface, audio_interface],
+                         tab_names = ['Image inference', 'Video inference', 'audio_interface'])
+if __name__ == '__main__':
+    app.launch(share = True)

audios/DF_E_2000027.flac ADDED Viewed

Binary file (30.3 kB). View file

audios/DF_E_2000028.flac ADDED Viewed

Binary file (29.7 kB). View file

audios/DF_E_2000031.flac ADDED Viewed

Binary file (65.2 kB). View file

audios/DF_E_2000032.flac ADDED Viewed

Binary file (80.3 kB). View file

efficientnet-b0.zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4e4c321c6a075d0d72676a00f3610db80f8dcd04e276af0c2ddf6d88cd9b2596
+size 22846906

images/fake_image.jpg ADDED Viewed

images/lady.jpg ADDED Viewed

packages.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+ffmpeg
+libsm6
+libxext6

pipeline.ipynb ADDED Viewed

	@@ -0,0 +1,790 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\debonair\\anaconda3\\lib\\site-packages\\numpy\\_distributor_init.py:30: UserWarning: loaded more than 1 DLL from .libs:\n",
+      "c:\\Users\\debonair\\anaconda3\\lib\\site-packages\\numpy\\.libs\\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll\n",
+      "c:\\Users\\debonair\\anaconda3\\lib\\site-packages\\numpy\\.libs\\libopenblas64__v0.3.23-gcc_10_3_0.dll\n",
+      "  warnings.warn(\"loaded more than 1 DLL from .libs:\"\n",
+      "c:\\Users\\debonair\\anaconda3\\lib\\site-packages\\tensorflow_addons\\utils\\tfa_eol_msg.py:23: UserWarning: \n",
+      "\n",
+      "TensorFlow Addons (TFA) has ended development and introduction of new features.\n",
+      "TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.\n",
+      "Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). \n",
+      "\n",
+      "For more information see: https://github.com/tensorflow/addons/issues/2807 \n",
+      "\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "import cv2\n",
+    "import numpy as np\n",
+    "from PIL import Image\n",
+    "import tensorflow as tf\n",
+    "import tensorflow_addons\n",
+    "import moviepy.editor as mp\n",
+    "from facenet_pytorch import MTCNN"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "OSError",
+     "evalue": "No file or directory found at FINAL-EFFICIENTNETV2-B0",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mOSError\u001b[0m                                   Traceback (most recent call last)",
+      "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_25172\\3936866724.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[0mmtcnn\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mMTCNN\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmargin\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m14\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkeep_all\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfactor\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0.7\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'cpu'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[1;31m#Load model.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mkeras\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmodels\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload_model\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"FINAL-EFFICIENTNETV2-B0\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python39\\site-packages\\keras\\utils\\traceback_utils.py\u001b[0m in \u001b[0;36merror_handler\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m     68\u001b[0m             \u001b[1;31m# To get the full stack trace, call:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     69\u001b[0m             \u001b[1;31m# `tf.debugging.disable_traceback_filtering()`\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 70\u001b[1;33m             \u001b[1;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfiltered_tb\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     71\u001b[0m         \u001b[1;32mfinally\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     72\u001b[0m             \u001b[1;32mdel\u001b[0m \u001b[0mfiltered_tb\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
+      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python39\\site-packages\\keras\\saving\\save.py\u001b[0m in \u001b[0;36mload_model\u001b[1;34m(filepath, custom_objects, compile, options)\u001b[0m\n\u001b[0;32m    224\u001b[0m                 \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilepath_str\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    225\u001b[0m                     \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mtf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgfile\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilepath_str\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 226\u001b[1;33m                         raise IOError(\n\u001b[0m\u001b[0;32m    227\u001b[0m                             \u001b[1;34mf\"No file or directory found at {filepath_str}\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    228\u001b[0m                         )\n",
+      "\u001b[1;31mOSError\u001b[0m: No file or directory found at FINAL-EFFICIENTNETV2-B0"
+     ]
+    }
+   ],
+   "source": [
+    "# Load face detector\n",
+    "mtcnn = MTCNN(margin=14, keep_all=True, factor=0.7, device='cpu')\n",
+    "#Load model.\n",
+    "model = tf.keras.models.load_model(\"FINAL-EFFICIENTNETV2-B0\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Face Detection function, Reference: (Timesler, 2020);\n",
+    "class DetectionPipeline:\n",
+    "    \"\"\"Pipeline class for detecting faces in the frames of a video file.\"\"\"\n",
+    "\n",
+    "    def __init__(self, detector, n_frames=None, batch_size=60, resize=None, input_modality = 'video'):\n",
+    "        \"\"\"Constructor for DetectionPipeline class.\n",
+    "\n",
+    "        Keyword Arguments:\n",
+    "            n_frames {int} -- Total number of frames to load. These will be evenly spaced\n",
+    "                throughout the video. If not specified (i.e., None), all frames will be loaded.\n",
+    "                (default: {None})\n",
+    "            batch_size {int} -- Batch size to use with MTCNN face detector. (default: {60})\n",
+    "            resize {float} -- Fraction by which to resize frames from original prior to face\n",
+    "                detection. A value less than 1 results in downsampling and a value greater than\n",
+    "                1 results in upsampling. (default: {None})\n",
+    "        \"\"\"\n",
+    "        self.detector = detector\n",
+    "        self.n_frames = n_frames\n",
+    "        self.batch_size = batch_size\n",
+    "        self.resize = resize\n",
+    "        self.input_modality = input_modality\n",
+    "\n",
+    "    def __call__(self, filename):\n",
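+    "        \"\"\"Load frames from a video file (e.g. MP4), or a single image, and detect faces.\n",
+    "\n",
+    "        Which branch runs depends on the `input_modality` given to the constructor.\n",
+    "\n",
+    "        Arguments:\n",
+    "            filename {str} -- Path to the video or image file.\n",
+    "        \"\"\"\n",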
+    "        # Create video reader and find length\n",
+    "        if self.input_modality == 'video':\n",
+    "            print('Input modality is video.')\n",
+    "            v_cap = cv2.VideoCapture(filename)\n",
+    "            v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))\n",
+    "\n",
+    "            # Pick 'n_frames' evenly spaced frames to sample\n",
+    "            if self.n_frames is None:\n",
+    "                sample = np.arange(0, v_len)\n",
+    "            else:\n",
+    "                sample = np.linspace(0, v_len - 1, self.n_frames).astype(int)\n",
+    "\n",
+    "            # Loop through frames\n",
+    "            faces = []\n",
+    "            frames = []\n",
+    "            for j in range(v_len):\n",
+    "                success = v_cap.grab()\n",
+    "                if j in sample:\n",
+    "                    # Load frame\n",
+    "                    success, frame = v_cap.retrieve()\n",
+    "                    if not success:\n",
+    "                        continue\n",
+    "                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n",
+    "                    # frame = Image.fromarray(frame)\n",
+    "\n",
+    "                    # Resize frame to desired size\n",
+    "                    if self.resize is not None:\n",
+    "                        frame = frame.resize([int(d * self.resize) for d in frame.size])\n",
+    "                    frames.append(frame)\n",
+    "\n",
+    "                    # When batch is full, detect faces and reset frame list\n",
+    "                    if len(frames) % self.batch_size == 0 or j == sample[-1]:\n",
+    "\n",
+    "                        boxes, probs = self.detector.detect(frames)\n",
+    "\n",
+    "                        for i in range(len(frames)):\n",
+    "\n",
+    "                            if boxes[i] is None:\n",
+    "                                faces.append(face2)     #append previous face frame if no face is detected\n",
+    "                                continue\n",
+    "\n",
+    "                            box = boxes[i][0].astype(int)\n",
+    "                            frame = frames[i]\n",
+    "                            face = frame[box[1]:box[3], box[0]:box[2]]\n",
+    "\n",
+    "                            if not face.any():\n",
+    "                                faces.append(face2)     #append previous face frame if no face is detected\n",
+    "                                continue\n",
+    "\n",
+    "                            face2 = cv2.resize(face, (224, 224))\n",
+    "\n",
+    "                            faces.append(face2)\n",
+    "\n",
+    "                        frames = []\n",
+    "\n",
+    "            v_cap.release()\n",
+    "            return faces\n",
+    "\n",
+    "        elif self.input_modality == 'image':\n",
+    "            print('Input modality is image.')\n",
+    "            #Perform inference for image modality.\n",
+    "            image = cv2.imread(filename)\n",
+    "            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)\n",
+    "            boxes, probs = self.detector.detect(image)\n",
+    "\n",
+    "            if boxes is None:\n",
+    "                print('No faces found')\n",
+    "\n",
+    "            box = boxes[0].astype(int)\n",
+    "            face = image[box[1]:box[3], box[0]:box[2]]\n",
+    "            face = cv2.resize(face, (224, 224))\n",
+    "\n",
+    "            if not face.any():\n",
+    "                print(\"No faces found...\")\n",
+    "\n",
+    "            return face\n",
+    "        \n",
+    "        else:\n",
+    "            raise ValueError(\"Invalid input modality. Must be either 'video' or image\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "detection_video_pipeline = DetectionPipeline(detector=mtcnn, n_frames=2, batch_size=1, input_modality='video')\n",
+    "def deepfakes_video_predict(input_video):\n",
+    "\n",
+    "    faces = detection_video_pipeline(input_video)\n",
+    "    total = 0\n",
+    "    real = 0\n",
+    "    fake = 0\n",
+    "\n",
+    "    for face in faces:\n",
+    "\n",
+    "        face2 = face/255\n",
+    "        pred = model.predict(np.expand_dims(face2, axis=0))[0]\n",
+    "        total+=1\n",
+    "\n",
+    "        pred2 = pred[1]\n",
+    "\n",
+    "        if pred2 > 0.5:\n",
+    "          fake+=1\n",
+    "        else:\n",
+    "          real+=1\n",
+    "\n",
+    "    fake_ratio = fake/total\n",
+    "\n",
+    "    text =\"\"\n",
+    "    text2 = \"Deepfakes Confidence: \" + str(fake_ratio*100) + \"%\"\n",
+    "\n",
+    "    if fake_ratio >= 0.5:\n",
+    "        text = \"The video is FAKE.\"\n",
+    "    else:\n",
+    "        text = \"The video is REAL.\"\n",
+    "\n",
+    "    return text, text2\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "detection_image_pipeline = DetectionPipeline(detector=mtcnn, batch_size = 1, input_modality = 'image')\n",
+    "def deepfakes_image_predict(input_image):\n",
+    "    faces = detection_image_pipeline(input_image)\n",
+    "    face2 = faces/255\n",
+    "    pred = model.predict(np.expand_dims(face2, axis = 0))[0]\n",
+    "    real, fake = pred[0], pred[1]\n",
+    "    if real > 0.5:\n",
+    "        text = \"The image is REAL.\"\n",
+    "        text2 = \"Deepfakes Confidence: \" + str(round(real*100, 3)) + \"%\"\n",
+    "    else:\n",
+    "        text = \"The image is FAKE.\"\n",
+    "        text2 = \"Deepfakes Confidence: \" + str(round(fake*100, 3)) + \"%\"\n",
+    "    return text, text2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Input modality is image.\n",
+      "1/1 [==============================] - 0s 75ms/step\n",
+      "('The video is FAKE.', 'Deepfakes Confidence: 99.957%')\n",
+      "Input modality is image.\n",
+      "1/1 [==============================] - 0s 85ms/step\n",
+      "('The video is REAL.', 'Deepfakes Confidence: 99.992%')\n"
+     ]
+    }
+   ],
+   "source": [
+    "image_res = deepfakes_image_predict('fake_image.jpg')\n",
+    "print(image_res)\n",
+    "\n",
+    "image_res = deepfakes_image_predict('lady.jpg')\n",
+    "print(image_res)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Input modality is video.\n",
+      "1/1 [==============================] - 0s 80ms/step\n",
+      "1/1 [==============================] - 0s 71ms/step\n",
+      "('The video is FAKE.', 'Deepfakes Confidence: 100.0%')\n"
+     ]
+    }
+   ],
+   "source": [
+    "video_dir = 'Video1-fake-1-ff.mp4'\n",
+    "videos = deepfakes_video_predict(video_dir)\n",
+    "print(videos)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Input modality is video.\n",
+      "1/1 [==============================] - 0s 82ms/step\n",
+      "1/1 [==============================] - 0s 78ms/step\n",
+      "('The video is REAL.', 'Deepfakes Confidence: 0.0%')\n"
+     ]
+    }
+   ],
+   "source": [
+    "video_dir = 'real-1.mp4'\n",
+    "videos = deepfakes_video_predict(video_dir)\n",
+    "print(videos)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Audio modality pipeline."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Load model.\n",
+    "import torch \n",
+    "import torch.nn as nn\n",
+    "import torch.nn.functional as F\n",
+    "from rawnet import SincConv, Residual_block\n",
+    "\n",
+    "\n",
+    "\n",
+    "d_args = {\n",
+    "  \"nb_samp\": 64600,\n",
+    "  \"first_conv\": 1024,\n",
+    "  \"in_channels\": 1,\n",
+    "  \"filts\": [20, [20, 20], [20, 128], [128, 128]],\n",
+    "  \"blocks\": [2, 4],\n",
+    "  \"nb_fc_node\": 1024,\n",
+    "  \"gru_node\": 1024,\n",
+    "  \"nb_gru_layer\": 3,\n",
+    "  \"nb_classes\": 2}\n",
+    "\n",
+    "\n",
+    "class RawNet(nn.Module):\n",
+    "    def __init__(self, d_args, device):\n",
+    "        super(RawNet, self).__init__()\n",
+    "\n",
+    "        \n",
+    "        self.device=device\n",
+    "\n",
+    "        self.Sinc_conv=SincConv(device=self.device,\n",
+    "\t\t\tout_channels = d_args['filts'][0],\n",
+    "\t\t\tkernel_size = d_args['first_conv'],\n",
+    "                        in_channels = d_args['in_channels']\n",
+    "        )\n",
+    "        \n",
+    "        self.first_bn = nn.BatchNorm1d(num_features = d_args['filts'][0])\n",
+    "        self.selu = nn.SELU(inplace=True)\n",
+    "        self.block0 = nn.Sequential(Residual_block(nb_filts = d_args['filts'][1], first = True))\n",
+    "        self.block1 = nn.Sequential(Residual_block(nb_filts = d_args['filts'][1]))\n",
+    "        self.block2 = nn.Sequential(Residual_block(nb_filts = d_args['filts'][2]))\n",
+    "        d_args['filts'][2][0] = d_args['filts'][2][1]\n",
+    "        self.block3 = nn.Sequential(Residual_block(nb_filts = d_args['filts'][2]))\n",
+    "        self.block4 = nn.Sequential(Residual_block(nb_filts = d_args['filts'][2]))\n",
+    "        self.block5 = nn.Sequential(Residual_block(nb_filts = d_args['filts'][2]))\n",
+    "        self.avgpool = nn.AdaptiveAvgPool1d(1)\n",
+    "\n",
+    "        self.fc_attention0 = self._make_attention_fc(in_features = d_args['filts'][1][-1],\n",
+    "            l_out_features = d_args['filts'][1][-1])\n",
+    "        self.fc_attention1 = self._make_attention_fc(in_features = d_args['filts'][1][-1],\n",
+    "            l_out_features = d_args['filts'][1][-1])\n",
+    "        self.fc_attention2 = self._make_attention_fc(in_features = d_args['filts'][2][-1],\n",
+    "            l_out_features = d_args['filts'][2][-1])\n",
+    "        self.fc_attention3 = self._make_attention_fc(in_features = d_args['filts'][2][-1],\n",
+    "            l_out_features = d_args['filts'][2][-1])\n",
+    "        self.fc_attention4 = self._make_attention_fc(in_features = d_args['filts'][2][-1],\n",
+    "            l_out_features = d_args['filts'][2][-1])\n",
+    "        self.fc_attention5 = self._make_attention_fc(in_features = d_args['filts'][2][-1],\n",
+    "            l_out_features = d_args['filts'][2][-1])\n",
+    "\n",
+    "        self.bn_before_gru = nn.BatchNorm1d(num_features = d_args['filts'][2][-1])\n",
+    "        self.gru = nn.GRU(input_size = d_args['filts'][2][-1],\n",
+    "\t\t\thidden_size = d_args['gru_node'],\n",
+    "\t\t\tnum_layers = d_args['nb_gru_layer'],\n",
+    "\t\t\tbatch_first = True)\n",
+    "\n",
+    "        \n",
+    "        self.fc1_gru = nn.Linear(in_features = d_args['gru_node'],\n",
+    "\t\t\tout_features = d_args['nb_fc_node'])\n",
+    "       \n",
+    "        self.fc2_gru = nn.Linear(in_features = d_args['nb_fc_node'],\n",
+    "\t\t\tout_features = d_args['nb_classes'],bias=True)\n",
+    "\t\t\t\n",
+    "       \n",
+    "        self.sig = nn.Sigmoid()\n",
+    "        self.logsoftmax = nn.LogSoftmax(dim=1)\n",
+    "        \n",
+    "    def forward(self, x, y = None):\n",
+    "        \n",
+    "        \n",
+    "        nb_samp = x.shape[0]\n",
+    "        len_seq = x.shape[1]\n",
+    "        x=x.view(nb_samp,1,len_seq)\n",
+    "        \n",
+    "        x = self.Sinc_conv(x)    \n",
+    "        x = F.max_pool1d(torch.abs(x), 3)\n",
+    "        x = self.first_bn(x)\n",
+    "        x =  self.selu(x)\n",
+    "        \n",
+    "        x0 = self.block0(x)\n",
+    "        y0 = self.avgpool(x0).view(x0.size(0), -1) # torch.Size([batch, filter])\n",
+    "        y0 = self.fc_attention0(y0)\n",
+    "        y0 = self.sig(y0).view(y0.size(0), y0.size(1), -1)  # torch.Size([batch, filter, 1])\n",
+    "        x = x0 * y0 + y0  # (batch, filter, time) x (batch, filter, 1)\n",
+    "        \n",
+    "\n",
+    "        x1 = self.block1(x)\n",
+    "        y1 = self.avgpool(x1).view(x1.size(0), -1) # torch.Size([batch, filter])\n",
+    "        y1 = self.fc_attention1(y1)\n",
+    "        y1 = self.sig(y1).view(y1.size(0), y1.size(1), -1)  # torch.Size([batch, filter, 1])\n",
+    "        x = x1 * y1 + y1 # (batch, filter, time) x (batch, filter, 1)\n",
+    "\n",
+    "        x2 = self.block2(x)\n",
+    "        y2 = self.avgpool(x2).view(x2.size(0), -1) # torch.Size([batch, filter])\n",
+    "        y2 = self.fc_attention2(y2)\n",
+    "        y2 = self.sig(y2).view(y2.size(0), y2.size(1), -1)  # torch.Size([batch, filter, 1])\n",
+    "        x = x2 * y2 + y2 # (batch, filter, time) x (batch, filter, 1)\n",
+    "\n",
+    "        x3 = self.block3(x)\n",
+    "        y3 = self.avgpool(x3).view(x3.size(0), -1) # torch.Size([batch, filter])\n",
+    "        y3 = self.fc_attention3(y3)\n",
+    "        y3 = self.sig(y3).view(y3.size(0), y3.size(1), -1)  # torch.Size([batch, filter, 1])\n",
+    "        x = x3 * y3 + y3 # (batch, filter, time) x (batch, filter, 1)\n",
+    "\n",
+    "        x4 = self.block4(x)\n",
+    "        y4 = self.avgpool(x4).view(x4.size(0), -1) # torch.Size([batch, filter])\n",
+    "        y4 = self.fc_attention4(y4)\n",
+    "        y4 = self.sig(y4).view(y4.size(0), y4.size(1), -1)  # torch.Size([batch, filter, 1])\n",
+    "        x = x4 * y4 + y4 # (batch, filter, time) x (batch, filter, 1)\n",
+    "\n",
+    "        x5 = self.block5(x)\n",
+    "        y5 = self.avgpool(x5).view(x5.size(0), -1) # torch.Size([batch, filter])\n",
+    "        y5 = self.fc_attention5(y5)\n",
+    "        y5 = self.sig(y5).view(y5.size(0), y5.size(1), -1)  # torch.Size([batch, filter, 1])\n",
+    "        x = x5 * y5 + y5 # (batch, filter, time) x (batch, filter, 1)\n",
+    "\n",
+    "        x = self.bn_before_gru(x)\n",
+    "        x = self.selu(x)\n",
+    "        x = x.permute(0, 2, 1)     #(batch, filt, time) >> (batch, time, filt)\n",
+    "        self.gru.flatten_parameters()\n",
+    "        x, _ = self.gru(x)\n",
+    "        x = x[:,-1,:]\n",
+    "        x = self.fc1_gru(x)\n",
+    "        x = self.fc2_gru(x)\n",
+    "        output=self.logsoftmax(x)\n",
+    "      \n",
+    "        return output\n",
+    "        \n",
+    "        \n",
+    "\n",
+    "    def _make_attention_fc(self, in_features, l_out_features):\n",
+    "\n",
+    "        l_fc = []\n",
+    "        \n",
+    "        l_fc.append(nn.Linear(in_features = in_features,\n",
+    "\t\t\t        out_features = l_out_features))\n",
+    "\n",
+    "        \n",
+    "\n",
+    "        return nn.Sequential(*l_fc)\n",
+    "\n",
+    "\n",
+    "    def _make_layer(self, nb_blocks, nb_filts, first = False):\n",
+    "        layers = []\n",
+    "        #def __init__(self, nb_filts, first = False):\n",
+    "        for i in range(nb_blocks):\n",
+    "            first = first if i == 0 else False\n",
+    "            layers.append(Residual_block(nb_filts = nb_filts,\n",
+    "\t\t\t\tfirst = first))\n",
+    "            if i == 0: nb_filts[0] = nb_filts[1]\n",
+    "            \n",
+    "        return nn.Sequential(*layers)\n",
+    "\n",
+    "    def summary(self, input_size, batch_size=-1, device=\"cuda\", print_fn = None):\n",
+    "        if print_fn == None: printfn = print\n",
+    "        model = self\n",
+    "        \n",
+    "        def register_hook(module):\n",
+    "            def hook(module, input, output):\n",
+    "                class_name = str(module.__class__).split(\".\")[-1].split(\"'\")[0]\n",
+    "                module_idx = len(summary)\n",
+    "                \n",
+    "                m_key = \"%s-%i\" % (class_name, module_idx + 1)\n",
+    "                summary[m_key] = OrderedDict()\n",
+    "                summary[m_key][\"input_shape\"] = list(input[0].size())\n",
+    "                summary[m_key][\"input_shape\"][0] = batch_size\n",
+    "                if isinstance(output, (list, tuple)):\n",
+    "                    summary[m_key][\"output_shape\"] = [\n",
+    "\t\t\t\t\t\t[-1] + list(o.size())[1:] for o in output\n",
+    "\t\t\t\t\t]\n",
+    "                else:\n",
+    "                    summary[m_key][\"output_shape\"] = list(output.size())\n",
+    "                    if len(summary[m_key][\"output_shape\"]) != 0:\n",
+    "                        summary[m_key][\"output_shape\"][0] = batch_size\n",
+    "                        \n",
+    "                params = 0\n",
+    "                if hasattr(module, \"weight\") and hasattr(module.weight, \"size\"):\n",
+    "                    params += torch.prod(torch.LongTensor(list(module.weight.size())))\n",
+    "                    summary[m_key][\"trainable\"] = module.weight.requires_grad\n",
+    "                if hasattr(module, \"bias\") and hasattr(module.bias, \"size\"):\n",
+    "                    params += torch.prod(torch.LongTensor(list(module.bias.size())))\n",
+    "                summary[m_key][\"nb_params\"] = params\n",
+    "                \n",
+    "            if (\n",
+    "\t\t\t\tnot isinstance(module, nn.Sequential)\n",
+    "\t\t\t\tand not isinstance(module, nn.ModuleList)\n",
+    "\t\t\t\tand not (module == model)\n",
+    "\t\t\t):\n",
+    "                hooks.append(module.register_forward_hook(hook))\n",
+    "                \n",
+    "        device = device.lower()\n",
+    "        assert device in [\n",
+    "\t\t\t\"cuda\",\n",
+    "\t\t\t\"cpu\",\n",
+    "\t\t], \"Input device is not valid, please specify 'cuda' or 'cpu'\"\n",
+    "        \n",
+    "        if device == \"cuda\" and torch.cuda.is_available():\n",
+    "            dtype = torch.cuda.FloatTensor\n",
+    "        else:\n",
+    "            dtype = torch.FloatTensor\n",
+    "        if isinstance(input_size, tuple):\n",
+    "            input_size = [input_size]\n",
+    "        x = [torch.rand(2, *in_size).type(dtype) for in_size in input_size]\n",
+    "        summary = OrderedDict()\n",
+    "        hooks = []\n",
+    "        model.apply(register_hook)\n",
+    "        model(*x)\n",
+    "        for h in hooks:\n",
+    "            h.remove()\n",
+    "            \n",
+    "        print_fn(\"----------------------------------------------------------------\")\n",
+    "        line_new = \"{:>20}  {:>25} {:>15}\".format(\"Layer (type)\", \"Output Shape\", \"Param #\")\n",
+    "        print_fn(line_new)\n",
+    "        print_fn(\"================================================================\")\n",
+    "        total_params = 0\n",
+    "        total_output = 0\n",
+    "        trainable_params = 0\n",
+    "        for layer in summary:\n",
+    "            # input_shape, output_shape, trainable, nb_params\n",
+    "            line_new = \"{:>20}  {:>25} {:>15}\".format(\n",
+    "\t\t\t\tlayer,\n",
+    "\t\t\t\tstr(summary[layer][\"output_shape\"]),\n",
+    "\t\t\t\t\"{0:,}\".format(summary[layer][\"nb_params\"]),\n",
+    "\t\t\t)\n",
+    "            total_params += summary[layer][\"nb_params\"]\n",
+    "            total_output += np.prod(summary[layer][\"output_shape\"])\n",
+    "            if \"trainable\" in summary[layer]:\n",
+    "                if summary[layer][\"trainable\"] == True:\n",
+    "                    trainable_params += summary[layer][\"nb_params\"]\n",
+    "            print_fn(line_new)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<All keys matched successfully>"
+      ]
+     },
+     "execution_count": 46,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model = RawNet(d_args = d_args, device = 'cpu')\n",
+    "model_dict = model.state_dict()\n",
+    "ckpt = torch.load('pre_trained_DF_RawNet2.pth', map_location = torch.device('cpu'))\n",
+    "model.load_state_dict(ckpt, model_dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import librosa"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_and_preprocess_audio(audio_path):\n",
+    "    '''Loads and returns a torch tensor object'''\n",
+    "    x, sr = librosa.load(audio_path)\n",
+    "    x_pt = torch.Tensor(x)\n",
+    "    x_pt = torch.unsqueeze(x_pt, dim = 0)\n",
+    "    return x_pt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[2.5792e-05, 3.1405e-05, 4.5405e-05,  ..., 0.0000e+00, 0.0000e+00,\n",
+       "         0.0000e+00]])"
+      ]
+     },
+     "execution_count": 49,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ds = load_and_preprocess_audio(audio_path = 'audios/DF_E_2000027.flac')\n",
+    "ds"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "grads = model(ds)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[-6.5565e-06, -1.1934e+01]], grad_fn=<LogSoftmaxBackward0>)"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "grads"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "batch = grads[:, 1].data.cpu().numpy().ravel()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([-11.933539], dtype=float32)"
+      ]
+     },
+     "execution_count": 79,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "batch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "_, batch_pred = grads.max(dim=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([0])"
+      ]
+     },
+     "execution_count": 83,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "batch_pred"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1"
+      ]
+     },
+     "execution_count": 58,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "res = np.argmin(grads.detach().numpy())\n",
+    "res"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "-11.933546"
+      ]
+     },
+     "execution_count": 63,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "grads.detach().numpy()[0][0] + grads.detach().numpy()[0][1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

pipeline.py ADDED Viewed

	@@ -0,0 +1,223 @@

+import os
+import cv2
+import torch
+import zipfile
+import librosa
+import numpy as np
+import tensorflow as tf
+import tensorflow_addons
+from facenet_pytorch import MTCNN
+from rawnet import RawNet
# Unpack the bundled classifier archive into the working directory so that
# Keras can load it from "efficientnet-b0/" below.
local_zip = "./efficientnet-b0.zip"
# The context manager closes the archive even when extraction fails.
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall()
# Load models.
# MTCNN face detector (CPU), shared by the video and image pipelines.
mtcnn = MTCNN(margin=14, keep_all=True, factor=0.7, device='cpu')
# Keras face classifier loaded from the directory extracted above.
model = tf.keras.models.load_model("efficientnet-b0/")
class DetectionPipeline:
    """Turn a video, image or audio input into what the classifiers consume.

    The behaviour is selected by ``input_modality``:

    * ``'video'`` -- sample frames, detect faces and return 224x224 face crops.
    * ``'image'`` -- detect a face in one image array and return a 224x224 crop.
    * ``'audio'`` -- load the waveform and return it as a ``(1, n_samples)`` tensor.
    """

    def __init__(self, detector, n_frames=None, batch_size=60, resize=None, input_modality = 'video'):
        """Constructor for DetectionPipeline class.

        Arguments:
            detector -- Face detector exposing ``detect(frames)`` which returns
                ``(boxes, probabilities)``.

        Keyword Arguments:
            n_frames {int} -- Total number of frames to load. These will be evenly spaced
                throughout the video. If not specified (i.e., None), all frames will be loaded.
                (default: {None})
            batch_size {int} -- Number of frames handed to the detector at once. (default: {60})
            resize {float} -- Fraction by which to resize frames from original prior to face
                detection. A value less than 1 results in downsampling and a value greater than
                1 result in upsampling. (default: {None})
            input_modality {str} -- One of 'video', 'image' or 'audio'. (default: {'video'})
        """
        self.detector = detector
        self.n_frames = n_frames
        self.batch_size = batch_size
        self.resize = resize
        self.input_modality = input_modality

    def __call__(self, filename):
        """Process one input according to ``input_modality``.

        Arguments:
            filename -- Path to the video/audio file. In 'image' mode this is the
                image array itself (it is passed straight to ``cv2.cvtColor``).

        Returns:
            'video': list of 224x224 face crops; 'image': one 224x224 face crop;
            'audio': tensor of shape (1, n_samples).

        Raises:
            ValueError -- If the modality is unknown, or no face is found in 'image' mode.
        """
        if self.input_modality == 'video':
            return self._faces_from_video(filename)
        if self.input_modality == 'image':
            return self._face_from_image(filename)
        if self.input_modality == 'audio':
            return self._load_audio(filename)
        raise ValueError("Invalid input modality. Must be one of 'video', 'image' or 'audio'")

    @staticmethod
    def _crop_first_face(frame, boxes):
        """Crop the first box in *boxes* out of *frame*; return None if there is no usable box."""
        if boxes is None or len(boxes) == 0:
            return None
        box = boxes[0].astype(int)
        # Clamp the top-left corner: a negative start index would wrap around
        # to the other side of the frame instead of starting at the border.
        left, top = max(box[0], 0), max(box[1], 0)
        crop = frame[top:box[3], left:box[2]]
        if crop.size == 0:
            return None
        return crop

    def _detect_batch(self, frames, faces, last_face):
        """Detect one face per frame, append the crops to *faces*, return the latest crop."""
        boxes, _ = self.detector.detect(frames)
        for frame, frame_boxes in zip(frames, boxes):
            crop = self._crop_first_face(frame, frame_boxes)
            if crop is not None and crop.any():
                last_face = cv2.resize(crop, (224, 224))
            elif last_face is None:
                # No face here and nothing earlier to fall back on: skip the frame.
                continue
            # When no face was detected this re-appends the previous face crop.
            faces.append(last_face)
        return last_face

    def _faces_from_video(self, filename):
        """Sample frames from the video at *filename* and return their face crops."""
        print('Input modality is video.')
        # Create video reader and find length
        v_cap = cv2.VideoCapture(filename)
        try:
            v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
            # Pick 'n_frames' evenly spaced frames to sample
            if self.n_frames is None:
                sample = np.arange(0, v_len)
            else:
                sample = np.linspace(0, v_len - 1, self.n_frames).astype(int)
            # A set gives O(1) membership tests inside the frame loop.
            wanted = set(sample.tolist())
            faces = []
            frames = []
            last_face = None
            # Loop through frames
            for j in range(v_len):
                v_cap.grab()
                if j not in wanted:
                    continue
                # Load frame
                success, frame = v_cap.retrieve()
                if not success:
                    continue
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                # Resize frame to desired size
                if self.resize is not None:
                    frame = cv2.resize(frame, None, fx=self.resize, fy=self.resize)
                frames.append(frame)
                # When batch is full, detect faces and reset frame list
                if len(frames) >= self.batch_size:
                    last_face = self._detect_batch(frames, faces, last_face)
                    frames = []
            # Flush the final, possibly partial, batch.
            if frames:
                self._detect_batch(frames, faces, last_face)
        finally:
            v_cap.release()
        return faces

    def _face_from_image(self, image):
        """Return the 224x224 crop of the first face detected in the *image* array."""
        print('Input modality is image.')
        #Perform inference for image modality.
        print('Reading image')
        # NOTE(review): this swaps the first and last channel; confirm the caller
        # really supplies BGR data.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        boxes, _ = self.detector.detect(image)
        face = self._crop_first_face(image, boxes)
        if face is None:
            print('No faces found')
            raise ValueError("No faces found in the image.")
        if not face.any():
            print("No faces found...")
        return cv2.resize(face, (224, 224))

    def _load_audio(self, filename):
        """Load the audio file at *filename* and return it as a (1, n_samples) tensor."""
        print("INput modality is audio.")
        #Load audio.
        x, sr = librosa.load(filename)
        x_pt = torch.Tensor(x)
        x_pt = torch.unsqueeze(x_pt, dim = 0)
        return x_pt
# Shared pipeline instances, both backed by the module-level MTCNN detector.
# Video: 5 evenly spaced frames, handed to the detector one frame at a time.
detection_video_pipeline = DetectionPipeline(
    detector=mtcnn,
    n_frames=5,
    batch_size=1,
    input_modality='video',
)
# Image: one image array per call.
detection_image_pipeline = DetectionPipeline(
    detector=mtcnn,
    batch_size=1,
    input_modality='image',
)
def deepfakes_video_predict(input_video):
    """Classify a video as REAL or FAKE from the faces found in its sampled frames.

    Each face crop returned by ``detection_video_pipeline`` is scaled to [0, 1]
    and scored by the module-level ``model``; the per-face scores are averaged.

    Arguments:
        input_video -- Path to the video file.

    Returns:
        A human-readable verdict string including the deepfake confidence, or a
        message saying that no face could be found.
    """
    faces = detection_video_pipeline(input_video)
    # Without any face the means below would be NaN and the video would be
    # reported as FAKE with "nan%" confidence.
    if not faces:
        return "No faces were detected in the video, so no prediction could be made."
    real_res = []
    fake_res = []
    for face in faces:
        # pred holds the two class scores; index 0 is used as "real", 1 as "fake".
        pred = model.predict(np.expand_dims(face / 255, axis=0))[0]
        real_res.append(pred[0])
        fake_res.append(pred[1])
    real_mean = np.mean(real_res)
    fake_mean = np.mean(fake_res)
    print(f"Real Faces: {real_mean}")
    print(f"Fake Faces: {fake_mean}")
    if real_mean >= 0.5:
        text = "The video is REAL. \n Deepfakes Confidence: " + str(round(100 - (real_mean*100), 3)) + "%"
    else:
        text = "The video is FAKE. \n Deepfakes Confidence: " + str(round(fake_mean*100, 3)) + "%"
    return text
def deepfakes_image_predict(input_image):
    """Classify a single image as REAL or FAKE.

    The face crop from ``detection_image_pipeline`` is scaled to [0, 1] and
    scored by the module-level ``model``; score 0 is read as "real" and score 1
    as "fake".

    Arguments:
        input_image -- Image array handed to the image pipeline.

    Returns:
        A human-readable verdict string including the deepfake confidence.
    """
    face = detection_image_pipeline(input_image)
    batch = np.expand_dims(face/255, axis = 0)
    scores = model.predict(batch)[0]
    real = scores[0]
    fake = scores[1]
    if real > 0.5:
        return "The image is REAL. \n Deepfakes Confidence: " + str(round(100 - (real*100), 3)) + "%"
    return "The image is FAKE. \n Deepfakes Confidence: " + str(round(fake*100, 3)) + "%"
def load_audio_model():
    """Build the RawNet audio classifier and load its pretrained weights on CPU.

    Returns:
        The ``RawNet`` module with the weights from
        'pre_trained_DF_RawNet2.pth' loaded, switched to evaluation mode.
    """
    d_args = {
        "nb_samp": 64600,
        "first_conv": 1024,
        "in_channels": 1,
        "filts": [20, [20, 20], [20, 128], [128, 128]],
        "blocks": [2, 4],
        "nb_fc_node": 1024,
        "gru_node": 1024,
        "nb_gru_layer": 3,
        "nb_classes": 2}
    model = RawNet(d_args = d_args, device='cpu')
    #Load pretrained ckpt.
    ckpt = torch.load('pre_trained_DF_RawNet2.pth', map_location=torch.device('cpu'))
    # load_state_dict() fills the module in place and returns a report of
    # missing/unexpected keys, not the module, so its result must not replace
    # `model`.
    model.load_state_dict(ckpt)
    # Inference only: make BatchNorm use its stored running statistics.
    model.eval()
    return model
def deepfakes_audio_predict(input_audio):
    """Run the RawNet audio classifier on one audio file.

    Arguments:
        input_audio -- Path to the audio file.

    Returns:
        Index of the class with the highest log-probability, as returned by
        ``np.argmax``.
        NOTE(review): which index means "fake" is not visible here; confirm
        against the checkpoint's training labels.

    Raises:
        ValueError -- If the file contains no audio samples.
    """
    #Perform inference on audio.
    # Resample to 16 kHz: rawnet.SincConv builds its filters for
    # sample_rate=16000, while librosa would otherwise resample to its own default.
    x, sr = librosa.load(input_audio, sr=16000)
    if x.size == 0:
        raise ValueError("The audio file contains no samples.")
    # NOTE(review): load_audio_model() configures nb_samp=64600; this assumes the
    # checkpoint expects clips of exactly that length, so short clips are
    # repeated and long ones cut. Confirm against the training setup.
    nb_samp = 64600
    if x.shape[0] < nb_samp:
        x = np.tile(x, nb_samp // x.shape[0] + 1)
    x = x[:nb_samp]
    x_pt = torch.Tensor(x)
    x_pt = torch.unsqueeze(x_pt, dim = 0)
    #Load model.
    model = load_audio_model()
    #Perform inference.
    # No gradients are needed for prediction.
    with torch.no_grad():
        grads = model(x_pt)
    #Get the argmax.
    grads_np = grads.detach().numpy()
    result = np.argmax(grads_np)
    return result

pre_trained_DF_RawNet2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:52d8ad5f524a0f600c7c876d7a157a8f06c44a03504d0b2795c852f5e42c9127
+size 70515422

rawnet.py ADDED Viewed

	@@ -0,0 +1,363 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+import numpy as np
+from torch.utils import data
+from collections import OrderedDict
+from torch.nn.parameter import Parameter
+___author__ = "Hemlata Tak"
+__email__ = "[email protected]"
class SincConv(nn.Module):
    """1-D convolution with fixed, Hamming-windowed sinc band-pass kernels.

    The band edges are spaced evenly on the mel scale between 0 Hz and
    ``sample_rate / 2``; filter ``i`` passes the band between edge ``i`` and
    edge ``i + 1``. The kernels are not trainable and the module has no
    parameters or buffers, so nothing of it appears in ``state_dict()``.
    """

    @staticmethod
    def to_mel(hz):
        """Convert a frequency in Hz to the mel scale."""
        return 2595 * np.log10(1 + hz / 700)

    @staticmethod
    def to_hz(mel):
        """Convert a mel-scale value back to Hz."""
        return 700 * (10 ** (mel / 2595) - 1)

    def __init__(self, device,out_channels, kernel_size,in_channels=1,sample_rate=16000,
                 stride=1, padding=0, dilation=1, bias=False, groups=1):
        """Build the filter bank.

        Arguments:
            device -- Stored as ``self.device``; the filters themselves follow
                the device of the input passed to ``forward``.
            out_channels {int} -- Number of band-pass filters.
            kernel_size {int} -- Filter length; an even value is increased by one.
            in_channels {int} -- Must be 1.
            sample_rate {int} -- Sampling rate the filters are designed for.
            stride, padding, dilation -- Forwarded to ``F.conv1d``.
            bias, groups -- Unsupported; must stay at ``False`` / ``1``.

        Raises:
            ValueError -- For ``in_channels != 1``, ``bias=True`` or ``groups > 1``.
        """
        super(SincConv,self).__init__()
        if in_channels != 1:
            msg = "SincConv only support one input channel (here, in_channels = {%i})" % (in_channels)
            raise ValueError(msg)
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.sample_rate=sample_rate
        # Forcing the filters to be odd (i.e, perfectly symmetrics)
        if kernel_size%2==0:
            self.kernel_size=self.kernel_size+1
        self.device=device
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        if bias:
            raise ValueError('SincConv does not support bias.')
        if groups > 1:
            raise ValueError('SincConv does not support groups.')
        # initialize filterbanks using Mel scale
        NFFT = 512
        f=int(self.sample_rate/2)*np.linspace(0,1,int(NFFT/2)+1)
        fmel=self.to_mel(f)   # Hz to mel conversion
        fmelmax=np.max(fmel)
        fmelmin=np.min(fmel)
        filbandwidthsmel=np.linspace(fmelmin,fmelmax,self.out_channels+1)
        filbandwidthsf=self.to_hz(filbandwidthsmel)  # Mel to Hz conversion
        self.mel=filbandwidthsf
        self.hsupp=torch.arange(-(self.kernel_size-1)/2, (self.kernel_size-1)/2+1)
        # The filters depend only on the constructor arguments, so build them
        # once here instead of on every forward pass.
        self.band_pass=self._build_band_pass()

    def _build_band_pass(self):
        """Return the (out_channels, kernel_size) float32 bank of windowed sinc filters."""
        hsupp = self.hsupp.numpy().astype(np.float64)
        window = np.hamming(self.kernel_size)
        bank = np.zeros((self.out_channels, self.kernel_size))
        for i in range(len(self.mel)-1):
            fmin=self.mel[i]
            fmax=self.mel[i+1]
            # A band-pass is the difference of two low-pass (sinc) responses.
            hHigh=(2*fmax/self.sample_rate)*np.sinc(2*fmax*hsupp/self.sample_rate)
            hLow=(2*fmin/self.sample_rate)*np.sinc(2*fmin*hsupp/self.sample_rate)
            bank[i,:]=window*(hHigh-hLow)
        return torch.from_numpy(bank).float()

    def forward(self,x):
        """Convolve ``x`` of shape (batch, 1, time) with the filter bank.

        Returns:
            Tensor of shape (batch, out_channels, time').
        """
        # conv1d needs the weights on the same device and dtype as the input.
        band_pass_filter=self.band_pass.to(device=x.device, dtype=x.dtype)
        self.filters = (band_pass_filter).view(self.out_channels, 1, self.kernel_size)
        return F.conv1d(x, self.filters, stride=self.stride,
                        padding=self.padding, dilation=self.dilation,
                         bias=None, groups=1)
class Residual_block(nn.Module):
    """Residual unit of two width-3 convolutions followed by a max-pool of 3.

    ``nb_filts`` is ``[in_channels, out_channels]``. When the two differ, the
    shortcut goes through a 1x1 convolution so that it can be added to the
    main branch.
    """

    def __init__(self, nb_filts, first = False):
        super().__init__()
        in_ch, out_ch = nb_filts[0], nb_filts[1]
        self.first = first
        # Every block except the first owns an input batch norm.
        if not self.first:
            self.bn1 = nn.BatchNorm1d(num_features=in_ch)
        self.lrelu = nn.LeakyReLU(negative_slope=0.3)
        self.conv1 = nn.Conv1d(in_ch, out_ch, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm1d(num_features=out_ch)
        self.conv2 = nn.Conv1d(out_ch, out_ch, kernel_size=3, stride=1, padding=1)
        # A 1x1 projection is only needed when the channel count changes.
        self.downsample = in_ch != out_ch
        if self.downsample:
            self.conv_downsample = nn.Conv1d(in_ch, out_ch, kernel_size=1, stride=1, padding=0)
        self.mp = nn.MaxPool1d(3)

    def forward(self, x):
        """Map (batch, in_channels, time) to (batch, out_channels, time // 3)."""
        shortcut = x
        if not self.first:
            # Runs bn1 + activation; the result is overwritten just below.
            out = self.lrelu(self.bn1(x))
        # NOTE(review): conv1 reads `x`, not the pre-activated `out`, so the
        # bn1/lrelu output above never reaches the result. Left as is: weights
        # trained with this forward pass would presumably not match a
        # "corrected" one -- confirm before changing.
        out = self.conv1(x)
        out = self.conv2(self.lrelu(self.bn2(out)))
        if self.downsample:
            shortcut = self.conv_downsample(shortcut)
        out += shortcut
        return self.mp(out)
class RawNet(nn.Module):
    """Raw-waveform classifier: sinc filter bank, six attention-scaled residual
    blocks, a GRU and two linear layers ending in a log-softmax.

    ``d_args`` keys read by the constructor: 'filts', 'first_conv',
    'in_channels', 'gru_node', 'nb_gru_layer', 'nb_fc_node', 'nb_classes'.
    """

    def __init__(self, d_args, device):
        """Create all layers.

        Arguments:
            d_args {dict} -- Architecture configuration (see class docstring).
                It is only read, never modified.
            device -- Device handed to the ``SincConv`` front end.
        """
        super(RawNet, self).__init__()
        self.device=device
        # Work on copies so the caller's configuration stays untouched and the
        # same dict can be used to build several models.
        first_filts = list(d_args['filts'][1])
        widen_filts = list(d_args['filts'][2])
        # Blocks after the widening one keep the channel count.
        wide_filts = list(widen_filts)
        wide_filts[0] = wide_filts[1]
        self.Sinc_conv=SincConv(device=self.device,
            out_channels = d_args['filts'][0],
            kernel_size = d_args['first_conv'],
            in_channels = d_args['in_channels']
        )
        self.first_bn = nn.BatchNorm1d(num_features = d_args['filts'][0])
        self.selu = nn.SELU(inplace=True)
        self.block0 = nn.Sequential(Residual_block(nb_filts = first_filts, first = True))
        self.block1 = nn.Sequential(Residual_block(nb_filts = first_filts))
        self.block2 = nn.Sequential(Residual_block(nb_filts = widen_filts))
        self.block3 = nn.Sequential(Residual_block(nb_filts = wide_filts))
        self.block4 = nn.Sequential(Residual_block(nb_filts = wide_filts))
        self.block5 = nn.Sequential(Residual_block(nb_filts = wide_filts))
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        # One attention layer per residual block, sized to that block's output.
        self.fc_attention0 = self._make_attention_fc(in_features = first_filts[-1],
            l_out_features = first_filts[-1])
        self.fc_attention1 = self._make_attention_fc(in_features = first_filts[-1],
            l_out_features = first_filts[-1])
        self.fc_attention2 = self._make_attention_fc(in_features = wide_filts[-1],
            l_out_features = wide_filts[-1])
        self.fc_attention3 = self._make_attention_fc(in_features = wide_filts[-1],
            l_out_features = wide_filts[-1])
        self.fc_attention4 = self._make_attention_fc(in_features = wide_filts[-1],
            l_out_features = wide_filts[-1])
        self.fc_attention5 = self._make_attention_fc(in_features = wide_filts[-1],
            l_out_features = wide_filts[-1])
        self.bn_before_gru = nn.BatchNorm1d(num_features = wide_filts[-1])
        self.gru = nn.GRU(input_size = wide_filts[-1],
            hidden_size = d_args['gru_node'],
            num_layers = d_args['nb_gru_layer'],
            batch_first = True)
        self.fc1_gru = nn.Linear(in_features = d_args['gru_node'],
            out_features = d_args['nb_fc_node'])
        self.fc2_gru = nn.Linear(in_features = d_args['nb_fc_node'],
            out_features = d_args['nb_classes'],bias=True)
        self.sig = nn.Sigmoid()
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, x, y = None):
        """Classify a batch of waveforms.

        Arguments:
            x -- Tensor of shape (batch, samples).
            y -- Unused.

        Returns:
            Log-probabilities of shape (batch, nb_classes).
        """
        nb_samp = x.shape[0]
        len_seq = x.shape[1]
        x = x.reshape(nb_samp, 1, len_seq)
        x = self.Sinc_conv(x)
        x = F.max_pool1d(torch.abs(x), 3)
        x = self.first_bn(x)
        x = self.selu(x)
        for idx in range(6):
            block = getattr(self, "block%d" % idx)
            fc_attention = getattr(self, "fc_attention%d" % idx)
            feat = block(x)
            scale = self.avgpool(feat).view(feat.size(0), -1)  # torch.Size([batch, filter])
            scale = fc_attention(scale)
            scale = self.sig(scale).view(scale.size(0), scale.size(1), -1)  # torch.Size([batch, filter, 1])
            x = feat * scale + scale  # (batch, filter, time) x (batch, filter, 1)
        x = self.bn_before_gru(x)
        x = self.selu(x)
        x = x.permute(0, 2, 1)     #(batch, filt, time) >> (batch, time, filt)
        self.gru.flatten_parameters()
        x, _ = self.gru(x)
        # Keep only the GRU output at the last time step.
        x = x[:,-1,:]
        x = self.fc1_gru(x)
        x = self.fc2_gru(x)
        output=self.logsoftmax(x)
        return output

    def _make_attention_fc(self, in_features, l_out_features):
        """Return a ``Sequential`` holding a single ``Linear(in_features, l_out_features)``."""
        l_fc = []
        l_fc.append(nn.Linear(in_features = in_features,
                    out_features = l_out_features))
        return nn.Sequential(*l_fc)

    def _make_layer(self, nb_blocks, nb_filts, first = False):
        """Stack ``nb_blocks`` residual blocks; only the first may change the width."""
        layers = []
        # Copy so the caller's list is not modified.
        filts = list(nb_filts)
        for i in range(nb_blocks):
            layers.append(Residual_block(nb_filts = filts,
                first = first if i == 0 else False))
            if i == 0: filts[0] = filts[1]
        return nn.Sequential(*layers)

    def summary(self, input_size, batch_size=-1, device="cuda", print_fn = None):
        """Print a per-layer table of output shapes and parameter counts.

        A random batch of two samples is pushed through the model with forward
        hooks attached. The pass runs in eval mode without gradients and every
        module's training flag is restored afterwards.

        Arguments:
            input_size -- Shape of one sample without the batch dimension (a
                tuple), or a list of such tuples for several inputs.
            batch_size {int} -- Value shown in the batch position of the shapes.
            device {str} -- 'cuda' or 'cpu'; 'cuda' falls back to CPU when CUDA
                is unavailable.
            print_fn -- Callable receiving each output line. (default: ``print``)
        """
        if print_fn is None:
            print_fn = print
        model = self
        summary = OrderedDict()
        hooks = []

        def register_hook(module):
            def hook(module, input, output):
                class_name = str(module.__class__).split(".")[-1].split("'")[0]
                module_idx = len(summary)
                m_key = "%s-%i" % (class_name, module_idx + 1)
                summary[m_key] = OrderedDict()
                summary[m_key]["input_shape"] = list(input[0].size())
                summary[m_key]["input_shape"][0] = batch_size
                if isinstance(output, (list, tuple)):
                    summary[m_key]["output_shape"] = [
                        [-1] + list(o.size())[1:] for o in output
                    ]
                else:
                    summary[m_key]["output_shape"] = list(output.size())
                    if len(summary[m_key]["output_shape"]) != 0:
                        summary[m_key]["output_shape"][0] = batch_size
                # Only `weight` and `bias` attributes are counted.
                params = 0
                if hasattr(module, "weight") and hasattr(module.weight, "size"):
                    params += module.weight.numel()
                    summary[m_key]["trainable"] = module.weight.requires_grad
                if hasattr(module, "bias") and hasattr(module.bias, "size"):
                    params += module.bias.numel()
                summary[m_key]["nb_params"] = params

            # Containers and the model itself are skipped; only leaf-like
            # modules get a row.
            if (
                not isinstance(module, (nn.Sequential, nn.ModuleList))
                and module is not model
            ):
                hooks.append(module.register_forward_hook(hook))

        device = device.lower()
        assert device in [
            "cuda",
            "cpu",
        ], "Input device is not valid, please specify 'cuda' or 'cpu'"
        use_cuda = device == "cuda" and torch.cuda.is_available()
        run_device = torch.device("cuda" if use_cuda else "cpu")
        if isinstance(input_size, tuple):
            input_size = [input_size]
        x = [torch.rand(2, *in_size, device=run_device) for in_size in input_size]

        # Remember each module's mode so the probe pass leaves no trace.
        modes = [(m, m.training) for m in model.modules()]
        model.apply(register_hook)
        try:
            # Eval mode keeps BatchNorm running statistics untouched by the
            # random probe batch.
            model.eval()
            with torch.no_grad():
                model(*x)
        finally:
            for h in hooks:
                h.remove()
            for m, was_training in modes:
                m.training = was_training

        print_fn("----------------------------------------------------------------")
        line_new = "{:>20}  {:>25} {:>15}".format("Layer (type)", "Output Shape", "Param #")
        print_fn(line_new)
        print_fn("================================================================")
        total_params = 0
        trainable_params = 0
        for layer, info in summary.items():
            # input_shape, output_shape, trainable, nb_params
            line_new = "{:>20}  {:>25} {:>15}".format(
                layer,
                str(info["output_shape"]),
                "{0:,}".format(info["nb_params"]),
            )
            total_params += info["nb_params"]
            if info.get("trainable"):
                trainable_params += info["nb_params"]
            print_fn(line_new)
        print_fn("================================================================")
        print_fn("Total params: {0:,}".format(total_params))
        print_fn("Trainable params: {0:,}".format(trainable_params))

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

tensorflow
tensorflow-addons
facenet_pytorch
# pipeline.py and rawnet.py import torch directly.
torch
# pipeline.py imports librosa to decode audio files.
librosa
numpy
opencv-python
opencv-python-headless
mtcnn
moviepy

videos/celeb_synthesis.mp4 ADDED Viewed

Binary file (209 kB). View file

videos/real-1.mp4 ADDED Viewed

Binary file (631 kB). View file