multimodal-deepfakes

Runtime error

File size: 31,149 Bytes

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\Users\\debonair\\anaconda3\\lib\\site-packages\\numpy\\_distributor_init.py:30: UserWarning: loaded more than 1 DLL from .libs:\n",
      "c:\\Users\\debonair\\anaconda3\\lib\\site-packages\\numpy\\.libs\\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll\n",
      "c:\\Users\\debonair\\anaconda3\\lib\\site-packages\\numpy\\.libs\\libopenblas64__v0.3.23-gcc_10_3_0.dll\n",
      "  warnings.warn(\"loaded more than 1 DLL from .libs:\"\n",
      "c:\\Users\\debonair\\anaconda3\\lib\\site-packages\\tensorflow_addons\\utils\\tfa_eol_msg.py:23: UserWarning: \n",
      "\n",
      "TensorFlow Addons (TFA) has ended development and introduction of new features.\n",
      "TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.\n",
      "Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). \n",
      "\n",
      "For more information see: https://github.com/tensorflow/addons/issues/2807 \n",
      "\n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "import cv2\n",
    "import numpy as np\n",
    "from PIL import Image\n",
    "import tensorflow as tf\n",
    "import tensorflow_addons\n",
    "import moviepy.editor as mp\n",
    "from facenet_pytorch import MTCNN"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "ename": "OSError",
     "evalue": "No file or directory found at FINAL-EFFICIENTNETV2-B0",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mOSError\u001b[0m                                   Traceback (most recent call last)",
      "\u001b[1;32m~\\AppData\\Local\\Temp\\ipykernel_25172\\3936866724.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[0mmtcnn\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mMTCNN\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmargin\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m14\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkeep_all\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mTrue\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfactor\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m0.7\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdevice\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'cpu'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[1;31m#Load model.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mmodel\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mkeras\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmodels\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mload_model\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"FINAL-EFFICIENTNETV2-B0\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python39\\site-packages\\keras\\utils\\traceback_utils.py\u001b[0m in \u001b[0;36merror_handler\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m     68\u001b[0m             \u001b[1;31m# To get the full stack trace, call:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     69\u001b[0m             \u001b[1;31m# `tf.debugging.disable_traceback_filtering()`\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 70\u001b[1;33m             \u001b[1;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfiltered_tb\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     71\u001b[0m         \u001b[1;32mfinally\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     72\u001b[0m             \u001b[1;32mdel\u001b[0m \u001b[0mfiltered_tb\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python39\\site-packages\\keras\\saving\\save.py\u001b[0m in \u001b[0;36mload_model\u001b[1;34m(filepath, custom_objects, compile, options)\u001b[0m\n\u001b[0;32m    224\u001b[0m                 \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilepath_str\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    225\u001b[0m                     \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mtf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mio\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgfile\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilepath_str\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 226\u001b[1;33m                         raise IOError(\n\u001b[0m\u001b[0;32m    227\u001b[0m                             \u001b[1;34mf\"No file or directory found at {filepath_str}\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    228\u001b[0m                         )\n",
      "\u001b[1;31mOSError\u001b[0m: No file or directory found at FINAL-EFFICIENTNETV2-B0"
     ]
    }
   ],
   "source": [
    "# Load face detector\n",
    "mtcnn = MTCNN(margin=14, keep_all=True, factor=0.7, device='cpu')\n",
    "#Load model.\n",
    "model = tf.keras.models.load_model(\"FINAL-EFFICIENTNETV2-B0\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Face Detection function, Reference: (Timesler, 2020);\n",
    "class DetectionPipeline:\n",
    "    \"\"\"Pipeline class for detecting faces in the frames of a video file.\"\"\"\n",
    "\n",
    "    def __init__(self, detector, n_frames=None, batch_size=60, resize=None, input_modality = 'video'):\n",
    "        \"\"\"Constructor for DetectionPipeline class.\n",
    "\n",
    "        Keyword Arguments:\n",
    "            n_frames {int} -- Total number of frames to load. These will be evenly spaced\n",
    "                throughout the video. If not specified (i.e., None), all frames will be loaded.\n",
    "                (default: {None})\n",
    "            batch_size {int} -- Batch size to use with MTCNN face detector. (default: {32})\n",
    "            resize {float} -- Fraction by which to resize frames from original prior to face\n",
    "                detection. A value less than 1 results in downsampling and a value greater than\n",
    "                1 result in upsampling. (default: {None})\n",
    "        \"\"\"\n",
    "        self.detector = detector\n",
    "        self.n_frames = n_frames\n",
    "        self.batch_size = batch_size\n",
    "        self.resize = resize\n",
    "        self.input_modality = input_modality\n",
    "\n",
    "    def __call__(self, filename):\n",
    "        \"\"\"Load frames from an MP4 video and detect faces.\n",
    "\n",
    "        Arguments:\n",
    "            filename {str} -- Path to video.\n",
    "        \"\"\"\n",
    "        # Create video reader and find length\n",
    "        if self.input_modality == 'video':\n",
    "            print('Input modality is video.')\n",
    "            v_cap = cv2.VideoCapture(filename)\n",
    "            v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))\n",
    "\n",
    "            # Pick 'n_frames' evenly spaced frames to sample\n",
    "            if self.n_frames is None:\n",
    "                sample = np.arange(0, v_len)\n",
    "            else:\n",
    "                sample = np.linspace(0, v_len - 1, self.n_frames).astype(int)\n",
    "\n",
    "            # Loop through frames\n",
    "            faces = []\n",
    "            frames = []\n",
    "            for j in range(v_len):\n",
    "                success = v_cap.grab()\n",
    "                if j in sample:\n",
    "                    # Load frame\n",
    "                    success, frame = v_cap.retrieve()\n",
    "                    if not success:\n",
    "                        continue\n",
    "                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n",
    "                    # frame = Image.fromarray(frame)\n",
    "\n",
    "                    # Resize frame to desired size\n",
    "                    if self.resize is not None:\n",
    "                        frame = frame.resize([int(d * self.resize) for d in frame.size])\n",
    "                    frames.append(frame)\n",
    "\n",
    "                    # When batch is full, detect faces and reset frame list\n",
    "                    if len(frames) % self.batch_size == 0 or j == sample[-1]:\n",
    "\n",
    "                        boxes, probs = self.detector.detect(frames)\n",
    "\n",
    "                        for i in range(len(frames)):\n",
    "\n",
    "                            if boxes[i] is None:\n",
    "                                faces.append(face2)     #append previous face frame if no face is detected\n",
    "                                continue\n",
    "\n",
    "                            box = boxes[i][0].astype(int)\n",
    "                            frame = frames[i]\n",
    "                            face = frame[box[1]:box[3], box[0]:box[2]]\n",
    "\n",
    "                            if not face.any():\n",
    "                                faces.append(face2)     #append previous face frame if no face is detected\n",
    "                                continue\n",
    "\n",
    "                            face2 = cv2.resize(face, (224, 224))\n",
    "\n",
    "                            faces.append(face2)\n",
    "\n",
    "                        frames = []\n",
    "\n",
    "            v_cap.release()\n",
    "            return faces\n",
    "\n",
    "        elif self.input_modality == 'image':\n",
    "            print('Input modality is image.')\n",
    "            #Perform inference for image modality.\n",
    "            image = cv2.imread(filename)\n",
    "            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)\n",
    "            boxes, probs = self.detector.detect(image)\n",
    "\n",
    "            if boxes is None:\n",
    "                print('No faces found')\n",
    "\n",
    "            box = boxes[0].astype(int)\n",
    "            face = image[box[1]:box[3], box[0]:box[2]]\n",
    "            face = cv2.resize(face, (224, 224))\n",
    "\n",
    "            if not face.any():\n",
    "                print(\"No faces found...\")\n",
    "\n",
    "            return face\n",
    "        \n",
    "        else:\n",
    "            raise ValueError(\"Invalid input modality. Must be either 'video' or image\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "detection_video_pipeline = DetectionPipeline(detector=mtcnn, n_frames=2, batch_size=1, input_modality='video')\n",
    "def deepfakes_video_predict(input_video):\n",
    "\n",
    "    faces = detection_video_pipeline(input_video)\n",
    "    total = 0\n",
    "    real = 0\n",
    "    fake = 0\n",
    "\n",
    "    for face in faces:\n",
    "\n",
    "        face2 = face/255\n",
    "        pred = model.predict(np.expand_dims(face2, axis=0))[0]\n",
    "        total+=1\n",
    "\n",
    "        pred2 = pred[1]\n",
    "\n",
    "        if pred2 > 0.5:\n",
    "          fake+=1\n",
    "        else:\n",
    "          real+=1\n",
    "\n",
    "    fake_ratio = fake/total\n",
    "\n",
    "    text =\"\"\n",
    "    text2 = \"Deepfakes Confidence: \" + str(fake_ratio*100) + \"%\"\n",
    "\n",
    "    if fake_ratio >= 0.37:\n",
    "        text = \"The video is FAKE.\"\n",
    "    else:\n",
    "        text = \"The video is REAL.\"\n",
    "\n",
    "    return text, text2\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "detection_image_pipeline = DetectionPipeline(detector=mtcnn, batch_size = 1, input_modality = 'image')\n",
    "def deepfakes_image_predict(input_image):\n",
    "    faces = detection_image_pipeline(input_image)\n",
    "    face2 = faces/255\n",
    "    pred = model.predict(np.expand_dims(face2, axis = 0))[0]\n",
    "    real, fake = pred[0], pred[1]\n",
    "    if real > 0.5:\n",
    "        text = \"The image is REAL.\"\n",
    "        text2 = \"Deepfakes Confidence: \" + str(round(real*100, 3)) + \"%\"\n",
    "    else:\n",
    "        text = \"The image is FAKE.\"\n",
    "        text2 = \"Deepfakes Confidence: \" + str(round(fake*100, 3)) + \"%\"\n",
    "    return text, text2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Input modality is image.\n",
      "1/1 [==============================] - 0s 75ms/step\n",
      "('The video is FAKE.', 'Deepfakes Confidence: 99.957%')\n",
      "Input modality is image.\n",
      "1/1 [==============================] - 0s 85ms/step\n",
      "('The video is REAL.', 'Deepfakes Confidence: 99.992%')\n"
     ]
    }
   ],
   "source": [
    "image_res = deepfakes_image_predict('fake_image.jpg')\n",
    "print(image_res)\n",
    "\n",
    "image_res = deepfakes_image_predict('lady.jpg')\n",
    "print(image_res)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Input modality is video.\n",
      "1/1 [==============================] - 0s 80ms/step\n",
      "1/1 [==============================] - 0s 71ms/step\n",
      "('The video is FAKE.', 'Deepfakes Confidence: 100.0%')\n"
     ]
    }
   ],
   "source": [
    "video_dir = 'Video1-fake-1-ff.mp4'\n",
    "videos = deepfakes_video_predict(video_dir)\n",
    "print(videos)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Input modality is video.\n",
      "1/1 [==============================] - 0s 82ms/step\n",
      "1/1 [==============================] - 0s 78ms/step\n",
      "('The video is REAL.', 'Deepfakes Confidence: 0.0%')\n"
     ]
    }
   ],
   "source": [
    "video_dir = 'real-1.mp4'\n",
    "videos = deepfakes_video_predict(video_dir)\n",
    "print(videos)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Audio modality pipeline."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Load model.\n",
    "import torch \n",
    "import torch.nn as nn\n",
    "import torch.nn.functional as F\n",
    "from rawnet import SincConv, Residual_block\n",
    "\n",
    "\n",
    "\n",
    "d_args = {\n",
    "  \"nb_samp\": 64600,\n",
    "  \"first_conv\": 1024,\n",
    "  \"in_channels\": 1,\n",
    "  \"filts\": [20, [20, 20], [20, 128], [128, 128]],\n",
    "  \"blocks\": [2, 4],\n",
    "  \"nb_fc_node\": 1024,\n",
    "  \"gru_node\": 1024,\n",
    "  \"nb_gru_layer\": 3,\n",
    "  \"nb_classes\": 2}\n",
    "\n",
    "\n",
    "class RawNet(nn.Module):\n",
    "    def __init__(self, d_args, device):\n",
    "        super(RawNet, self).__init__()\n",
    "\n",
    "        \n",
    "        self.device=device\n",
    "\n",
    "        self.Sinc_conv=SincConv(device=self.device,\n",
    "\t\t\tout_channels = d_args['filts'][0],\n",
    "\t\t\tkernel_size = d_args['first_conv'],\n",
    "                        in_channels = d_args['in_channels']\n",
    "        )\n",
    "        \n",
    "        self.first_bn = nn.BatchNorm1d(num_features = d_args['filts'][0])\n",
    "        self.selu = nn.SELU(inplace=True)\n",
    "        self.block0 = nn.Sequential(Residual_block(nb_filts = d_args['filts'][1], first = True))\n",
    "        self.block1 = nn.Sequential(Residual_block(nb_filts = d_args['filts'][1]))\n",
    "        self.block2 = nn.Sequential(Residual_block(nb_filts = d_args['filts'][2]))\n",
    "        d_args['filts'][2][0] = d_args['filts'][2][1]\n",
    "        self.block3 = nn.Sequential(Residual_block(nb_filts = d_args['filts'][2]))\n",
    "        self.block4 = nn.Sequential(Residual_block(nb_filts = d_args['filts'][2]))\n",
    "        self.block5 = nn.Sequential(Residual_block(nb_filts = d_args['filts'][2]))\n",
    "        self.avgpool = nn.AdaptiveAvgPool1d(1)\n",
    "\n",
    "        self.fc_attention0 = self._make_attention_fc(in_features = d_args['filts'][1][-1],\n",
    "            l_out_features = d_args['filts'][1][-1])\n",
    "        self.fc_attention1 = self._make_attention_fc(in_features = d_args['filts'][1][-1],\n",
    "            l_out_features = d_args['filts'][1][-1])\n",
    "        self.fc_attention2 = self._make_attention_fc(in_features = d_args['filts'][2][-1],\n",
    "            l_out_features = d_args['filts'][2][-1])\n",
    "        self.fc_attention3 = self._make_attention_fc(in_features = d_args['filts'][2][-1],\n",
    "            l_out_features = d_args['filts'][2][-1])\n",
    "        self.fc_attention4 = self._make_attention_fc(in_features = d_args['filts'][2][-1],\n",
    "            l_out_features = d_args['filts'][2][-1])\n",
    "        self.fc_attention5 = self._make_attention_fc(in_features = d_args['filts'][2][-1],\n",
    "            l_out_features = d_args['filts'][2][-1])\n",
    "\n",
    "        self.bn_before_gru = nn.BatchNorm1d(num_features = d_args['filts'][2][-1])\n",
    "        self.gru = nn.GRU(input_size = d_args['filts'][2][-1],\n",
    "\t\t\thidden_size = d_args['gru_node'],\n",
    "\t\t\tnum_layers = d_args['nb_gru_layer'],\n",
    "\t\t\tbatch_first = True)\n",
    "\n",
    "        \n",
    "        self.fc1_gru = nn.Linear(in_features = d_args['gru_node'],\n",
    "\t\t\tout_features = d_args['nb_fc_node'])\n",
    "       \n",
    "        self.fc2_gru = nn.Linear(in_features = d_args['nb_fc_node'],\n",
    "\t\t\tout_features = d_args['nb_classes'],bias=True)\n",
    "\t\t\t\n",
    "       \n",
    "        self.sig = nn.Sigmoid()\n",
    "        self.logsoftmax = nn.LogSoftmax(dim=1)\n",
    "        \n",
    "    def forward(self, x, y = None):\n",
    "        \n",
    "        \n",
    "        nb_samp = x.shape[0]\n",
    "        len_seq = x.shape[1]\n",
    "        x=x.view(nb_samp,1,len_seq)\n",
    "        \n",
    "        x = self.Sinc_conv(x)    \n",
    "        x = F.max_pool1d(torch.abs(x), 3)\n",
    "        x = self.first_bn(x)\n",
    "        x =  self.selu(x)\n",
    "        \n",
    "        x0 = self.block0(x)\n",
    "        y0 = self.avgpool(x0).view(x0.size(0), -1) # torch.Size([batch, filter])\n",
    "        y0 = self.fc_attention0(y0)\n",
    "        y0 = self.sig(y0).view(y0.size(0), y0.size(1), -1)  # torch.Size([batch, filter, 1])\n",
    "        x = x0 * y0 + y0  # (batch, filter, time) x (batch, filter, 1)\n",
    "        \n",
    "\n",
    "        x1 = self.block1(x)\n",
    "        y1 = self.avgpool(x1).view(x1.size(0), -1) # torch.Size([batch, filter])\n",
    "        y1 = self.fc_attention1(y1)\n",
    "        y1 = self.sig(y1).view(y1.size(0), y1.size(1), -1)  # torch.Size([batch, filter, 1])\n",
    "        x = x1 * y1 + y1 # (batch, filter, time) x (batch, filter, 1)\n",
    "\n",
    "        x2 = self.block2(x)\n",
    "        y2 = self.avgpool(x2).view(x2.size(0), -1) # torch.Size([batch, filter])\n",
    "        y2 = self.fc_attention2(y2)\n",
    "        y2 = self.sig(y2).view(y2.size(0), y2.size(1), -1)  # torch.Size([batch, filter, 1])\n",
    "        x = x2 * y2 + y2 # (batch, filter, time) x (batch, filter, 1)\n",
    "\n",
    "        x3 = self.block3(x)\n",
    "        y3 = self.avgpool(x3).view(x3.size(0), -1) # torch.Size([batch, filter])\n",
    "        y3 = self.fc_attention3(y3)\n",
    "        y3 = self.sig(y3).view(y3.size(0), y3.size(1), -1)  # torch.Size([batch, filter, 1])\n",
    "        x = x3 * y3 + y3 # (batch, filter, time) x (batch, filter, 1)\n",
    "\n",
    "        x4 = self.block4(x)\n",
    "        y4 = self.avgpool(x4).view(x4.size(0), -1) # torch.Size([batch, filter])\n",
    "        y4 = self.fc_attention4(y4)\n",
    "        y4 = self.sig(y4).view(y4.size(0), y4.size(1), -1)  # torch.Size([batch, filter, 1])\n",
    "        x = x4 * y4 + y4 # (batch, filter, time) x (batch, filter, 1)\n",
    "\n",
    "        x5 = self.block5(x)\n",
    "        y5 = self.avgpool(x5).view(x5.size(0), -1) # torch.Size([batch, filter])\n",
    "        y5 = self.fc_attention5(y5)\n",
    "        y5 = self.sig(y5).view(y5.size(0), y5.size(1), -1)  # torch.Size([batch, filter, 1])\n",
    "        x = x5 * y5 + y5 # (batch, filter, time) x (batch, filter, 1)\n",
    "\n",
    "        x = self.bn_before_gru(x)\n",
    "        x = self.selu(x)\n",
    "        x = x.permute(0, 2, 1)     #(batch, filt, time) >> (batch, time, filt)\n",
    "        self.gru.flatten_parameters()\n",
    "        x, _ = self.gru(x)\n",
    "        x = x[:,-1,:]\n",
    "        x = self.fc1_gru(x)\n",
    "        x = self.fc2_gru(x)\n",
    "        output=self.logsoftmax(x)\n",
    "      \n",
    "        return output\n",
    "        \n",
    "        \n",
    "\n",
    "    def _make_attention_fc(self, in_features, l_out_features):\n",
    "\n",
    "        l_fc = []\n",
    "        \n",
    "        l_fc.append(nn.Linear(in_features = in_features,\n",
    "\t\t\t        out_features = l_out_features))\n",
    "\n",
    "        \n",
    "\n",
    "        return nn.Sequential(*l_fc)\n",
    "\n",
    "\n",
    "    def _make_layer(self, nb_blocks, nb_filts, first = False):\n",
    "        layers = []\n",
    "        #def __init__(self, nb_filts, first = False):\n",
    "        for i in range(nb_blocks):\n",
    "            first = first if i == 0 else False\n",
    "            layers.append(Residual_block(nb_filts = nb_filts,\n",
    "\t\t\t\tfirst = first))\n",
    "            if i == 0: nb_filts[0] = nb_filts[1]\n",
    "            \n",
    "        return nn.Sequential(*layers)\n",
    "\n",
    "    def summary(self, input_size, batch_size=-1, device=\"cuda\", print_fn = None):\n",
    "        if print_fn == None: printfn = print\n",
    "        model = self\n",
    "        \n",
    "        def register_hook(module):\n",
    "            def hook(module, input, output):\n",
    "                class_name = str(module.__class__).split(\".\")[-1].split(\"'\")[0]\n",
    "                module_idx = len(summary)\n",
    "                \n",
    "                m_key = \"%s-%i\" % (class_name, module_idx + 1)\n",
    "                summary[m_key] = OrderedDict()\n",
    "                summary[m_key][\"input_shape\"] = list(input[0].size())\n",
    "                summary[m_key][\"input_shape\"][0] = batch_size\n",
    "                if isinstance(output, (list, tuple)):\n",
    "                    summary[m_key][\"output_shape\"] = [\n",
    "\t\t\t\t\t\t[-1] + list(o.size())[1:] for o in output\n",
    "\t\t\t\t\t]\n",
    "                else:\n",
    "                    summary[m_key][\"output_shape\"] = list(output.size())\n",
    "                    if len(summary[m_key][\"output_shape\"]) != 0:\n",
    "                        summary[m_key][\"output_shape\"][0] = batch_size\n",
    "                        \n",
    "                params = 0\n",
    "                if hasattr(module, \"weight\") and hasattr(module.weight, \"size\"):\n",
    "                    params += torch.prod(torch.LongTensor(list(module.weight.size())))\n",
    "                    summary[m_key][\"trainable\"] = module.weight.requires_grad\n",
    "                if hasattr(module, \"bias\") and hasattr(module.bias, \"size\"):\n",
    "                    params += torch.prod(torch.LongTensor(list(module.bias.size())))\n",
    "                summary[m_key][\"nb_params\"] = params\n",
    "                \n",
    "            if (\n",
    "\t\t\t\tnot isinstance(module, nn.Sequential)\n",
    "\t\t\t\tand not isinstance(module, nn.ModuleList)\n",
    "\t\t\t\tand not (module == model)\n",
    "\t\t\t):\n",
    "                hooks.append(module.register_forward_hook(hook))\n",
    "                \n",
    "        device = device.lower()\n",
    "        assert device in [\n",
    "\t\t\t\"cuda\",\n",
    "\t\t\t\"cpu\",\n",
    "\t\t], \"Input device is not valid, please specify 'cuda' or 'cpu'\"\n",
    "        \n",
    "        if device == \"cuda\" and torch.cuda.is_available():\n",
    "            dtype = torch.cuda.FloatTensor\n",
    "        else:\n",
    "            dtype = torch.FloatTensor\n",
    "        if isinstance(input_size, tuple):\n",
    "            input_size = [input_size]\n",
    "        x = [torch.rand(2, *in_size).type(dtype) for in_size in input_size]\n",
    "        summary = OrderedDict()\n",
    "        hooks = []\n",
    "        model.apply(register_hook)\n",
    "        model(*x)\n",
    "        for h in hooks:\n",
    "            h.remove()\n",
    "            \n",
    "        print_fn(\"----------------------------------------------------------------\")\n",
    "        line_new = \"{:>20}  {:>25} {:>15}\".format(\"Layer (type)\", \"Output Shape\", \"Param #\")\n",
    "        print_fn(line_new)\n",
    "        print_fn(\"================================================================\")\n",
    "        total_params = 0\n",
    "        total_output = 0\n",
    "        trainable_params = 0\n",
    "        for layer in summary:\n",
    "            # input_shape, output_shape, trainable, nb_params\n",
    "            line_new = \"{:>20}  {:>25} {:>15}\".format(\n",
    "\t\t\t\tlayer,\n",
    "\t\t\t\tstr(summary[layer][\"output_shape\"]),\n",
    "\t\t\t\t\"{0:,}\".format(summary[layer][\"nb_params\"]),\n",
    "\t\t\t)\n",
    "            total_params += summary[layer][\"nb_params\"]\n",
    "            total_output += np.prod(summary[layer][\"output_shape\"])\n",
    "            if \"trainable\" in summary[layer]:\n",
    "                if summary[layer][\"trainable\"] == True:\n",
    "                    trainable_params += summary[layer][\"nb_params\"]\n",
    "            print_fn(line_new)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<All keys matched successfully>"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = RawNet(d_args = d_args, device = 'cpu')\n",
    "model_dict = model.state_dict()\n",
    "ckpt = torch.load('pre_trained_DF_RawNet2.pth', map_location = torch.device('cpu'))\n",
    "model.load_state_dict(ckpt, model_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "import librosa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_and_preprocess_audio(audio_path):\n",
    "    '''Loads and returns a torch tensor object'''\n",
    "    x, sr = librosa.load(audio_path)\n",
    "    x_pt = torch.Tensor(x)\n",
    "    x_pt = torch.unsqueeze(x_pt, dim = 0)\n",
    "    return x_pt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[2.5792e-05, 3.1405e-05, 4.5405e-05,  ..., 0.0000e+00, 0.0000e+00,\n",
       "         0.0000e+00]])"
      ]
     },
     "execution_count": 49,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ds = load_and_preprocess_audio(audio_path = 'audios/DF_E_2000027.flac')\n",
    "ds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "grads = model(ds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[-6.5565e-06, -1.1934e+01]], grad_fn=<LogSoftmaxBackward0>)"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grads"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch = grads[:, 1].data.cpu().numpy().ravel()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-11.933539], dtype=float32)"
      ]
     },
     "execution_count": 79,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "batch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "_, batch_pred = grads.max(dim=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([0])"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "batch_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "res = np.argmin(grads.detach().numpy())\n",
    "res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-11.933546"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "grads.detach().numpy()[0][0] + grads.detach().numpy()[0][1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}