prithivMLmods
/

Inkscope-Captions-2B-0526

+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4"
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "metadata": {
+        "id": "XKQwuI75LWLA"
+      },
+      "outputs": [],
+      "source": [
+        "%%capture\n",
+        "# Use the %pip magic (not !pip) so packages are installed into the\n",
+        "# environment of the running kernel rather than whatever pip the shell finds.\n",
+        "%pip install gradio transformers pillow opencv-python\n",
+        "%pip install accelerate torchvision torch huggingface_hub\n",
+        "%pip install hf_xet qwen-vl-utils gradio_client\n",
+        "%pip install transformers-stream-generator spaces"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import os\n",
+        "import uuid\n",
+        "import time\n",
+        "from threading import Thread\n",
+        "\n",
+        "import gradio as gr\n",
+        "import torch\n",
+        "import numpy as np\n",
+        "import cv2\n",
+        "from PIL import Image\n",
+        "from transformers import Qwen2VLForConditionalGeneration, AutoProcessor\n",
+        "\n",
+        "# Use the first CUDA device when one is available, otherwise fall back to CPU\n",
+        "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
+        "\n",
+        "# Load the Inkscope-Captions-2B-0526 multimodal model and its processor\n",
+        "# NOTE(review): trust_remote_code=True allows code from the model repo to run - confirm it is required\n",
+        "MODEL_ID = \"prithivMLmods/Inkscope-Captions-2B-0526\"\n",
+        "processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)\n",
+        "model = Qwen2VLForConditionalGeneration.from_pretrained(\n",
+        "    MODEL_ID,\n",
+        "    trust_remote_code=True,\n",
+        "    # Half precision only on the GPU; the CPU fallback keeps full precision\n",
+        "    torch_dtype=torch.float16 if device.type == \"cuda\" else torch.float32\n",
+        ").to(device).eval()\n",
+        "\n",
+        "# Constants\n",
+        "# Upper bound on prompt tokens passed to the processor in generate()\n",
+        "MAX_INPUT_TOKEN_LENGTH = 4096\n",
+        "\n",
+        "\n",
+        "def downsample_video(video_path: str, num_frames: int = 10):\n",
+        "    \"\"\"\n",
+        "    Extracts up to 'num_frames' evenly spaced frames from the video.\n",
+        "\n",
+        "    Args:\n",
+        "        video_path: Path of the video file to open with OpenCV.\n",
+        "        num_frames: Maximum number of frames to sample.\n",
+        "\n",
+        "    Returns:\n",
+        "        A list of (PIL.Image, timestamp_seconds) in playback order. The list\n",
+        "        is empty when the video cannot be opened, reports no frames, or\n",
+        "        'num_frames' is not positive. Frames that fail to decode are skipped.\n",
+        "    \"\"\"\n",
+        "    frames = []\n",
+        "    vidcap = cv2.VideoCapture(video_path)\n",
+        "    try:\n",
+        "        total = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))\n",
+        "        if not vidcap.isOpened() or total <= 0 or num_frames <= 0:\n",
+        "            return frames\n",
+        "        # A reported FPS of 0 would divide by zero below, so fall back to 1\n",
+        "        fps = vidcap.get(cv2.CAP_PROP_FPS) or 1\n",
+        "        # For clips shorter than num_frames, linspace repeats indices;\n",
+        "        # np.unique keeps each frame once and preserves ascending order.\n",
+        "        indices = np.unique(np.linspace(0, total - 1, num_frames, dtype=int))\n",
+        "        for idx in indices:\n",
+        "            vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))\n",
+        "            ret, frame = vidcap.read()\n",
+        "            if not ret:\n",
+        "                continue\n",
+        "            # OpenCV decodes to BGR; convert so PIL sees RGB\n",
+        "            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n",
+        "            timestamp = round(float(idx) / fps, 2)\n",
+        "            frames.append((Image.fromarray(frame), timestamp))\n",
+        "    finally:\n",
+        "        # Release the capture even if decoding or conversion raises\n",
+        "        vidcap.release()\n",
+        "    return frames\n",
+        "\n",
+        "\n",
+        "def generate(video_file: str):\n",
+        "    \"\"\"\n",
+        "    Caption an uploaded video: sample frames, prompt the model, return its text.\n",
+        "\n",
+        "    Args:\n",
+        "        video_file: Path of the uploaded video file. An empty value is rejected.\n",
+        "\n",
+        "    Returns:\n",
+        "        The complete generated response as a single string.\n",
+        "\n",
+        "    Raises:\n",
+        "        gr.Error: If no video was supplied or no frame could be decoded.\n",
+        "    \"\"\"\n",
+        "    if not video_file:\n",
+        "        raise gr.Error(\"Please upload a video first.\")\n",
+        "\n",
+        "    # Step 1: extract frames\n",
+        "    frames = downsample_video(video_file)\n",
+        "    if not frames:\n",
+        "        raise gr.Error(\"Could not read any frames from the uploaded video.\")\n",
+        "\n",
+        "    # Step 2: build chat-like messages\n",
+        "    messages = [\n",
+        "        {\"role\": \"system\", \"content\": [{\"type\": \"text\", \"text\": \"You are a helpful assistant, for video understanding.\"}]},\n",
+        "        {\"role\": \"user\", \"content\": [{\"type\": \"text\", \"text\": \"Please explain the content of the following video frames:\"}]\n",
+        "        }\n",
+        "    ]\n",
+        "    frame_paths = []\n",
+        "    try:\n",
+        "        for img, ts in frames:\n",
+        "            # save temporary frame image; record the path first so it is cleaned up below\n",
+        "            path = f\"frame_{uuid.uuid4().hex}.png\"\n",
+        "            frame_paths.append(path)\n",
+        "            img.save(path)\n",
+        "            messages[1][\"content\"].append({\"type\": \"text\", \"text\": f\"Frame at {ts}s:\"})\n",
+        "            messages[1][\"content\"].append({\"type\": \"image\", \"url\": path})\n",
+        "\n",
+        "        # Step 3: tokenize with truncation\n",
+        "        # NOTE(review): truncating to MAX_INPUT_TOKEN_LENGTH may cut into image tokens for large frames - confirm\n",
+        "        inputs = processor.apply_chat_template(\n",
+        "            messages,\n",
+        "            tokenize=True,\n",
+        "            add_generation_prompt=True,\n",
+        "            return_dict=True,\n",
+        "            return_tensors=\"pt\",\n",
+        "            truncation=True,\n",
+        "            max_length=MAX_INPUT_TOKEN_LENGTH\n",
+        "        ).to(device)\n",
+        "    finally:\n",
+        "        # The frame files are only referenced while building `inputs`; delete them\n",
+        "        # so repeated requests do not pile up PNGs in the working directory.\n",
+        "        for path in frame_paths:\n",
+        "            if os.path.exists(path):\n",
+        "                os.remove(path)\n",
+        "\n",
+        "    # Step 4: use streamer to collect output\n",
+        "    from transformers import TextIteratorStreamer\n",
+        "    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)\n",
+        "    gen_kwargs = {\n",
+        "        **inputs,\n",
+        "        \"streamer\": streamer,\n",
+        "        \"max_new_tokens\": 1024,\n",
+        "        \"do_sample\": True,\n",
+        "        \"temperature\": 0.7,\n",
+        "    }\n",
+        "    # generate() runs in a worker thread while this thread drains the streamer\n",
+        "    thread = Thread(target=model.generate, kwargs=gen_kwargs)\n",
+        "    thread.start()\n",
+        "\n",
+        "    # collect all decoded chunks, dropping any literal end-of-turn marker\n",
+        "    chunks = [chunk.replace(\"<|im_end|>\", \"\") for chunk in streamer]\n",
+        "    thread.join()\n",
+        "\n",
+        "    # return full concatenated response\n",
+        "    return \"\".join(chunks)\n",
+        "\n",
+        "\n",
+        "def launch_app():\n",
+        "    \"\"\"Assemble the Gradio video-captioning UI around `generate` and serve it.\"\"\"\n",
+        "    video_input = gr.Video(label=\"Upload Video\")\n",
+        "    caption_output = gr.Textbox(label=\"Video Caption\")\n",
+        "    interface = gr.Interface(\n",
+        "        fn=generate,\n",
+        "        inputs=video_input,\n",
+        "        outputs=caption_output,\n",
+        "        title=\"Video Understanding with Inkscope-Captions-2B-0526\",\n",
+        "        description=\"Upload a video and get an OCR-based description of its frames.\",\n",
+        "        allow_flagging=\"never\",\n",
+        "    )\n",
+        "    # Enable request queuing, then launch with debug=True\n",
+        "    queued = interface.queue()\n",
+        "    queued.launch(debug=True)\n",
+        "\n",
+        "\n",
+        "# Start the app when this cell runs as the main module\n",
+        "if __name__ == \"__main__\":\n",
+        "    launch_app()"
+      ],
+      "metadata": {
+        "id": "GZXqC00zLbS1"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}