prithivMLmods committed on
Commit 8c2ebad · verified · 1 Parent(s): 531c1e4

Upload Inkscope-Captions-2B-0526 [ Video Understanding ] Demo (#1)

- Upload Inkscope-Captions-2B-0526 [ Video Understanding ] Demo (5babef05b1050b0b41d691e98386b558e46d9d47)

Inkscope-Captions-2B-0526-Video-Understanding/Inkscope-Captions-2B-0526-Video-Understanding.ipynb ADDED
@@ -0,0 +1,164 @@
+ {
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "provenance": [],
+ "gpuType": "T4"
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ },
+ "accelerator": "GPU"
+ },
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "id": "XKQwuI75LWLA"
+ },
+ "outputs": [],
+ "source": [
+ "%%capture\n",
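+ "# Gradio provides the demo UI; transformers/accelerate/torch run inference;\n",
+ "# opencv-python handles frame extraction; qwen-vl-utils supports Qwen-VL inputs\n",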
+ "!pip install gradio transformers pillow opencv-python\n",
+ "!pip install accelerate torchvision torch huggingface_hub\n",
+ "!pip install hf_xet qwen-vl-utils gradio_client\n",
+ "!pip install transformers-stream-generator spaces"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import os\n",
+ "import uuid\n",
+ "import time\n",
+ "from threading import Thread\n",
+ "\n",
+ "import gradio as gr\n",
+ "import torch\n",
+ "import numpy as np\n",
+ "import cv2\n",
+ "from PIL import Image\n",
+ "from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer\n",
+ "\n",
+ "# Use CUDA if available\n",
+ "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")\n",
+ "\n",
+ "# Load the Inkscope-Captions-2B-0526 model and processor\n",
+ "MODEL_ID = \"prithivMLmods/Inkscope-Captions-2B-0526\"\n",
+ "processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)\n",
+ "model = Qwen2VLForConditionalGeneration.from_pretrained(\n",
+ "    MODEL_ID,\n",
+ "    trust_remote_code=True,\n",
+ "    torch_dtype=torch.float16\n",
+ ").to(device).eval()\n",
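+ "# float16 keeps the 2B model comfortably within the Colab T4's 16 GB of VRAM;\n",
+ "# on a CPU-only runtime, drop torch_dtype and keep the float32 default\n",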
+ "\n",
+ "# Constants\n",
+ "MAX_INPUT_TOKEN_LENGTH = 4096\n",
+ "\n",
+ "\n",
+ "def downsample_video(video_path: str, num_frames: int = 10):\n",
+ "    \"\"\"\n",
+ "    Extracts 'num_frames' evenly spaced frames from the video.\n",
+ "    Returns a list of (PIL.Image, timestamp_seconds).\n",
+ "    \"\"\"\n",
+ "    vidcap = cv2.VideoCapture(video_path)\n",
+ "    total = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))\n",
+ "    fps = vidcap.get(cv2.CAP_PROP_FPS) or 1\n",
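+ "    # CAP_PROP_FPS can be 0 on malformed files, so fall back to 1 to keep the\n",
+ "    # timestamps finite; linspace spreads num_frames indices evenly over [0, total - 1]\n",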
+ "    indices = np.linspace(0, total - 1, num_frames, dtype=int)\n",
+ "    frames = []\n",
+ "    for idx in indices:\n",
+ "        vidcap.set(cv2.CAP_PROP_POS_FRAMES, idx)\n",
+ "        ret, frame = vidcap.read()\n",
+ "        if not ret:\n",
+ "            continue\n",
+ "        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)\n",
+ "        pil = Image.fromarray(frame)\n",
+ "        timestamp = round(idx / fps, 2)\n",
+ "        frames.append((pil, timestamp))\n",
+ "    vidcap.release()\n",
+ "    return frames\n",
+ "\n",
+ "\n",
+ "def generate(video_file: str):\n",
+ "    \"\"\"\n",
+ "    Runs the uploaded video through the captioning model and returns the full response.\n",
+ "    \"\"\"\n",
+ "    # Step 1: extract frames\n",
+ "    frames = downsample_video(video_file)\n",
+ "\n",
+ "    # Step 2: build chat-like messages\n",
+ "    messages = [\n",
+ "        {\"role\": \"system\", \"content\": [{\"type\": \"text\", \"text\": \"You are a helpful assistant for video understanding.\"}]},\n",
+ "        {\"role\": \"user\", \"content\": [{\"type\": \"text\", \"text\": \"Please explain the content of the following video frames:\"}]},\n",
+ "    ]\n",
+ "    frame_paths = []\n",
+ "    for img, ts in frames:\n",
+ "        # save each frame as a temporary image the processor can load by path\n",
+ "        path = f\"frame_{uuid.uuid4().hex}.png\"\n",
+ "        img.save(path)\n",
+ "        frame_paths.append(path)\n",
+ "        messages[1][\"content\"].append({\"type\": \"text\", \"text\": f\"Frame at {ts}s:\"})\n",
+ "        messages[1][\"content\"].append({\"type\": \"image\", \"url\": path})\n",
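+ "        # NOTE: passing local file paths via {\"type\": \"image\", \"url\": path} assumes\n",
+ "        # a recent transformers release whose chat template loads images from paths/URLs\n",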
+ "\n",
+ "    # Step 3: tokenize with truncation\n",
+ "    inputs = processor.apply_chat_template(\n",
+ "        messages,\n",
+ "        tokenize=True,\n",
+ "        add_generation_prompt=True,\n",
+ "        return_dict=True,\n",
+ "        return_tensors=\"pt\",\n",
+ "        truncation=True,\n",
+ "        max_length=MAX_INPUT_TOKEN_LENGTH\n",
+ "    ).to(device)\n",
+ "\n",
+ "    # Step 4: stream the generation\n",
+ "    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)\n",
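+ "    # model.generate() blocks until it finishes, so it runs on a worker thread\n",
+ "    # while this thread drains the streamer chunk by chunk as tokens arrive\n",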
+ "    gen_kwargs = {\n",
+ "        **inputs,\n",
+ "        \"streamer\": streamer,\n",
+ "        \"max_new_tokens\": 1024,\n",
+ "        \"do_sample\": True,\n",
+ "        \"temperature\": 0.7,\n",
+ "    }\n",
+ "    thread = Thread(target=model.generate, kwargs=gen_kwargs)\n",
+ "    thread.start()\n",
+ "\n",
+ "    # collect all streamed tokens\n",
+ "    buffer = \"\"\n",
+ "    for chunk in streamer:\n",
+ "        buffer += chunk.replace(\"<|im_end|>\", \"\")\n",
+ "        time.sleep(0.01)\n",
+ "    thread.join()\n",
+ "\n",
+ "    # remove the temporary frame images now that generation is done\n",
+ "    for p in frame_paths:\n",
+ "        if os.path.exists(p):\n",
+ "            os.remove(p)\n",
+ "\n",
+ "    # return the full concatenated response\n",
+ "    return buffer\n",
+ "\n",
+ "\n",
+ "def launch_app():\n",
+ "    demo = gr.Interface(\n",
+ "        fn=generate,\n",
+ "        inputs=gr.Video(label=\"Upload Video\"),\n",
+ "        outputs=gr.Textbox(label=\"Video Caption\"),\n",
+ "        title=\"Video Understanding with Inkscope-Captions-2B-0526\",\n",
+ "        description=\"Upload a video and get a caption describing its frames.\",\n",
+ "        allow_flagging=\"never\"\n",
+ "    )\n",
+ "    demo.queue().launch(debug=True)\n",
+ "\n",
+ "\n",
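+ "# For quick headless testing, generate() can be called directly on a local video\n",
+ "# file instead of launching the UI, e.g. print(generate(\"/path/to/video.mp4\"))\n",
+ "# (the path here is illustrative)\n",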
+ "if __name__ == \"__main__\":\n",
+ "    launch_app()"
+ ],
+ "metadata": {
+ "id": "GZXqC00zLbS1"
+ },
+ "execution_count": null,
+ "outputs": []
+ }
+ ]
+ }