import time
import logging
import gradio as gr
import cv2
import os
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler
import base64
import gc
import io
from contextlib import redirect_stdout, redirect_stderr
import sys, llama_cpp
# ----------------------------------------
# Model configurations: per-size prefixes and repos
MODELS = {
"256M": {
"model_repo": "mradermacher/SmolVLM2-256M-Video-Instruct-GGUF",
"clip_repo": "ggml-org/SmolVLM2-256M-Video-Instruct-GGUF",
"model_prefix": "SmolVLM2-256M-Video-Instruct",
"clip_prefix": "mmproj-SmolVLM2-256M-Video-Instruct",
"model_variants": ["f16", "Q8_0", "Q2_K", "Q4_K_M"],
"clip_variants": ["Q8_0", "f16"],
},
"500M": {
"model_repo": "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF",
"clip_repo": "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF",
"model_prefix": "SmolVLM2-500M-Video-Instruct",
"clip_prefix": "mmproj-SmolVLM2-500M-Video-Instruct",
"model_variants": ["f16", "Q4_K_M", "Q8_0", "Q2_K"],
"clip_variants": ["Q8_0", "f16"],
},
"2.2B": {
"model_repo": "mradermacher/SmolVLM2-2.2B-Instruct-GGUF",
"clip_repo": "ggml-org/SmolVLM2-2.2B-Instruct-GGUF",
"model_prefix": "SmolVLM2-2.2B-Instruct",
"clip_prefix": "mmproj-SmolVLM2-2.2B-Instruct",
"model_variants": ["f16", "Q4_K_M", "Q8_0", "Q2_K"],
"clip_variants": ["Q8_0", "f16"],
},
}
# ----------------------------------------
# Cache for loaded model instance
model_cache = {
    'size': None,
    'model_file': None,   # requested decoder GGUF filename
    'clip_file': None,    # requested CLIP/mmproj GGUF filename
    'clip_path': None,    # resolved local path of the downloaded CLIP weights
    'verbose': None,
    'n_threads': None,
    'llm': None,
}
# Helper to download weights and return their cache paths
def ensure_weights(cfg, model_file, clip_file):
# Download model and clip into HF cache (writable, e.g. /tmp/.cache)
model_path = hf_hub_download(repo_id=cfg['model_repo'], filename=model_file)
clip_path = hf_hub_download(repo_id=cfg['clip_repo'], filename=clip_file)
return model_path, clip_path
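# Note: hf_hub_download reuses files already present in the Hugging Face cache, so
# repeated calls with the same filenames are cheap. On read-only deployments the cache
# directory must be writable; one option (an assumption, not wired up here) is to point
# HF_HOME at a tmp path before the first download, e.g.:
#   os.environ.setdefault("HF_HOME", "/tmp/.cache")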
# Custom chat handler
class SmolVLM2ChatHandler(Llava15ChatHandler):
CHAT_FORMAT = (
"<|im_start|>"
"{% for message in messages %}"
"{{ message['role'] | capitalize }}"
"{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
"{% else %}: "
"{% endif %}"
"{% for content in message['content'] %}"
"{% if content['type']=='text' %}{{ content['text'] }}"
"{% elif content['type']=='image_url' %}"
"{% if content['image_url'] is string %}"
"{{ content['image_url'] }}\n"
"{% elif content['image_url'] is mapping %}"
"{{ content['image_url']['url'] }}\n"
"{% endif %}"
"{% endif %}"
"{% endfor %}"
"<end_of_utterance>\n"
"{% endfor %}"
"{% if add_generation_prompt %}Assistant:{% endif %}"
)
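# For reference, with a system prompt plus a user message containing an image followed
# by text, the template above renders roughly:
#
#   <|im_start|>System: <system prompt><end_of_utterance>
#   User:<image data URI>
#   <user prompt><end_of_utterance>
#   Assistant:
#
# Llava15ChatHandler then substitutes the embedded image URL with the image embedding
# produced by the CLIP/mmproj model before the decoder runs.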
# Load and cache the LLM (reload only when size, weight files, verbosity, or thread count change)
def update_llm(size, model_file, clip_file, verbose_mode, n_threads):
    # Compare the requested filenames (not the resolved cache paths) against the cache,
    # so an unchanged selection does not trigger a reload
    requested = (size, model_file, clip_file, verbose_mode, n_threads)
    cached = (model_cache['size'], model_cache['model_file'], model_cache['clip_file'],
              model_cache['verbose'], model_cache['n_threads'])
    if requested != cached:
        model_path, clip_path = ensure_weights(MODELS[size], model_file, clip_file)
        handler = SmolVLM2ChatHandler(clip_model_path=clip_path, verbose=verbose_mode)
        llm = Llama(
            model_path=model_path,
            chat_handler=handler,
            n_ctx=512,
            verbose=verbose_mode,
            n_threads=n_threads,
            use_mlock=True,
        )
        model_cache.update({
            'size': size,
            'model_file': model_file,
            'clip_file': clip_file,
            'clip_path': clip_path,
            'verbose': verbose_mode,
            'n_threads': n_threads,
            'llm': llm,
        })
    return None
# Build weight filename lists
def get_weight_files(size):
cfg = MODELS[size]
model_files = [f"{cfg['model_prefix']}.{v}.gguf" for v in cfg['model_variants']]
clip_files = [f"{cfg['clip_prefix']}-{v}.gguf" for v in cfg['clip_variants']]
return model_files, clip_files
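# Example: for the default "500M" size this yields decoder filenames such as
#   SmolVLM2-500M-Video-Instruct.f16.gguf, SmolVLM2-500M-Video-Instruct.Q4_K_M.gguf, ...
# and CLIP/mmproj filenames such as
#   mmproj-SmolVLM2-500M-Video-Instruct-Q8_0.gguf, mmproj-SmolVLM2-500M-Video-Instruct-f16.gguf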
# Caption using cached llm with real-time debug logs
def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt, usr_prompt, reset_clip, verbose_mode):
debug_msgs = []
timestamp = time.strftime('%H:%M:%S')
debug_msgs.append(f"[{timestamp}] Verbose mode: {verbose_mode}")
timestamp = time.strftime('%H:%M:%S')
debug_msgs.append(f"[{timestamp}] llama_cpp version: {llama_cpp.__version__}")
debug_msgs.append(f"[{timestamp}] Python version: {sys.version.split()[0]}")
timestamp = time.strftime('%H:%M:%S')
debug_msgs.append(f"[{timestamp}] Received frame shape: {frame.shape}")
timestamp = time.strftime('%H:%M:%S')
debug_msgs.append(f"[{timestamp}] Using model weights: {model_file}")
debug_msgs.append(f"[{timestamp}] Using CLIP weights: {clip_file}")
t_resize = time.time()
img = cv2.resize(frame.copy(), (384, 384))
elapsed = (time.time() - t_resize) * 1000
timestamp = time.strftime('%H:%M:%S')
debug_msgs.append(f"[{timestamp}] Resized to 384x384 in {elapsed:.1f} ms")
timestamp = time.strftime('%H:%M:%S')
debug_msgs.append(f"[{timestamp}] Sleeping for {interval_ms} ms")
time.sleep(interval_ms / 1000)
t_enc = time.time()
    params = [int(cv2.IMWRITE_JPEG_QUALITY), 75]
    # Gradio streams RGB frames; convert to BGR so cv2.imencode writes correct colors
    success, jpeg = cv2.imencode('.jpg', cv2.cvtColor(img, cv2.COLOR_RGB2BGR), params)
elapsed = (time.time() - t_enc) * 1000
timestamp = time.strftime('%H:%M:%S')
debug_msgs.append(f"[{timestamp}] JPEG encode: success={success}, bytes={len(jpeg)} in {elapsed:.1f} ms")
uri = 'data:image/jpeg;base64,' + base64.b64encode(jpeg.tobytes()).decode()
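    # Note: the custom chat template accepts "image_url" either as a plain string (used
    # below) or as an OpenAI-style {"url": ...} mapping.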
    messages = [
        # Wrap the system prompt in the list form so the chat template (which iterates
        # over content items) renders it rather than skipping a plain string
        {"role": "system", "content": [{"type": "text", "text": sys_prompt}]},
        {"role": "user", "content": [
            {"type": "image_url", "image_url": uri},
            {"type": "text", "text": usr_prompt},
        ]},
    ]
timestamp = time.strftime('%H:%M:%S')
debug_msgs.append(f"[{timestamp}] Sending prompt of length {len(usr_prompt)} to LLM")
    if reset_clip:
        # Rebuild the chat handler from the cached local CLIP path
        # (the dropdown value is only a filename, not a path on disk)
        model_cache['llm'].chat_handler = SmolVLM2ChatHandler(
            clip_model_path=model_cache['clip_path'], verbose=verbose_mode)
        timestamp = time.strftime('%H:%M:%S')
        debug_msgs.append(f"[{timestamp}] Reinitialized chat handler")
timestamp = time.strftime('%H:%M:%S')
debug_msgs.append(f"[{timestamp}] CPU count = {os.cpu_count()}")
if model_cache.get('n_threads') is not None:
debug_msgs.append(f"[{timestamp}] llama_cpp n_threads = {model_cache['n_threads']}")
t_start = time.time()
buf = io.StringIO()
with redirect_stdout(buf), redirect_stderr(buf):
resp = model_cache['llm'].create_chat_completion(
messages=messages,
max_tokens=128,
temperature=0.1,
stop=["<end_of_utterance>"]
)
for line in buf.getvalue().splitlines():
timestamp = time.strftime('%H:%M:%S')
debug_msgs.append(f"[{timestamp}] {line}")
elapsed = (time.time() - t_start) * 1000
timestamp = time.strftime('%H:%M:%S')
debug_msgs.append(f"[{timestamp}] LLM response in {elapsed:.1f} ms")
content = resp.get('choices', [{}])[0].get('message', {}).get('content', '').strip()
timestamp = time.strftime('%H:%M:%S')
debug_msgs.append(f"[{timestamp}] Caption length: {len(content)} chars")
gc.collect()
timestamp = time.strftime('%H:%M:%S')
debug_msgs.append(f"[{timestamp}] Garbage collected")
return content, "\n".join(debug_msgs)
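# Minimal standalone sketch (illustration only, not used by the Gradio app): load the
# default weights, grab one frame with OpenCV, and caption it. The camera index and
# thread count below are assumptions.
#
#   model_files, clip_files = get_weight_files('500M')
#   update_llm('500M', model_files[0], clip_files[0], verbose_mode=False, n_threads=2)
#   ok, frame_bgr = cv2.VideoCapture(0).read()
#   if ok:
#       frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)  # match Gradio's RGB frames
#       caption, log = caption_frame(frame_rgb, '500M', model_files[0], clip_files[0],
#                                    0, "Describe the scene.", "What is happening?",
#                                    reset_clip=False, verbose_mode=False)
#       print(caption)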
# Gradio UI
def main():
logging.basicConfig(level=logging.INFO)
default = '500M'
default_verbose = True
default_threads = 2
mf, cf = get_weight_files(default)
with gr.Blocks() as demo:
gr.Markdown("## 🎥 Real-Time Camera Captioning with Debug Logs")
with gr.Row():
size_dd = gr.Dropdown(list(MODELS.keys()), value=default, label='Model Size')
model_dd = gr.Dropdown(mf, value=mf[0], label='Decoder Weights')
clip_dd = gr.Dropdown(cf, value=cf[0], label='CLIP Weights')
verbose_cb= gr.Checkbox(value=default_verbose, label='Verbose Mode')
thread_dd = gr.Slider(minimum=1, maximum=os.cpu_count(), step=1, value=default_threads, label='CPU Threads (n_threads)')
def on_size_change(sz, verbose, n_threads):
mlist, clist = get_weight_files(sz)
update_llm(sz, mlist[0], clist[0], verbose, n_threads)
return gr.update(choices=mlist, value=mlist[0]), gr.update(choices=clist, value=clist[0])
size_dd.change(
fn=on_size_change,
inputs=[size_dd, verbose_cb, thread_dd],
outputs=[model_dd, clip_dd]
)
        # Any change to the weight, verbosity, or thread controls reloads the cached LLM
        for ctrl in (model_dd, clip_dd, verbose_cb, thread_dd):
            ctrl.change(
                fn=update_llm,
                inputs=[size_dd, model_dd, clip_dd, verbose_cb, thread_dd],
                outputs=[],
            )
# Initial load
update_llm(default, mf[0], cf[0], default_verbose, default_threads)
interval = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
sys_p = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')
usr_p = gr.Textbox(lines=1, value="Analyze the image and determine if there is any person lying on the floor. Respond with exactly YES or NO.", label='User Prompt')
reset_clip = gr.Checkbox(value=False, label="Reset CLIP handler each frame")
cam = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
cap = gr.Textbox(interactive=False, label='Caption')
log_box = gr.Textbox(lines=8, interactive=False, label='Debug Log')
cam.stream(
fn=caption_frame,
inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p, reset_clip, verbose_cb],
outputs=[cap, log_box],
time_limit=600,
)
demo.launch()
if __name__ == '__main__':
main()