import time
import logging
import gradio as gr
import cv2
import os
from pathlib import Path
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler
import base64
import gc
import io
from contextlib import redirect_stdout, redirect_stderr
import sys
import llama_cpp

# ----------------------------------------
# Model configurations: per-size prefixes and repos
MODELS = {
    "256M": {
        "model_repo": "mradermacher/SmolVLM2-256M-Video-Instruct-GGUF",
        "clip_repo": "ggml-org/SmolVLM2-256M-Video-Instruct-GGUF",
        "model_prefix": "SmolVLM2-256M-Video-Instruct",
        "clip_prefix": "mmproj-SmolVLM2-256M-Video-Instruct",
        "model_variants": ["f16", "Q8_0", "Q2_K", "Q4_K_M"],
        "clip_variants": ["Q8_0", "f16"],
    },
    "500M": {
        "model_repo": "mradermacher/SmolVLM2-500M-Video-Instruct-GGUF",
        "clip_repo": "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF",
        "model_prefix": "SmolVLM2-500M-Video-Instruct",
        "clip_prefix": "mmproj-SmolVLM2-500M-Video-Instruct",
        "model_variants": ["f16", "Q4_K_M", "Q8_0", "Q2_K"],
        "clip_variants": ["Q8_0", "f16"],
    },
    "2.2B": {
        "model_repo": "mradermacher/SmolVLM2-2.2B-Instruct-GGUF",
        "clip_repo": "ggml-org/SmolVLM2-2.2B-Instruct-GGUF",
        "model_prefix": "SmolVLM2-2.2B-Instruct",
        "clip_prefix": "mmproj-SmolVLM2-2.2B-Instruct",
        "model_variants": ["f16", "Q4_K_M", "Q8_0", "Q2_K"],
        "clip_variants": ["Q8_0", "f16"],
    },
}

# ----------------------------------------
# Cache for the currently loaded model instance
model_cache = {
    'size': None,
    'model_file': None,
    'clip_file': None,
    'model_path': None,
    'clip_path': None,
    'verbose': None,
    'n_threads': None,
    'llm': None,
}


# Helper to download weights and return their local cache paths
def ensure_weights(cfg, model_file, clip_file):
    # Download decoder and CLIP projector into the HF cache (writable, e.g. /tmp/.cache)
    model_path = hf_hub_download(repo_id=cfg['model_repo'], filename=model_file)
    clip_path = hf_hub_download(repo_id=cfg['clip_repo'], filename=clip_file)
    return model_path, clip_path


# Custom chat handler with the SmolVLM2 prompt template
class SmolVLM2ChatHandler(Llava15ChatHandler):
    CHAT_FORMAT = (
        "<|im_start|>"
        "{% for message in messages %}"
        "{{ message['role'] | capitalize }}"
        "{% if message['role']=='user' and message['content'][0]['type']=='image_url' %}:"
        "{% else %}: "
        "{% endif %}"
        "{% for content in message['content'] %}"
        "{% if content['type']=='text' %}{{ content['text'] }}"
        "{% elif content['type']=='image_url' %}"
        "{% if content['image_url'] is string %}"
        "{{ content['image_url'] }}\n"
        "{% elif content['image_url'] is mapping %}"
        "{{ content['image_url']['url'] }}\n"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "\n"
        "{% endfor %}"
        "{% if add_generation_prompt %}Assistant:{% endif %}"
    )


# Load and cache the LLM (only on dropdown, verbose, or thread change)
def update_llm(size, model_file, clip_file, verbose_mode, n_threads):
    # Only reload if any of the cache-key parameters changed.
    # Compare against the requested filenames, not the resolved paths, so an
    # unchanged selection does not trigger a redundant reload.
    requested = (size, model_file, clip_file, verbose_mode, n_threads)
    cached = (model_cache['size'], model_cache['model_file'], model_cache['clip_file'],
              model_cache['verbose'], model_cache['n_threads'])
    if requested != cached:
        model_path, clip_path = ensure_weights(MODELS[size], model_file, clip_file)
        handler = SmolVLM2ChatHandler(clip_model_path=clip_path, verbose=verbose_mode)
        llm = Llama(
            model_path=model_path,
            chat_handler=handler,
            n_ctx=512,
            verbose=verbose_mode,
            n_threads=n_threads,
            use_mlock=True,
        )
        # Cache the requested filenames (comparison key) plus the resolved local paths
        model_cache.update({'size': size, 'model_file': model_file, 'clip_file': clip_file,
                            'model_path': model_path, 'clip_path': clip_path,
                            'verbose': verbose_mode, 'n_threads': n_threads, 'llm': llm})
    return None


# Build weight filename lists for a given model size
def get_weight_files(size):
    cfg = MODELS[size]
    model_files = [f"{cfg['model_prefix']}.{v}.gguf" for v in cfg['model_variants']]
    clip_files = [f"{cfg['clip_prefix']}-{v}.gguf" for v in cfg['clip_variants']]
    return model_files, clip_files


# Caption a frame with the cached LLM, collecting real-time debug logs
def caption_frame(frame, size, model_file, clip_file, interval_ms, sys_prompt,
                  usr_prompt, reset_clip, verbose_mode):
    debug_msgs = []
    timestamp = time.strftime('%H:%M:%S')
    debug_msgs.append(f"[{timestamp}] Verbose mode: {verbose_mode}")

    timestamp = time.strftime('%H:%M:%S')
    debug_msgs.append(f"[{timestamp}] llama_cpp version: {llama_cpp.__version__}")
    debug_msgs.append(f"[{timestamp}] Python version: {sys.version.split()[0]}")

    timestamp = time.strftime('%H:%M:%S')
    debug_msgs.append(f"[{timestamp}] Received frame shape: {frame.shape}")

    timestamp = time.strftime('%H:%M:%S')
    debug_msgs.append(f"[{timestamp}] Using model weights: {model_file}")
    debug_msgs.append(f"[{timestamp}] Using CLIP weights: {clip_file}")

    t_resize = time.time()
    img = cv2.resize(frame.copy(), (384, 384))
    elapsed = (time.time() - t_resize) * 1000
    timestamp = time.strftime('%H:%M:%S')
    debug_msgs.append(f"[{timestamp}] Resized to 384x384 in {elapsed:.1f} ms")

    timestamp = time.strftime('%H:%M:%S')
    debug_msgs.append(f"[{timestamp}] Sleeping for {interval_ms} ms")
    time.sleep(interval_ms / 1000)

    t_enc = time.time()
    params = [int(cv2.IMWRITE_JPEG_QUALITY), 75]
    # Gradio delivers RGB frames; convert to BGR so cv2.imencode writes correct colors
    success, jpeg = cv2.imencode('.jpg', cv2.cvtColor(img, cv2.COLOR_RGB2BGR), params)
    elapsed = (time.time() - t_enc) * 1000
    timestamp = time.strftime('%H:%M:%S')
    debug_msgs.append(f"[{timestamp}] JPEG encode: success={success}, bytes={len(jpeg)} in {elapsed:.1f} ms")

    uri = 'data:image/jpeg;base64,' + base64.b64encode(jpeg.tobytes()).decode()
    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": [
            {"type": "image_url", "image_url": uri},
            {"type": "text", "text": usr_prompt},
        ]},
    ]

    timestamp = time.strftime('%H:%M:%S')
    debug_msgs.append(f"[{timestamp}] Sending prompt of length {len(usr_prompt)} to LLM")

    if reset_clip:
        # Rebuild the chat handler from the cached local CLIP path
        model_cache['llm'].chat_handler = SmolVLM2ChatHandler(
            clip_model_path=model_cache['clip_path'], verbose=verbose_mode)
        timestamp = time.strftime('%H:%M:%S')
        debug_msgs.append(f"[{timestamp}] Reinitialized chat handler")

    timestamp = time.strftime('%H:%M:%S')
    debug_msgs.append(f"[{timestamp}] CPU count = {os.cpu_count()}")
    if model_cache.get('n_threads') is not None:
        debug_msgs.append(f"[{timestamp}] llama_cpp n_threads = {model_cache['n_threads']}")

    t_start = time.time()
    buf = io.StringIO()
    # Capture llama.cpp's stdout/stderr so it can be surfaced in the debug log
    with redirect_stdout(buf), redirect_stderr(buf):
        resp = model_cache['llm'].create_chat_completion(
            messages=messages,
            max_tokens=128,
            temperature=0.1,
            stop=["<end_of_utterance>"],  # SmolVLM2 end-of-turn token
        )
    for line in buf.getvalue().splitlines():
        timestamp = time.strftime('%H:%M:%S')
        debug_msgs.append(f"[{timestamp}] {line}")
    elapsed = (time.time() - t_start) * 1000
    timestamp = time.strftime('%H:%M:%S')
    debug_msgs.append(f"[{timestamp}] LLM response in {elapsed:.1f} ms")

    content = resp.get('choices', [{}])[0].get('message', {}).get('content', '').strip()
    timestamp = time.strftime('%H:%M:%S')
    debug_msgs.append(f"[{timestamp}] Caption length: {len(content)} chars")

    gc.collect()
    timestamp = time.strftime('%H:%M:%S')
    debug_msgs.append(f"[{timestamp}] Garbage collected")

    return content, "\n".join(debug_msgs)


# Gradio UI
def main():
    logging.basicConfig(level=logging.INFO)
    default = '500M'
    default_verbose = True
    default_threads = 2
    mf, cf = get_weight_files(default)

    with gr.Blocks() as demo:
        gr.Markdown("## 🎥 Real-Time Camera Captioning with Debug Logs")

        with gr.Row():
            size_dd = gr.Dropdown(list(MODELS.keys()), value=default, label='Model Size')
            model_dd = gr.Dropdown(mf, value=mf[0], label='Decoder Weights')
            clip_dd = gr.Dropdown(cf, value=cf[0], label='CLIP Weights')
            verbose_cb = gr.Checkbox(value=default_verbose, label='Verbose Mode')
            thread_dd = gr.Slider(minimum=1, maximum=os.cpu_count(), step=1,
                                  value=default_threads, label='CPU Threads (n_threads)')

        def on_size_change(sz, verbose, n_threads):
            mlist, clist = get_weight_files(sz)
            update_llm(sz, mlist[0], clist[0], verbose, n_threads)
            return gr.update(choices=mlist, value=mlist[0]), gr.update(choices=clist, value=clist[0])

        size_dd.change(
            fn=on_size_change,
            inputs=[size_dd, verbose_cb, thread_dd],
            outputs=[model_dd, clip_dd],
        )
        model_dd.change(
            fn=lambda sz, mf, cf, verbose, n_threads: update_llm(sz, mf, cf, verbose, n_threads),
            inputs=[size_dd, model_dd, clip_dd, verbose_cb, thread_dd],
            outputs=[],
        )
        clip_dd.change(
            fn=lambda sz, mf, cf, verbose, n_threads: update_llm(sz, mf, cf, verbose, n_threads),
            inputs=[size_dd, model_dd, clip_dd, verbose_cb, thread_dd],
            outputs=[],
        )
        verbose_cb.change(
            fn=lambda sz, mf, cf, verbose, n_threads: update_llm(sz, mf, cf, verbose, n_threads),
            inputs=[size_dd, model_dd, clip_dd, verbose_cb, thread_dd],
            outputs=[],
        )
        thread_dd.change(
            fn=lambda sz, mf, cf, verbose, n_threads: update_llm(sz, mf, cf, verbose, n_threads),
            inputs=[size_dd, model_dd, clip_dd, verbose_cb, thread_dd],
            outputs=[],
        )

        # Initial load
        update_llm(default, mf[0], cf[0], default_verbose, default_threads)

        interval = gr.Slider(100, 20000, step=100, value=3000, label='Interval (ms)')
        sys_p = gr.Textbox(lines=2, value="Focus on key dramatic action…", label='System Prompt')
        usr_p = gr.Textbox(lines=1,
                           value="Analyze the image and determine if there is any person lying on the floor. Respond with exactly YES or NO.",
                           label='User Prompt')
        reset_clip = gr.Checkbox(value=False, label="Reset CLIP handler each frame")

        cam = gr.Image(sources=['webcam'], streaming=True, label='Webcam Feed')
        cap = gr.Textbox(interactive=False, label='Caption')
        log_box = gr.Textbox(lines=8, interactive=False, label='Debug Log')

        cam.stream(
            fn=caption_frame,
            inputs=[cam, size_dd, model_dd, clip_dd, interval, sys_p, usr_p, reset_clip, verbose_cb],
            outputs=[cap, log_box],
            time_limit=600,
        )

    demo.launch()


if __name__ == '__main__':
    main()