SkyNait committed on
Commit e0b2fe7 · verified · 1 Parent(s): caa6ba9

file upload

Files changed (1)
  1. app.py +116 -85
app.py CHANGED
@@ -27,33 +27,51 @@ has_xpu = hasattr(torch, 'xpu') and torch.xpu.is_available()
 def update_model(model_id, device):
     if model_cache['model_id'] != model_id or model_cache['device'] != device:
         logging.info(f'Loading model {model_id} on {device}')
-        processor = AutoProcessor.from_pretrained(model_id)
-        # Load model with appropriate precision for each device
-        if device == 'cuda':
-            # Use bfloat16 for CUDA for performance
-            model = AutoModelForImageTextToText.from_pretrained(
-                model_id,
-                torch_dtype=torch.bfloat16,
-                _attn_implementation='flash_attention_2'
-            ).to('cuda')
-        elif device == 'xpu' and has_xpu:
-            # Use float32 on XPU to avoid bfloat16 layernorm issues
-            model = AutoModelForImageTextToText.from_pretrained(
-                model_id,
-                torch_dtype=torch.float32
-            ).to('xpu')
-        else:
-            # Default to float32 on CPU
-            model = AutoModelForImageTextToText.from_pretrained(model_id).to('cpu')
-        model.eval()
-        model_cache.update({'model_id': model_id, 'processor': processor, 'model': model, 'device': device})

 def extract_frames_from_video(video_path, max_frames=10):
     """Extract frames from video file for processing"""
     cap = cv2.VideoCapture(video_path)
     frames = []
     frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

     # Calculate step size to extract evenly distributed frames
     step = max(1, frame_count // max_frames)
 
@@ -77,71 +95,78 @@ def extract_frames_from_video(video_path, max_frames=10):
 def caption_frame(frame, model_id, interval_ms, sys_prompt, usr_prompt, device):
     """Caption a single frame (used for webcam streaming)"""
     debug_msgs = []
-    update_model(model_id, device)
-    processor = model_cache['processor']
-    model = model_cache['model']

-    # Control capture interval
-    time.sleep(interval_ms / 1000)

-    # Preprocess frame
-    t0 = time.time()
-    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-    pil_img = Image.fromarray(rgb)
-    temp_path = 'frame.jpg'
-    pil_img.save(temp_path, format='JPEG', quality=50)
-    debug_msgs.append(f'Preprocess: {int((time.time()-t0)*1000)} ms')

-    # Prepare multimodal chat messages
-    messages = [
-        {'role': 'system', 'content': [{'type': 'text', 'text': sys_prompt}]},
-        {'role': 'user', 'content': [
-            {'type': 'image', 'url': temp_path},
-            {'type': 'text', 'text': usr_prompt}
-        ]}
-    ]

-    # Tokenize and encode
-    t1 = time.time()
-    inputs = processor.apply_chat_template(
-        messages,
-        add_generation_prompt=True,
-        tokenize=True,
-        return_dict=True,
-        return_tensors='pt'
-    )
-    # Move inputs to correct device and dtype (matching model parameters)
-    param_dtype = next(model.parameters()).dtype
-    cast_inputs = {}
-    for k, v in inputs.items():
-        if isinstance(v, torch.Tensor):
-            if v.dtype.is_floating_point:
-                # cast floating-point tensors to model's parameter dtype
-                cast_inputs[k] = v.to(device=model.device, dtype=param_dtype)
             else:
-                # move integer/mask tensors without changing dtype
-                cast_inputs[k] = v.to(device=model.device)
-        else:
-            cast_inputs[k] = v
-    inputs = cast_inputs
-    debug_msgs.append(f'Tokenize: {int((time.time()-t1)*1000)} ms')

-    # Inference
-    t2 = time.time()
-    outputs = model.generate(**inputs, do_sample=False, max_new_tokens=128)
-    debug_msgs.append(f'Inference: {int((time.time()-t2)*1000)} ms')

-    # Decode and strip history
-    t3 = time.time()
-    raw = processor.batch_decode(outputs, skip_special_tokens=True)[0]
-    debug_msgs.append(f'Decode: {int((time.time()-t3)*1000)} ms')
-    if "Assistant:" in raw:
-        caption = raw.split("Assistant:")[-1].strip()
-    else:
-        lines = raw.splitlines()
-        caption = lines[-1].strip() if len(lines) > 1 else raw.strip()

-    return caption, '\n'.join(debug_msgs)

 @spaces.GPU
 def process_video_file(video_file, model_id, sys_prompt, usr_prompt, device, max_frames):
@@ -150,11 +175,13 @@ def process_video_file(video_file, model_id, sys_prompt, usr_prompt, device, max_frames):
         return "No video file uploaded", ""

     debug_msgs = []
-    update_model(model_id, device)
-    processor = model_cache['processor']
-    model = model_cache['model']

    try:
         # Extract frames from video
         t0 = time.time()
         frames = extract_frames_from_video(video_file, max_frames)
@@ -171,6 +198,7 @@ def process_video_file(video_file, model_id, sys_prompt, usr_prompt, device, max_frames):
             rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             pil_img = Image.fromarray(rgb)
             temp_path = f'frame_{i}.jpg'
             pil_img.save(temp_path, format='JPEG', quality=50)

             # Prepare multimodal chat messages
@@ -216,17 +244,20 @@ def process_video_file(video_file, model_id, sys_prompt, usr_prompt, device, max_frames):
                 caption = lines[-1].strip() if len(lines) > 1 else raw.strip()

             captions.append(f"Frame {i+1}: {caption}")
-
-            # Clean up temp file
-            if os.path.exists(temp_path):
-                os.remove(temp_path)
-
             debug_msgs.append(f'Frame {i+1} processed in {int((time.time()-t1)*1000)} ms')

         return '\n\n'.join(captions), '\n'.join(debug_msgs)

     except Exception as e:
         return f"Error processing video: {str(e)}", '\n'.join(debug_msgs)

 def toggle_input_mode(input_mode):
     """Toggle between webcam and video file input"""
 
 def update_model(model_id, device):
     if model_cache['model_id'] != model_id or model_cache['device'] != device:
         logging.info(f'Loading model {model_id} on {device}')
+        try:
+            processor = AutoProcessor.from_pretrained(model_id)
+            # Load model with appropriate precision for each device
+            if device == 'cuda':
+                # Use bfloat16 for CUDA for performance
+                model = AutoModelForImageTextToText.from_pretrained(
+                    model_id,
+                    torch_dtype=torch.bfloat16,
+                    _attn_implementation='flash_attention_2'
+                ).to('cuda')
+            elif device == 'xpu' and has_xpu:
+                # Use float32 on XPU to avoid bfloat16 layernorm issues
+                model = AutoModelForImageTextToText.from_pretrained(
+                    model_id,
+                    torch_dtype=torch.float32
+                ).to('xpu')
+            else:
+                # Default to float32 on CPU
+                model = AutoModelForImageTextToText.from_pretrained(model_id).to('cpu')
+            model.eval()
+            model_cache.update({'model_id': model_id, 'processor': processor, 'model': model, 'device': device})
+        except Exception as e:
+            logging.error(f'Error loading model: {e}')
+            raise e

 def extract_frames_from_video(video_path, max_frames=10):
     """Extract frames from video file for processing"""
+    if not os.path.exists(video_path):
+        raise FileNotFoundError(f"Video file not found: {video_path}")
+
+    # Validate video file
+    if not video_path.lower().endswith(('.mp4', '.avi', '.mov', '.mkv', '.webm')):
+        raise ValueError("Unsupported video format. Please use MP4, AVI, MOV, MKV, or WEBM.")
+
     cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        raise ValueError(f"Cannot open video file: {video_path}")
+
     frames = []
     frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

+    if frame_count == 0:
+        cap.release()
+        raise ValueError("Video file appears to be empty or corrupted")
+
     # Calculate step size to extract evenly distributed frames
     step = max(1, frame_count // max_frames)
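With these checks in place, callers of extract_frames_from_video are expected to catch the new exceptions instead of receiving a silently empty frame list; a minimal calling sketch (the sample path and max_frames value are illustrative, not part of this commit):

    try:
        frames = extract_frames_from_video('sample_clip.mp4', max_frames=8)
        print(f'Extracted {len(frames)} frames')
    except (FileNotFoundError, ValueError) as err:
        # Missing file, unsupported extension, unreadable or empty video
        print(f'Video rejected: {err}')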
 
 
 def caption_frame(frame, model_id, interval_ms, sys_prompt, usr_prompt, device):
     """Caption a single frame (used for webcam streaming)"""
     debug_msgs = []
+    try:
+        update_model(model_id, device)
+        processor = model_cache['processor']
+        model = model_cache['model']

+        # Control capture interval
+        time.sleep(interval_ms / 1000)

+        # Preprocess frame
+        t0 = time.time()
+        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        pil_img = Image.fromarray(rgb)
+        temp_path = 'frame.jpg'
+        pil_img.save(temp_path, format='JPEG', quality=50)
+        debug_msgs.append(f'Preprocess: {int((time.time()-t0)*1000)} ms')

+        # Prepare multimodal chat messages
+        messages = [
+            {'role': 'system', 'content': [{'type': 'text', 'text': sys_prompt}]},
+            {'role': 'user', 'content': [
+                {'type': 'image', 'url': temp_path},
+                {'type': 'text', 'text': usr_prompt}
+            ]}
+        ]

+        # Tokenize and encode
+        t1 = time.time()
+        inputs = processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors='pt'
+        )
+        # Move inputs to correct device and dtype (matching model parameters)
+        param_dtype = next(model.parameters()).dtype
+        cast_inputs = {}
+        for k, v in inputs.items():
+            if isinstance(v, torch.Tensor):
+                if v.dtype.is_floating_point:
+                    # cast floating-point tensors to model's parameter dtype
+                    cast_inputs[k] = v.to(device=model.device, dtype=param_dtype)
+                else:
+                    # move integer/mask tensors without changing dtype
+                    cast_inputs[k] = v.to(device=model.device)
             else:
+                cast_inputs[k] = v
+        inputs = cast_inputs
+        debug_msgs.append(f'Tokenize: {int((time.time()-t1)*1000)} ms')

+        # Inference
+        t2 = time.time()
+        outputs = model.generate(**inputs, do_sample=False, max_new_tokens=128)
+        debug_msgs.append(f'Inference: {int((time.time()-t2)*1000)} ms')

+        # Decode and strip history
+        t3 = time.time()
+        raw = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+        debug_msgs.append(f'Decode: {int((time.time()-t3)*1000)} ms')
+        if "Assistant:" in raw:
+            caption = raw.split("Assistant:")[-1].strip()
+        else:
+            lines = raw.splitlines()
+            caption = lines[-1].strip() if len(lines) > 1 else raw.strip()

+        # Clean up temp file
+        if os.path.exists(temp_path):
+            os.remove(temp_path)
+
+        return caption, '\n'.join(debug_msgs)
+    except Exception as e:
+        return f"Error: {str(e)}", '\n'.join(debug_msgs)

 @spaces.GPU
 def process_video_file(video_file, model_id, sys_prompt, usr_prompt, device, max_frames):
 
         return "No video file uploaded", ""

     debug_msgs = []
+    temp_files = []  # Track temporary files for cleanup

     try:
+        update_model(model_id, device)
+        processor = model_cache['processor']
+        model = model_cache['model']
+
         # Extract frames from video
         t0 = time.time()
         frames = extract_frames_from_video(video_file, max_frames)
 
             rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             pil_img = Image.fromarray(rgb)
             temp_path = f'frame_{i}.jpg'
+            temp_files.append(temp_path)  # Track for cleanup
             pil_img.save(temp_path, format='JPEG', quality=50)

             # Prepare multimodal chat messages
 
                 caption = lines[-1].strip() if len(lines) > 1 else raw.strip()

             captions.append(f"Frame {i+1}: {caption}")
             debug_msgs.append(f'Frame {i+1} processed in {int((time.time()-t1)*1000)} ms')

         return '\n\n'.join(captions), '\n'.join(debug_msgs)

     except Exception as e:
         return f"Error processing video: {str(e)}", '\n'.join(debug_msgs)
+    finally:
+        # Clean up all temporary files
+        for temp_file in temp_files:
+            if os.path.exists(temp_file):
+                try:
+                    os.remove(temp_file)
+                except Exception as cleanup_error:
+                    logging.warning(f"Failed to cleanup {temp_file}: {cleanup_error}")

 def toggle_input_mode(input_mode):
     """Toggle between webcam and video file input"""