#!/usr/bin/env python3
import argparse
import os
import subprocess
from datetime import datetime

import cv2
import numpy as np
import torch
from PIL import Image
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

# Constants
TEST_MODE_DURATION = 3  # Process only the first 3 seconds in test mode
FFMPEG_PRESETS = [
    "ultrafast",
    "superfast",
    "veryfast",
    "faster",
    "fast",
    "medium",
    "slow",
    "slower",
    "veryslow",
]
FONT = cv2.FONT_HERSHEY_SIMPLEX  # Font for bounding-box-style labels

# Detection parameters
IOU_THRESHOLD = 0.5  # IoU threshold for considering boxes related

# Hitmarker parameters
HITMARKER_SIZE = 20  # Size of the hitmarker in pixels
HITMARKER_GAP = 3  # Size of the empty space in the middle
HITMARKER_THICKNESS = 2  # Thickness of hitmarker lines
HITMARKER_COLOR = (255, 255, 255)  # White color for the hitmarker
HITMARKER_SHADOW_COLOR = (80, 80, 80)  # Light gray for the shadow effect
HITMARKER_SHADOW_OFFSET = 1  # Shadow offset in pixels


def load_moondream():
    """Load the Moondream model and tokenizer."""
    model = AutoModelForCausalLM.from_pretrained(
        "vikhyatk/moondream2", trust_remote_code=True, device_map={"": "cuda"}
    )
    tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
    return model, tokenizer


def get_video_properties(video_path):
    """Get basic video properties."""
    video = cv2.VideoCapture(video_path)
    fps = video.get(cv2.CAP_PROP_FPS)
    frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
    video.release()
    return {"fps": fps, "frame_count": frame_count, "width": width, "height": height}


def is_valid_box(box):
    """Check whether normalized box coordinates are reasonable."""
    x1, y1, x2, y2 = box
    width = x2 - x1
    height = y2 - y1
    # Reject boxes that are too large (over 90% of the frame in both dimensions)
    if width > 0.9 and height > 0.9:
        return False
    # Reject boxes that are too small (less than 1% of the frame)
    if width < 0.01 or height < 0.01:
        return False
    return True


def split_frame_into_tiles(frame, rows, cols):
    """Split a frame into a grid of tiles."""
    height, width = frame.shape[:2]
    tile_height = height // rows
    tile_width = width // cols
    tiles = []
    tile_positions = []

    for i in range(rows):
        for j in range(cols):
            # The last row/column absorbs any remainder from integer division
            y1 = i * tile_height
            y2 = (i + 1) * tile_height if i < rows - 1 else height
            x1 = j * tile_width
            x2 = (j + 1) * tile_width if j < cols - 1 else width
            tile = frame[y1:y2, x1:x2]
            tiles.append(tile)
            tile_positions.append((x1, y1, x2, y2))

    return tiles, tile_positions


def convert_tile_coords_to_frame(box, tile_pos, frame_shape):
    """Convert normalized coordinates from tile space to frame space."""
    frame_height, frame_width = frame_shape[:2]
    tile_x1, tile_y1, tile_x2, tile_y2 = tile_pos
    tile_width = tile_x2 - tile_x1
    tile_height = tile_y2 - tile_y1

    # Scale normalized tile coordinates to absolute pixels within the tile
    x1_tile_abs = box[0] * tile_width
    y1_tile_abs = box[1] * tile_height
    x2_tile_abs = box[2] * tile_width
    y2_tile_abs = box[3] * tile_height

    # Offset by the tile's position to get absolute frame coordinates
    x1_frame_abs = tile_x1 + x1_tile_abs
    y1_frame_abs = tile_y1 + y1_tile_abs
    x2_frame_abs = tile_x1 + x2_tile_abs
    y2_frame_abs = tile_y1 + y2_tile_abs

    # Re-normalize against the full frame and clamp to [0, 1]
    x1_norm = max(0.0, min(1.0, x1_frame_abs / frame_width))
    y1_norm = max(0.0, min(1.0, y1_frame_abs / frame_height))
    x2_norm = max(0.0, min(1.0, x2_frame_abs / frame_width))
    y2_norm = max(0.0, min(1.0, y2_frame_abs / frame_height))

    return [x1_norm, y1_norm, x2_norm, y2_norm]
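
# Worked example (a sketch, not executed): with a 2x2 grid on a 640x480 frame,
# the top-left tile spans pixels (0, 0)-(320, 240). A detection of
# [0.5, 0.5, 1.0, 1.0] in that tile covers pixels (160, 120)-(320, 240),
# which convert_tile_coords_to_frame() re-normalizes against the full frame:
#
#   box = convert_tile_coords_to_frame(
#       [0.5, 0.5, 1.0, 1.0], (0, 0, 320, 240), (480, 640, 3)
#   )
#   # box == [0.25, 0.25, 0.5, 0.5]
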

def merge_tile_detections(tile_detections, iou_threshold=IOU_THRESHOLD):
    """Merge detections from different tiles using an NMS-like approach."""
    if not tile_detections:
        return []

    all_boxes = []
    all_keywords = []

    # Collect all boxes and their keywords
    for detections in tile_detections:
        for box, keyword in detections:
            all_boxes.append(box)
            all_keywords.append(keyword)

    if not all_boxes:
        return []

    # Convert to numpy for easier processing
    boxes = np.array(all_boxes)

    # Calculate areas
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)

    # Sort boxes by area, largest first
    order = areas.argsort()[::-1]

    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)

        if order.size == 1:
            break

        # Calculate IoU of the kept box with the remaining boxes
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w = np.maximum(0.0, xx2 - xx1)
        h = np.maximum(0.0, yy2 - yy1)
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)

        # Keep only boxes whose IoU with the current box is below the threshold
        inds = np.where(ovr <= iou_threshold)[0]
        order = order[inds + 1]

    return [(all_boxes[i], all_keywords[i]) for i in keep]


def detect_ads_in_frame(model, tokenizer, image, detect_keyword, rows=1, cols=1):
    """Detect objects in a frame using grid-based detection."""
    if rows == 1 and cols == 1:
        return detect_ads_in_frame_single(model, tokenizer, image, detect_keyword)

    # Ensure we have an RGB numpy array for tiling (OpenCV frames arrive as BGR)
    if isinstance(image, Image.Image):
        image = np.array(image)
    else:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Split the frame into tiles
    tiles, tile_positions = split_frame_into_tiles(image, rows, cols)

    # Process each tile
    tile_detections = []
    for tile, tile_pos in zip(tiles, tile_positions):
        # Convert the tile to a PIL Image
        tile_pil = Image.fromarray(tile)

        # Detect objects in the tile
        response = model.detect(tile_pil, detect_keyword)

        if response and "objects" in response and response["objects"]:
            objects = response["objects"]
            tile_objects = []
            for obj in objects:
                if all(k in obj for k in ["x_min", "y_min", "x_max", "y_max"]):
                    box = [obj["x_min"], obj["y_min"], obj["x_max"], obj["y_max"]]
                    if is_valid_box(box):
                        # Convert tile coordinates to frame coordinates
                        frame_box = convert_tile_coords_to_frame(
                            box, tile_pos, image.shape
                        )
                        tile_objects.append((frame_box, detect_keyword))
            if tile_objects:  # Only append if we found valid objects
                tile_detections.append(tile_objects)

    # Merge detections from all tiles
    merged_detections = merge_tile_detections(tile_detections)
    return merged_detections


def detect_ads_in_frame_single(model, tokenizer, image, detect_keyword):
    """Detect objects in a full frame with a single model pass (no tiling)."""
    detected_objects = []

    # Convert the numpy array to a PIL Image if needed
    if not isinstance(image, Image.Image):
        image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    # Detect objects
    response = model.detect(image, detect_keyword)

    # Check if we have valid objects
    if response and "objects" in response and response["objects"]:
        objects = response["objects"]
        for obj in objects:
            if all(k in obj for k in ["x_min", "y_min", "x_max", "y_max"]):
                box = [obj["x_min"], obj["y_min"], obj["x_max"], obj["y_max"]]
                # If the box is valid (not full-frame), add it
                if is_valid_box(box):
                    detected_objects.append((box, detect_keyword))

    return detected_objects
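
# A minimal sketch of exercising the detection helpers on a single image
# ("sample.jpg" and the keyword are hypothetical; adjust to taste):
#
#   model, tokenizer = load_moondream()
#   frame = cv2.imread("sample.jpg")
#   detections = detect_ads_in_frame(model, tokenizer, frame, "logo", rows=2, cols=2)
#   for box, keyword in detections:
#       print(keyword, box)  # normalized [x_min, y_min, x_max, y_max]
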

def draw_hitmarker(
    frame, center_x, center_y, size=HITMARKER_SIZE, color=HITMARKER_COLOR, shadow=True
):
    """Draw a COD-style hitmarker cross with an open space in the middle.

    Each of the four arms runs from an outer corner toward the center,
    stopping HITMARKER_GAP pixels short so the middle stays open.
    """
    half_size = size // 2
    # Sign pairs for the top-left, top-right, bottom-left, and bottom-right arms
    arms = ((-1, -1), (1, -1), (-1, 1), (1, 1))

    # Draw the shadow first if enabled, offset down-right from the marker
    if shadow:
        for sx, sy in arms:
            cv2.line(
                frame,
                (
                    center_x + sx * half_size + HITMARKER_SHADOW_OFFSET,
                    center_y + sy * half_size + HITMARKER_SHADOW_OFFSET,
                ),
                (
                    center_x + sx * HITMARKER_GAP + HITMARKER_SHADOW_OFFSET,
                    center_y + sy * HITMARKER_GAP + HITMARKER_SHADOW_OFFSET,
                ),
                HITMARKER_SHADOW_COLOR,
                HITMARKER_THICKNESS,
            )

    # Draw the main hitmarker on top of the shadow
    for sx, sy in arms:
        cv2.line(
            frame,
            (center_x + sx * half_size, center_y + sy * half_size),
            (center_x + sx * HITMARKER_GAP, center_y + sy * HITMARKER_GAP),
            color,
            HITMARKER_THICKNESS,
        )
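
# With the defaults (size 20, gap 3), each arm spans from 10 px out to 3 px
# from the center. A quick way to preview the geometry on a blank canvas
# (a sketch only, not part of the pipeline):
#
#   canvas = np.zeros((100, 100, 3), dtype=np.uint8)
#   draw_hitmarker(canvas, 50, 50)
#   cv2.imwrite("hitmarker_preview.png", canvas)
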

def draw_ad_boxes(frame, detected_objects, detect_keyword, box_style="censor"):
    """Draw detection visualizations over detected objects.

    Args:
        frame: The video frame to draw on
        detected_objects: List of (box, keyword) tuples
        detect_keyword: The detection keyword
        box_style: Visualization style ('censor', 'bounding-box', or 'hitmarker')
    """
    height, width = frame.shape[:2]

    for box, keyword in detected_objects:
        try:
            # Convert normalized coordinates to pixel coordinates
            x1 = int(box[0] * width)
            y1 = int(box[1] * height)
            x2 = int(box[2] * width)
            y2 = int(box[3] * height)

            # Ensure coordinates are within frame boundaries
            x1 = max(0, min(x1, width - 1))
            y1 = max(0, min(y1, height - 1))
            x2 = max(0, min(x2, width - 1))
            y2 = max(0, min(y2, height - 1))

            # Only draw if the box has a reasonable size
            if x2 > x1 and y2 > y1:
                if box_style == "censor":
                    # Draw a solid black rectangle
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 0), -1)
                elif box_style == "bounding-box":
                    # Draw a red rectangle with a thicker line
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 3)

                    # Add a label with a background
                    label = detect_keyword  # Use exact capitalization
                    label_size = cv2.getTextSize(label, FONT, 0.7, 2)[0]
                    cv2.rectangle(
                        frame, (x1, y1 - 25), (x1 + label_size[0], y1), (0, 0, 255), -1
                    )
                    cv2.putText(
                        frame,
                        label,
                        (x1, y1 - 6),
                        FONT,
                        0.7,
                        (255, 255, 255),
                        2,
                        cv2.LINE_AA,
                    )
                elif box_style == "hitmarker":
                    # Calculate the center of the box
                    center_x = (x1 + x2) // 2
                    center_y = (y1 + y2) // 2

                    # Draw a hitmarker at the center
                    draw_hitmarker(frame, center_x, center_y)

                    # Add a small label above the hitmarker
                    label = detect_keyword  # Use exact capitalization
                    label_size = cv2.getTextSize(label, FONT, 0.5, 1)[0]
                    cv2.putText(
                        frame,
                        label,
                        (center_x - label_size[0] // 2, center_y - HITMARKER_SIZE - 5),
                        FONT,
                        0.5,
                        HITMARKER_COLOR,
                        1,
                        cv2.LINE_AA,
                    )
        except Exception as e:
            print(f"Error drawing {box_style} style box: {str(e)}")

    return frame
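
# Sketch of rendering a single normalized detection onto a frame
# ("frame.png" is a hypothetical test image):
#
#   frame = cv2.imread("frame.png")
#   detections = [([0.1, 0.2, 0.4, 0.6], "face")]
#   frame = draw_ad_boxes(frame, detections, "face", box_style="bounding-box")
#   cv2.imwrite("frame_annotated.png", frame)
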

def filter_temporal_outliers(detections_dict):
    """Filter out extremely large detections that take up most of the frame.

    Only keeps detections that are reasonable in size.

    Args:
        detections_dict: Dictionary of {frame_number: [(box, keyword), ...]}
    """
    filtered_detections = {}

    for t, detections in detections_dict.items():
        # Only keep detections that aren't too large
        valid_detections = []
        for box, keyword in detections:
            # Calculate the box area as a fraction of the frame
            width = box[2] - box[0]
            height = box[3] - box[1]
            area = width * height

            # If the box covers less than 90% of the frame, keep it
            if area < 0.9:
                valid_detections.append((box, keyword))

        if valid_detections:
            filtered_detections[t] = valid_detections

    return filtered_detections


def describe_frames(
    video_path, model, tokenizer, detect_keyword, test_mode=False, rows=1, cols=1
):
    """Extract frames from the video and run detection on each one."""
    props = get_video_properties(video_path)
    fps = props["fps"]

    # If in test mode, only process the first TEST_MODE_DURATION seconds
    if test_mode:
        frame_count = min(int(fps * TEST_MODE_DURATION), props["frame_count"])
    else:
        frame_count = props["frame_count"]

    ad_detections = {}  # Store detection results by frame number

    print("Extracting frames and detecting objects...")
    video = cv2.VideoCapture(video_path)

    # Process every frame
    frame_count_processed = 0
    with tqdm(total=frame_count) as pbar:
        while frame_count_processed < frame_count:
            ret, frame = video.read()
            if not ret:
                break

            # Detect objects in the frame
            detected_objects = detect_ads_in_frame(
                model, tokenizer, frame, detect_keyword, rows=rows, cols=cols
            )

            # Store results for every frame, even if empty
            ad_detections[frame_count_processed] = detected_objects

            frame_count_processed += 1
            pbar.update(1)

    video.release()

    if frame_count_processed == 0:
        print("No frames could be read from video")
        return {}

    # Filter out only extremely large detections
    ad_detections = filter_temporal_outliers(ad_detections)
    return ad_detections
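
# The mapping returned by describe_frames() is
# {frame_index: [([x1, y1, x2, y2], keyword), ...]} with normalized
# coordinates, so it can be cached between runs to skip re-detection.
# A sketch, assuming a writable working directory and plain-Python box values
# ("inputs/clip.mp4" is hypothetical):
#
#   import json
#   ad_detections = describe_frames("inputs/clip.mp4", model, tokenizer, "face")
#   with open("detections.json", "w") as f:
#       json.dump({str(k): v for k, v in ad_detections.items()}, f)
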

def create_detection_video(
    video_path,
    ad_detections,
    detect_keyword,
    output_path=None,
    ffmpeg_preset="medium",
    test_mode=False,
    box_style="censor",
):
    """Create a video with detection boxes drawn in."""
    if output_path is None:
        # Create the outputs directory if it doesn't exist
        outputs_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "outputs"
        )
        os.makedirs(outputs_dir, exist_ok=True)

        # Clean the detect_keyword for use in a filename
        safe_keyword = "".join(
            x for x in detect_keyword if x.isalnum() or x in (" ", "_", "-")
        )
        safe_keyword = safe_keyword.replace(" ", "_")

        # Create the output filename
        base_name = os.path.splitext(os.path.basename(video_path))[0]
        output_path = os.path.join(
            outputs_dir, f"{box_style}_{safe_keyword}_{base_name}.mp4"
        )

    print(f"Will save output to: {output_path}")

    props = get_video_properties(video_path)
    fps, width, height = props["fps"], props["width"], props["height"]

    # If in test mode, only process the first TEST_MODE_DURATION seconds
    if test_mode:
        frame_count = min(int(fps * TEST_MODE_DURATION), props["frame_count"])
    else:
        frame_count = props["frame_count"]

    video = cv2.VideoCapture(video_path)

    # Create a temp output path by adding _temp before the extension
    base, ext = os.path.splitext(output_path)
    temp_output = f"{base}_temp{ext}"
    out = cv2.VideoWriter(
        temp_output, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height)
    )

    print("Creating detection video...")
    frame_count_processed = 0
    with tqdm(total=frame_count) as pbar:
        while frame_count_processed < frame_count:
            ret, frame = video.read()
            if not ret:
                break

            # Get detections for this exact frame
            if frame_count_processed in ad_detections:
                current_detections = ad_detections[frame_count_processed]
                if current_detections:
                    frame = draw_ad_boxes(
                        frame, current_detections, detect_keyword, box_style=box_style
                    )

            out.write(frame)
            frame_count_processed += 1
            pbar.update(1)

    video.release()
    out.release()

    # Re-encode to a web-compatible format
    try:
        subprocess.run(
            [
                "ffmpeg",
                "-y",
                "-i",
                temp_output,
                "-c:v",
                "libx264",
                "-preset",
                ffmpeg_preset,
                "-crf",
                "23",
                "-movflags",
                "+faststart",  # Better web playback
                "-loglevel",
                "error",
                output_path,
            ],
            check=True,
        )
        os.remove(temp_output)  # Remove the temporary file

        if not os.path.exists(output_path):
            print(
                f"Warning: FFmpeg completed but output file not found at {output_path}"
            )
            return None

        return output_path
    except subprocess.CalledProcessError as e:
        print(f"Error running FFmpeg: {str(e)}")
        if os.path.exists(temp_output):
            os.remove(temp_output)
        return None


def process_video(
    video_path,
    detect_keyword,
    test_mode=False,
    ffmpeg_preset="medium",
    rows=1,
    cols=1,
    box_style="censor",
):
    """Process a single video file."""
    print(f"\nProcessing: {video_path}")
    print(f"Looking for: {detect_keyword}")

    # Load the model
    print("Loading Moondream model...")
    model, tokenizer = load_moondream()

    # Process the video - detect objects
    ad_detections = describe_frames(
        video_path, model, tokenizer, detect_keyword, test_mode, rows, cols
    )

    # Create a video with detection boxes
    output_path = create_detection_video(
        video_path,
        ad_detections,
        detect_keyword,
        ffmpeg_preset=ffmpeg_preset,
        test_mode=test_mode,
        box_style=box_style,
    )

    if output_path is None:
        print("\nError: Failed to create output video")
        return None

    print(f"\nOutput saved to: {output_path}")
    return output_path


def main():
    """Process all videos in the inputs directory."""
    parser = argparse.ArgumentParser(
        description="Detect objects in videos using Moondream2"
    )
    parser.add_argument(
        "--test",
        action="store_true",
        help="Process only the first 3 seconds of each video",
    )
    parser.add_argument(
        "--preset",
        choices=FFMPEG_PRESETS,
        default="medium",
        help="FFmpeg encoding preset (default: medium). Faster presets produce larger files at the same CRF",
    )
    parser.add_argument(
        "--detect",
        type=str,
        default="face",
        help='Object to detect in the video (default: face; use --detect "thing to detect" to override)',
    )
    parser.add_argument(
        "--rows",
        type=int,
        default=1,
        help="Number of rows to split each frame into (default: 1)",
    )
    parser.add_argument(
        "--cols",
        type=int,
        default=1,
        help="Number of columns to split each frame into (default: 1)",
    )
    parser.add_argument(
        "--box-style",
        choices=["censor", "bounding-box", "hitmarker"],
        default="censor",
        help="Style of detection visualization (default: censor)",
    )
    args = parser.parse_args()

    input_dir = "inputs"
    os.makedirs(input_dir, exist_ok=True)
    os.makedirs("outputs", exist_ok=True)

    video_files = [
        f
        for f in os.listdir(input_dir)
        if f.lower().endswith((".mp4", ".avi", ".mov", ".mkv", ".webm"))
    ]

    if not video_files:
        print("No video files found in 'inputs' directory")
        return

    print(f"Found {len(video_files)} videos to process")
    print(f"Will detect: {args.detect}")
    if args.test:
        print("Running in test mode - processing only first 3 seconds of each video")
    print(f"Using FFmpeg preset: {args.preset}")
    print(f"Grid size: {args.rows}x{args.cols}")
    print(f"Box style: {args.box_style}")

    success_count = 0
    for video_file in video_files:
        video_path = os.path.join(input_dir, video_file)
        output_path = process_video(
            video_path,
            args.detect,
            test_mode=args.test,
            ffmpeg_preset=args.preset,
            rows=args.rows,
            cols=args.cols,
            box_style=args.box_style,
        )
        if output_path:
            success_count += 1

    print(
        f"\nProcessing complete. Successfully processed {success_count} out of "
        f"{len(video_files)} videos."
    )


if __name__ == "__main__":
    main()
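
# Example invocation (the script name is illustrative; run whatever this file
# is saved as, with videos placed in ./inputs):
#
#   python detect.py --detect "logo" --box-style bounding-box --rows 2 --cols 2 --test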