import os
import cv2
import fitz  # PyMuPDF
import tempfile
from PIL import Image
from loguru import logger
# Constants
MAX_VIDEO_SIZE = 100 * 1024 * 1024 # 100 MB
MAX_IMAGE_SIZE = 10 * 1024 * 1024 # 10 MB
PRESET_PROMPTS = {
"General Assistant": "You are a helpful AI assistant capable of analyzing images, videos, and PDF documents. Provide clear, accurate, and helpful responses to user queries.",
"Document Analyzer": "You are a specialized document analysis assistant. Focus on extracting key information, summarizing content, and answering specific questions about uploaded documents. For PDFs, provide structured analysis including main topics, key points, and relevant details. For images containing text, perform OCR-like analysis.",
"Visual Content Expert": "You are an expert in visual content analysis. When analyzing images, provide detailed descriptions of visual elements, composition, colors, objects, people, and scenes. For videos, describe the sequence of events, movements, and changes between frames. Identify artistic techniques, styles, and visual storytelling elements.",
"Educational Tutor": "You are a patient and encouraging educational tutor. Break down complex concepts into simple, understandable explanations. When analyzing educational materials (images, videos, or documents), focus on learning objectives, key concepts, and provide additional context or examples to enhance understanding.",
"Technical Reviewer": "You are a technical expert specializing in analyzing technical documents, diagrams, code screenshots, and instructional videos. Provide detailed technical insights, identify potential issues, suggest improvements, and explain technical concepts with precision and accuracy.",
"Creative Storyteller": "You are a creative storyteller who brings visual content to life through engaging narratives. When analyzing images or videos, create compelling stories, describe scenes with rich detail, and help users explore the creative and emotional aspects of visual content.",
}
def check_file_size(file_path: str) -> bool:
"""Check if a file meets the size requirements for processing.
Validates that the file exists and is within the allowed size limits based on file type.
Video files (.mp4, .mov) are limited to 100MB; all other files (images, PDFs) are limited to 10MB.
Args:
file_path (str): The absolute path to the file to be checked.
Returns:
bool: True if the file meets size requirements.
Raises:
ValueError: If the file doesn't exist, or if the file size exceeds the maximum
allowed size for its type.
"""
if not os.path.exists(file_path):
raise ValueError(f"File not found: {file_path}")
file_size = os.path.getsize(file_path)
if file_path.lower().endswith((".mp4", ".mov")):
if file_size > MAX_VIDEO_SIZE:
raise ValueError(f"Video file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_VIDEO_SIZE / (1024*1024):.0f}MB")
else:
if file_size > MAX_IMAGE_SIZE:
raise ValueError(f"Image file too large: {file_size / (1024*1024):.1f}MB. Maximum allowed: {MAX_IMAGE_SIZE / (1024*1024):.0f}MB")
return True
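
# Illustrative usage sketch (not called by the app): the path below is a
# placeholder, and the try/except mirrors how callers in this module handle
# the ValueError raised for missing or oversized files.
def _example_check_file_size() -> None:
    try:
        check_file_size("/tmp/sample_clip.mp4")  # hypothetical path
        print("File is within the size limit.")
    except ValueError as exc:
        print(f"Rejected: {exc}")
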
def get_frames(video_path: str, max_images: int) -> list[tuple[Image.Image, float]]:
"""Extract frames from a video file at regular intervals.
Opens a video file and extracts frames at evenly distributed intervals to get
a representative sample of the video content. Each frame is converted to RGB
format and returned as a PIL Image along with its timestamp.
Args:
video_path (str): The absolute path to the video file (.mp4 or .mov).
max_images (int): The maximum number of frames to extract from the video.
Must be a positive integer.
Returns:
list[tuple[Image.Image, float]]: A list of tuples where each tuple contains
an Image.Image object (the extracted frame in RGB format) and a float
(the timestamp of the frame in seconds, rounded to 2 decimal places).
Raises:
ValueError: If the video file cannot be opened or if file size validation fails.
"""
check_file_size(video_path)
frames: list[tuple[Image.Image, float]] = []
capture = cv2.VideoCapture(video_path)
if not capture.isOpened():
raise ValueError(f"Could not open video file: {video_path}")
fps = capture.get(cv2.CAP_PROP_FPS)
total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
frame_interval = max(total_frames // max_images, 1)
max_position = min(total_frames, max_images * frame_interval)
i = 0
while i < max_position and len(frames) < max_images:
capture.set(cv2.CAP_PROP_POS_FRAMES, i)
success, image = capture.read()
if success:
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
pil_image = Image.fromarray(image)
timestamp = round(i / fps, 2)
frames.append((pil_image, timestamp))
i += frame_interval
capture.release()
return frames
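
# Illustrative usage sketch (not called by the app): shows how the (frame,
# timestamp) pairs returned by get_frames could be inspected; the video path
# and frame count are placeholders.
def _example_get_frames() -> None:
    frames = get_frames("/tmp/sample_clip.mp4", max_images=5)  # hypothetical path
    for image, timestamp in frames:
        print(f"{timestamp:>8.2f}s -> {image.size[0]}x{image.size[1]} RGB frame")
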
def process_video(video_path: str, max_images: int) -> list[dict]:
"""Process a video file and return formatted content for model input.
Extracts frames from a video file, saves them as temporary PNG files, and
formats them into a structure suitable for multimodal model input. Each frame
is paired with descriptive text indicating its timestamp.
Args:
video_path (str): The absolute path to the video file to be processed.
max_images (int): The maximum number of frames to extract and process.
Returns:
list[dict]: A list of dictionaries representing the processed video content.
The structure alternates between text descriptions and image references:
{"type": "text", "text": "Frame {timestamp}:"} and
{"type": "image", "url": "/path/to/temp/frame.png"}.
Note:
Creates temporary PNG files that are not automatically cleaned up.
The caller is responsible for cleanup if needed.
"""
result_content = []
frames = get_frames(video_path, max_images)
for frame in frames:
image, timestamp = frame
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
image.save(temp_file.name)
result_content.append({"type": "text", "text": f"Frame {timestamp}:"})
result_content.append({"type": "image", "url": temp_file.name})
logger.debug(
f"Processed {len(frames)} frames from video {video_path}: {result_content}"
)
return result_content
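
# Illustrative usage sketch (not called by the app): demonstrates the
# alternating text/image structure that process_video returns; the video path
# is a placeholder.
def _example_process_video() -> None:
    content = process_video("/tmp/sample_clip.mp4", max_images=3)  # hypothetical path
    for item in content:
        if item["type"] == "text":
            print(item["text"])
        else:
            print(f"  image file: {item['url']}")
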
def extract_pdf_text(pdf_path: str) -> str:
"""Extract text content from a PDF file.
Opens a PDF file and extracts all readable text content from each page.
Pages are numbered and formatted for readability. Empty pages are skipped.
Args:
pdf_path (str): The absolute path to the PDF file to be processed.
Returns:
str: The extracted text content with page numbers and formatting.
If no text is found, returns a message indicating no content was found.
Raises:
ValueError: If the file size validation fails or if PDF processing encounters
an error that prevents text extraction.
"""
check_file_size(pdf_path)
try:
doc = fitz.open(pdf_path)
text_content = []
for page_num in range(len(doc)):
page = doc.load_page(page_num)
text = page.get_text()
if text.strip(): # Only add non-empty pages
text_content.append(f"Page {page_num + 1}:\n{text}")
doc.close()
if not text_content:
return "No text content found in the PDF."
return "\n\n".join(text_content)
except Exception as e:
logger.error(f"Error extracting text from PDF {pdf_path}: {e}")
raise ValueError(f"Failed to extract text from PDF: {str(e)}")
def process_user_input(message: dict, max_images: int) -> list[dict]:
"""Process user input including files and return formatted content for the model.
Takes a user message that may contain text and file attachments, processes each
file according to its type, and returns a structured format suitable for
multimodal model input. Handles videos, PDFs, and image files.
Args:
message (dict): A dictionary containing user input with keys:
"text" (str) - The user's text message, and
"files" (list[str]) - List of file paths attached to the message.
max_images (int): Maximum number of frames to extract from video files.
Returns:
list[dict]: A list of dictionaries representing the processed content with
types "text" or "image" and corresponding content data. Includes error
messages for files that cannot be processed.
"""
if not message["files"]:
return [{"type": "text", "text": message["text"]}]
result_content = [{"type": "text", "text": message["text"]}]
for file_path in message["files"]:
try:
check_file_size(file_path)
except ValueError as e:
logger.error(f"File size check failed: {e}")
result_content.append({"type": "text", "text": f"Error: {str(e)}"})
continue
if file_path.lower().endswith((".mp4", ".mov")):
try:
result_content = [*result_content, *process_video(file_path, max_images)]
except Exception as e:
logger.error(f"Video processing failed: {e}")
result_content.append({"type": "text", "text": f"Error processing video: {str(e)}"})
elif file_path.lower().endswith(".pdf"):
try:
logger.info(f"Processing PDF file: {file_path}")
pdf_text = extract_pdf_text(file_path)
logger.debug(f"PDF text extracted successfully, length: {len(pdf_text)} characters")
result_content.append({"type": "text", "text": f"PDF Content:\n{pdf_text}"})
except ValueError as ve:
logger.error(f"PDF validation failed: {ve}")
result_content.append({"type": "text", "text": f"Error processing PDF: {str(ve)}"})
except Exception as e:
logger.error(f"PDF processing failed: {e}")
result_content.append({"type": "text", "text": f"Error processing PDF: {str(e)}"})
else:
result_content = [*result_content, {"type": "image", "url": file_path}]
return result_content
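
# Illustrative usage sketch (not called by the app): the message dict follows
# the {"text", "files"} shape documented above; the attached paths are
# placeholders.
def _example_process_user_input() -> None:
    message = {
        "text": "Summarize the attached slide and clip.",
        "files": ["/tmp/slide.png", "/tmp/sample_clip.mp4"],  # hypothetical paths
    }
    for item in process_user_input(message, max_images=3):
        print(item["type"], str(item.get("url") or item.get("text", ""))[:60])
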
def process_history(history: list[dict]) -> list[dict]:
"""Process chat history into the format expected by the model.
Converts chat history from the UI format into the structured format required
by multimodal language models. Groups consecutive user messages and handles
different content types (text, images, videos, PDFs) appropriately.
Args:
history (list[dict]): A list of chat history items, where each item contains
"role" (str) - either "user" or "assistant", and
"content" - the message content (str for text, tuple for files).
Returns:
list[dict]: A list of messages formatted for the model with "role" and
"content" keys, where content is a list of dictionaries with "type"
and associated data.
Note:
Groups consecutive user messages into a single message. Videos and PDFs
in history are replaced with placeholder text to avoid reprocessing.
"""
messages = []
content_buffer = []
for item in history:
if item["role"] == "assistant":
if content_buffer:
messages.append({"role": "user", "content": content_buffer})
content_buffer = []
messages.append(
{
"role": "assistant",
"content": [{"type": "text", "text": item["content"]}],
}
)
else:
content = item["content"]
if isinstance(content, str):
content_buffer.append({"type": "text", "text": content})
elif isinstance(content, tuple) and len(content) > 0:
file_path = content[0]
if file_path.lower().endswith((".mp4", ".mov")):
content_buffer.append({"type": "text", "text": "[Video uploaded previously]"})
elif file_path.lower().endswith(".pdf"):
content_buffer.append({"type": "text", "text": "[PDF uploaded previously]"})
else:
content_buffer.append({"type": "image", "url": file_path})
if content_buffer:
messages.append({"role": "user", "content": content_buffer})
return messages
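
# Illustrative usage sketch (not called by the app): a minimal history in the
# UI format described above, showing how consecutive user entries are grouped
# into a single model message; the image path is a placeholder.
def _example_process_history() -> None:
    history = [
        {"role": "user", "content": "What is in this picture?"},
        {"role": "user", "content": ("/tmp/slide.png",)},  # hypothetical path
        {"role": "assistant", "content": "It shows a bar chart of quarterly sales."},
    ]
    for msg in process_history(history):
        print(msg["role"], [part["type"] for part in msg["content"]])
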
def update_custom_prompt(preset_choice: str) -> str:
"""Update the custom prompt based on preset selection.
Returns the appropriate preset prompt text based on the user's selection.
If "Custom Prompt" is selected, returns an empty string to allow manual input.
Args:
preset_choice (str): The name of the selected preset prompt. Should match
one of the keys in PRESET_PROMPTS or be "Custom Prompt".
Returns:
str: The preset prompt text corresponding to the selection, or an empty
string if "Custom Prompt" is selected or if the preset is not found.
"""
if preset_choice == "Custom Prompt":
return ""
return PRESET_PROMPTS.get(preset_choice, "")
def get_preset_prompts() -> dict[str, str]:
"""Return the dictionary of preset prompts for the main application.
Provides a copy of the predefined prompt templates that can be used throughout
the application. Each preset is designed for a specific use case and contains
detailed instructions for the AI model's behavior.
Returns:
dict[str, str]: A dictionary mapping preset names to their prompt texts.
Includes prompts for general assistance, document analysis, visual content
analysis, educational tutoring, technical review, and creative storytelling.
Note:
Returns a copy of the PRESET_PROMPTS dictionary to prevent accidental
modification of the original constants.
"""
return PRESET_PROMPTS.copy()