File size: 14,399 Bytes
ceb2ea0
 
 
 
 
 
 
 
 
 
 
ce78f65
 
 
 
 
 
 
 
 
 
 
 
 
ceb2ea0
 
54e151e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ceb2ea0
 
 
 
 
1a184e0
ceb2ea0
 
 
 
1a184e0
ceb2ea0
 
 
 
 
54e151e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ceb2ea0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54e151e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ceb2ea0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54e151e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ceb2ea0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54e151e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ceb2ea0
 
 
 
 
 
 
 
 
 
 
 
 
1a184e0
ceb2ea0
 
 
 
 
1a184e0
ceb2ea0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54e151e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ceb2ea0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a184e0
ceb2ea0
1a184e0
ceb2ea0
 
 
 
 
 
 
 
 
 
ce78f65
54e151e
 
 
 
 
 
 
 
 
 
 
 
 
ce78f65
 
 
ceb2ea0
 
ce78f65
54e151e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
import os
import cv2
import fitz
import tempfile
from PIL import Image
from loguru import logger

# Upload size ceilings, expressed in bytes.
MAX_VIDEO_SIZE = 100 * 1024**2  # 100 MB cap applied to video uploads
MAX_IMAGE_SIZE = 10 * 1024**2  # 10 MB cap applied to non-video uploads

# Built-in system prompts, keyed by the preset's display name.
# Each value is the full system-prompt text associated with that preset.
PRESET_PROMPTS = {
    "General Assistant": "You are a helpful AI assistant capable of analyzing images, videos, and PDF documents. Provide clear, accurate, and helpful responses to user queries.",
    
    "Document Analyzer": "You are a specialized document analysis assistant. Focus on extracting key information, summarizing content, and answering specific questions about uploaded documents. For PDFs, provide structured analysis including main topics, key points, and relevant details. For images containing text, perform OCR-like analysis.",
    
    "Visual Content Expert": "You are an expert in visual content analysis. When analyzing images, provide detailed descriptions of visual elements, composition, colors, objects, people, and scenes. For videos, describe the sequence of events, movements, and changes between frames. Identify artistic techniques, styles, and visual storytelling elements.",
    
    "Educational Tutor": "You are a patient and encouraging educational tutor. Break down complex concepts into simple, understandable explanations. When analyzing educational materials (images, videos, or documents), focus on learning objectives, key concepts, and provide additional context or examples to enhance understanding.",
    
    "Technical Reviewer": "You are a technical expert specializing in analyzing technical documents, diagrams, code screenshots, and instructional videos. Provide detailed technical insights, identify potential issues, suggest improvements, and explain technical concepts with precision and accuracy.",
    
    "Creative Storyteller": "You are a creative storyteller who brings visual content to life through engaging narratives. When analyzing images or videos, create compelling stories, describe scenes with rich detail, and help users explore the creative and emotional aspects of visual content.",
}

def check_file_size(file_path: str) -> bool:
    """Check that a file exists and is within the size limit for its type.

    Video files (.mp4, .mov, matched case-insensitively) may be up to
    MAX_VIDEO_SIZE bytes; every other file (images, PDFs) may be up to
    MAX_IMAGE_SIZE bytes.

    Args:
        file_path (str): Path to the file to be checked.

    Returns:
        bool: True if the file meets the size requirements.

    Raises:
        ValueError: If the path does not exist, is not a regular file, or the
            file size exceeds the maximum allowed size for its type.
    """
    if not os.path.exists(file_path):
        raise ValueError(f"File not found: {file_path}")
    # A directory "exists" and reports a size, but cannot be processed as media.
    if not os.path.isfile(file_path):
        raise ValueError(f"Not a regular file: {file_path}")

    file_size = os.path.getsize(file_path)
    lowered = file_path.lower()

    # Pick the label used in the error message and the applicable ceiling.
    if lowered.endswith((".mp4", ".mov")):
        label, limit = "Video", MAX_VIDEO_SIZE
    elif lowered.endswith(".pdf"):
        # PDFs share the image ceiling but should not be reported as images.
        label, limit = "PDF", MAX_IMAGE_SIZE
    else:
        label, limit = "Image", MAX_IMAGE_SIZE

    if file_size > limit:
        raise ValueError(
            f"{label} file too large: {file_size / (1024*1024):.1f}MB. "
            f"Maximum allowed: {limit / (1024*1024):.0f}MB"
        )

    return True


def get_frames(video_path: str, max_images: int) -> list[tuple[Image.Image, float]]:
    """Extract up to ``max_images`` evenly spaced frames from a video file.

    The video is sampled at a fixed frame interval derived from its reported
    frame count. Each successfully decoded frame is converted from BGR to RGB
    and returned as a PIL image together with its timestamp.

    Args:
        video_path (str): Path to the video file (.mp4 or .mov).
        max_images (int): The maximum number of frames to extract from the
            video. Must be a positive integer.

    Returns:
        list[tuple[Image.Image, float]]: A list of ``(frame, timestamp)`` tuples,
            where ``timestamp`` is in seconds rounded to 2 decimal places. If
            the container reports no usable frame rate, the timestamp is 0.0.

    Raises:
        ValueError: If ``max_images`` is not positive, if the video file cannot
            be opened, or if file size validation fails.
    """
    # Integral floats (e.g. 5.0) are tolerated; range() below needs a real int.
    max_images = int(max_images)
    if max_images < 1:
        raise ValueError(f"max_images must be a positive integer, got {max_images}")

    check_file_size(video_path)

    frames: list[tuple[Image.Image, float]] = []
    capture = cv2.VideoCapture(video_path)
    try:
        if not capture.isOpened():
            raise ValueError(f"Could not open video file: {video_path}")

        fps = capture.get(cv2.CAP_PROP_FPS)
        total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))

        frame_interval = max(total_frames // max_images, 1)
        max_position = min(total_frames, max_images * frame_interval)

        # The range yields at most max_images positions, so no extra cap is needed.
        for position in range(0, max_position, frame_interval):
            capture.set(cv2.CAP_PROP_POS_FRAMES, position)
            success, image = capture.read()
            if not success:
                continue  # skip frames that fail to decode

            rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            # Guard against a zero/invalid reported FPS instead of dividing by it.
            timestamp = round(position / fps, 2) if fps > 0 else 0.0
            frames.append((Image.fromarray(rgb_image), timestamp))
    finally:
        # Always free the decoder handle, even if decoding raised.
        capture.release()

    return frames


def process_video(video_path: str, max_images: int) -> list[dict]:
    """Turn a video into alternating text/image entries for model input.

    Frames are sampled via ``get_frames``; each one is written to its own
    temporary PNG file and emitted as a timestamp caption followed by an image
    reference pointing at that file.

    Args:
        video_path (str): Path to the video file to be processed.
        max_images (int): The maximum number of frames to extract and process.

    Returns:
        list[dict]: Entries alternating between
            {"type": "text", "text": "Frame {timestamp}:"} and
            {"type": "image", "url": "/path/to/temp/frame.png"}.

    Note:
        The temporary PNG files are created with ``delete=False`` and are not
        removed here; the caller is responsible for cleanup if needed.
    """
    frames = get_frames(video_path, max_images)
    result_content: list[dict] = []

    for image, timestamp in frames:
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as temp_file:
            frame_path = temp_file.name
            image.save(frame_path)
            # Caption first, then the frame it describes.
            result_content.extend(
                (
                    {"type": "text", "text": f"Frame {timestamp}:"},
                    {"type": "image", "url": frame_path},
                )
            )

    logger.debug(
        f"Processed {len(frames)} frames from video {video_path} with frames {result_content}"
    )
    return result_content


def extract_pdf_text(pdf_path: str) -> str:
    """Extract text content from a PDF file.

    Opens the PDF, reads the text of every page, and joins the non-empty pages
    into one string. Each page is prefixed with its 1-based page number and
    pages are separated by a blank line.

    Args:
        pdf_path (str): Path to the PDF file to be processed.

    Returns:
        str: The extracted text with page headers, or the message
            "No text content found in the PDF." when no page contains text.

    Raises:
        ValueError: If the file size validation fails or if PDF processing
            encounters an error that prevents text extraction. In the latter
            case the original exception is attached as ``__cause__``.
    """
    check_file_size(pdf_path)

    try:
        doc = fitz.open(pdf_path)
        try:
            text_content = []
            for page_num in range(len(doc)):
                text = doc.load_page(page_num).get_text()
                if text.strip():  # Only add non-empty pages
                    text_content.append(f"Page {page_num + 1}:\n{text}")
        finally:
            # Release the document even when a page fails to load or extract.
            doc.close()
    except Exception as e:
        logger.error(f"Error extracting text from PDF {pdf_path}: {e}")
        raise ValueError(f"Failed to extract text from PDF: {str(e)}") from e

    if not text_content:
        return "No text content found in the PDF."

    return "\n\n".join(text_content)


def process_user_input(message: dict, max_images: int) -> list[dict]:
    """Process user input including files and return formatted content for the model.

    The user's text always comes first. Each attached file is then size-checked
    and dispatched by extension (matched case-insensitively): videos are
    expanded into frames, PDFs into extracted text, and anything else is passed
    through as an image reference. A file that fails is replaced by an error
    text entry so the remaining files are still processed.

    Args:
        message (dict): A dictionary containing user input with keys:
            "text" (str) - The user's text message, and
            "files" (list[str]) - List of file paths attached to the message.
            A missing or empty "files" value is treated as no attachments.
        max_images (int): Maximum number of frames to extract from video files.

    Returns:
        list[dict]: A list of dictionaries representing the processed content with
            types "text" or "image" and corresponding content data. Includes error
            messages for files that cannot be processed.
    """
    result_content = [{"type": "text", "text": message["text"]}]

    for file_path in message.get("files") or []:
        try:
            check_file_size(file_path)
        except ValueError as e:
            logger.error(f"File size check failed: {e}")
            result_content.append({"type": "text", "text": f"Error: {str(e)}"})
            continue

        # Lower-case once so every extension test agrees with check_file_size.
        lowered = file_path.lower()

        if lowered.endswith((".mp4", ".mov")):
            try:
                result_content.extend(process_video(file_path, max_images))
            except Exception as e:
                logger.error(f"Video processing failed: {e}")
                result_content.append({"type": "text", "text": f"Error processing video: {str(e)}"})
        elif lowered.endswith(".pdf"):
            try:
                logger.info(f"Processing PDF file: {file_path}")
                pdf_text = extract_pdf_text(file_path)
                logger.debug(f"PDF text extracted successfully, length: {len(pdf_text)} characters")
                result_content.append({"type": "text", "text": f"PDF Content:\n{pdf_text}"})
            except ValueError as ve:
                logger.error(f"PDF validation failed: {ve}")
                result_content.append({"type": "text", "text": f"Error processing PDF: {str(ve)}"})
            except Exception as e:
                logger.error(f"PDF processing failed: {e}")
                result_content.append({"type": "text", "text": f"Error processing PDF: {str(e)}"})
        else:
            # Anything that is neither video nor PDF is forwarded as an image.
            result_content.append({"type": "image", "url": file_path})

    return result_content


def process_history(history: list[dict]) -> list[dict]:
    """Process chat history into the format expected by the model.

    Consecutive user items are merged into a single user message; each
    assistant item becomes its own message and flushes any pending user
    content first. File extensions are matched case-insensitively.

    Args:
        history (list[dict]): A list of chat history items, where each item contains
            "role" (str) - either "user" or "assistant", and
            "content" - the message content (str for text, tuple for files).

    Returns:
        list[dict]: A list of messages formatted for the model with "role" and
            "content" keys, where content is a list of dictionaries with "type"
            and associated data.

    Note:
        Videos and PDFs in history are replaced with placeholder text to avoid
        reprocessing. User content that is neither a string nor a non-empty
        tuple is ignored.
    """
    messages = []
    content_buffer = []

    for item in history:
        if item["role"] == "assistant":
            # Flush buffered user content so turn order is preserved.
            if content_buffer:
                messages.append({"role": "user", "content": content_buffer})
                content_buffer = []

            messages.append(
                {
                    "role": "assistant",
                    "content": [{"type": "text", "text": item["content"]}],
                }
            )
            continue

        content = item["content"]
        if isinstance(content, str):
            content_buffer.append({"type": "text", "text": content})
        elif isinstance(content, tuple) and len(content) > 0:
            file_path = content[0]
            # Compare on a lower-cased copy so ".MP4"/".MOV" are not mistaken for images.
            lowered = file_path.lower()
            if lowered.endswith((".mp4", ".mov")):
                content_buffer.append({"type": "text", "text": "[Video uploaded previously]"})
            elif lowered.endswith(".pdf"):
                content_buffer.append({"type": "text", "text": "[PDF uploaded previously]"})
            else:
                content_buffer.append({"type": "image", "url": file_path})

    if content_buffer:
        messages.append({"role": "user", "content": content_buffer})

    return messages


def update_custom_prompt(preset_choice: str) -> str:
    """Resolve a preset selection to the prompt text it stands for.

    Args:
        preset_choice (str): The name of the selected preset. Expected to be a
            key of PRESET_PROMPTS or the literal "Custom Prompt".

    Returns:
        str: The matching preset prompt, or an empty string when
            "Custom Prompt" is selected (so the user can type their own) or
            when the name is not a known preset.
    """
    is_custom = preset_choice == "Custom Prompt"
    return "" if is_custom else PRESET_PROMPTS.get(preset_choice, "")


def get_preset_prompts() -> dict[str, str]:
    """Hand out the preset prompts as a fresh dictionary.

    Returns:
        dict[str, str]: A mapping of preset names to their prompt texts,
            mirroring the current contents of PRESET_PROMPTS.

    Note:
        The result is a shallow copy, so adding or removing keys on it does
        not affect the module-level PRESET_PROMPTS constant.
    """
    return dict(PRESET_PROMPTS)