David committed · Commit 2e375e5 · 1 parent: 5d46fa8
add pages selection

Files changed:
- describepdf/cli.py +10 -0
- describepdf/config.py +9 -35
- describepdf/core.py +82 -4
- describepdf/ui.py +26 -12
- describepdf/ui_ollama.py +32 -15
describepdf/cli.py
CHANGED
@@ -61,6 +61,10 @@ def setup_cli_parser() -> argparse.ArgumentParser:
         help="VLM model to use (default: configured in .env)"
     )
 
+    parser.add_argument(
+        "--pages",
+        help="Pages to process (e.g. '1,3,5-10,15'). Default: all pages."
+    )
     parser.add_argument(
         "-l", "--language",
         help="Output language (default: configured in .env)"
@@ -157,6 +161,7 @@ def run_cli() -> None:
         "output_language": args.language if args.language else env_config.get("output_language"),
         "use_markitdown": args.use_markitdown if args.use_markitdown is not None else env_config.get("use_markitdown"),
         "use_summary": args.use_summary if args.use_summary is not None else env_config.get("use_summary"),
+        "page_selection": args.pages if args.pages else env_config.get("page_selection")
     }
 
     # Configure provider-specific settings
@@ -219,6 +224,11 @@ def run_cli() -> None:
     logger.info(f"Summary: {'Yes' if run_config['use_summary'] else 'No'}")
     if run_config.get('use_summary') and run_config.get('summary_llm_model'):
         logger.info(f"Summary model: {run_config['summary_llm_model']}")
+
+    if run_config.get('page_selection'):
+        logger.info(f"Page selection: {run_config['page_selection']}")
+    else:
+        logger.info("Page selection: All pages")
 
     # Create progress callback
     progress_callback = create_progress_callback()
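For context, a minimal standalone sketch of how the new flag behaves once merged with the .env fallback (the parser and env_config below are simplified stand-ins, not the full describepdf CLI):

# Sketch of the --pages flag added above; env_config is a simplified stand-in for load_env_config().
import argparse

parser = argparse.ArgumentParser(prog="describepdf")
parser.add_argument("--pages", help="Pages to process (e.g. '1,3,5-10,15'). Default: all pages.")

env_config = {"page_selection": None}  # illustrative default, as if nothing were set in .env

args = parser.parse_args(["--pages", "1,3,5-10"])
run_config = {
    # the CLI value wins; otherwise fall back to the .env-derived default
    "page_selection": args.pages if args.pages else env_config.get("page_selection"),
}
print(run_config)  # {'page_selection': '1,3,5-10'}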
describepdf/config.py
CHANGED
@@ -6,7 +6,7 @@ and prompt templates from files.
 """
 import os
 import logging
-from typing import Dict, Any, Optional
+from typing import Dict, Any, Optional, List
 from dotenv import load_dotenv
 import pathlib
 
@@ -14,41 +14,11 @@ import pathlib
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(module)s] - %(message)s')
 logger = logging.getLogger('describepdf')
 
-def _resolve_prompts_directory() -> pathlib.Path:
-    """
-    Resolve the path to the prompts directory with multiple fallback strategies.
-    Returns:
-        pathlib.Path: Path to the prompts directory
-    """
-    # List of potential paths to check
-    potential_paths = [
-        # Current file's parent directory
-        pathlib.Path(__file__).parent.parent / "prompts",
-
-        # Relative to the current working directory
-        pathlib.Path.cwd() / "prompts",
-
-        # Absolute path fallback (useful in deployment)
-        pathlib.Path("/app/prompts"),
-        pathlib.Path("/workspace/prompts"),
-
-        # Hugging Face Spaces specific path
-        pathlib.Path("/home/user/app/prompts")
-    ]
-
-    # Try each path
-    for path in potential_paths:
-        if path.is_dir():
-            logger.info(f"Prompts directory found at: {path}")
-            return path
-
-    # If no path is found
-    logger.error("Could not locate prompts directory. Using a temporary fallback.")
-    return pathlib.Path(__file__).parent / "prompts"
-
 # Directory containing prompt templates (making path absolute by using current file location)
 SCRIPT_DIR = pathlib.Path(__file__).parent.parent.absolute()
-PROMPTS_DIR =
+PROMPTS_DIR = pathlib.Path(SCRIPT_DIR) / "prompts"
+
+# Default configuration values
 
 # Default configuration values
 DEFAULT_CONFIG: Dict[str, Any] = {
@@ -62,7 +32,8 @@ DEFAULT_CONFIG: Dict[str, Any] = {
 
     "output_language": "English",
     "use_markitdown": False,
-    "use_summary": False
+    "use_summary": False,
+    "page_selection": None
 }
 
 # Mapping of prompt template identifiers to their file names
@@ -122,6 +93,9 @@ def load_env_config() -> Dict[str, Any]:
 
     if os.getenv("DEFAULT_USE_SUMMARY"):
         loaded_config["use_summary"] = str(os.getenv("DEFAULT_USE_SUMMARY")).lower() == 'true'
+
+    if os.getenv("DEFAULT_PAGE_SELECTION"):
+        loaded_config["page_selection"] = os.getenv("DEFAULT_PAGE_SELECTION")
 
     logger.info("Configuration loaded from environment variables.")
 
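A small sketch of how the new DEFAULT_PAGE_SELECTION variable flows into the loaded configuration (illustrative value; assumes python-dotenv is available, as config.py already requires):

# Sketch: DEFAULT_PAGE_SELECTION overriding the new "page_selection": None default.
import os
from dotenv import load_dotenv

load_dotenv()  # would pick up DEFAULT_PAGE_SELECTION from a .env file if present
os.environ.setdefault("DEFAULT_PAGE_SELECTION", "1,3,5-10")  # illustrative value for this sketch

loaded_config = {"page_selection": None}  # mirrors the DEFAULT_CONFIG entry added above
if os.getenv("DEFAULT_PAGE_SELECTION"):
    loaded_config["page_selection"] = os.getenv("DEFAULT_PAGE_SELECTION")

print(loaded_config["page_selection"])  # 1,3,5-10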
describepdf/core.py
CHANGED
@@ -24,22 +24,87 @@ class ConversionError(Exception):
     """Error raised during PDF conversion process."""
     pass
 
-def format_markdown_output(descriptions: List[str], original_filename: str) -> str:
+def parse_page_selection(selection_string: Optional[str], total_pages: int) -> List[int]:
+    """
+    Parse a page selection string into a list of page indices.
+
+    Args:
+        selection_string: String with page selection (e.g. "1,3,5-10,15")
+        total_pages: Total number of pages in the document
+
+    Returns:
+        List[int]: List of zero-based page indices to process
+    """
+    if not selection_string:
+        # Return all pages if selection is empty
+        return list(range(total_pages))
+
+    page_indices = []
+
+    try:
+        sections = selection_string.split(',')
+        for section in sections:
+            section = section.strip()
+            if not section:
+                continue
+
+            if '-' in section:
+                # Handle page range
+                start, end = section.split('-', 1)
+                start_idx = int(start.strip()) - 1  # Convert to 0-based index
+                end_idx = int(end.strip()) - 1
+
+                # Validate range
+                if start_idx < 0 or end_idx >= total_pages or start_idx > end_idx:
+                    logger.warning(f"Invalid page range: {section}. Must be between 1 and {total_pages}.")
+                    continue
+
+                page_indices.extend(range(start_idx, end_idx + 1))
+            else:
+                # Handle single page
+                page_idx = int(section) - 1  # Convert to 0-based index
+
+                # Validate page number
+                if page_idx < 0 or page_idx >= total_pages:
+                    logger.warning(f"Invalid page number: {section}. Must be between 1 and {total_pages}.")
+                    continue
+
+                page_indices.append(page_idx)
+
+        # Remove duplicates and sort
+        page_indices = sorted(set(page_indices))
+
+        if not page_indices:
+            logger.warning("No valid pages specified. Processing all pages.")
+            return list(range(total_pages))
+
+        return page_indices
+
+    except ValueError as e:
+        logger.error(f"Error parsing page selection '{selection_string}': {e}. Processing all pages.")
+        return list(range(total_pages))
+
+def format_markdown_output(descriptions: List[str], original_filename: str, page_numbers: Optional[List[int]] = None) -> str:
     """
     Combine page descriptions into a single Markdown file.
 
     Args:
         descriptions: List of strings, each being a description of a page
         original_filename: Name of the original PDF file
+        page_numbers: List of actual page numbers corresponding to descriptions (1-based)
 
     Returns:
         str: Complete Markdown content
     """
     md_content = f"# Description of PDF: {original_filename}\n\n"
+
     for i, desc in enumerate(descriptions):
-        md_content += f"## Page {i + 1}\n\n"
+        # Use actual page number if provided, otherwise use sequential numbering
+        page_num = page_numbers[i] if page_numbers else (i + 1)
+        md_content += f"## Page {page_num}\n\n"
         md_content += desc if desc else "*No description generated for this page.*"
         md_content += "\n\n---\n\n"
+
     return md_content
 
 def convert_pdf_to_markdown(
@@ -175,7 +240,17 @@ def convert_pdf_to_markdown(
     page_processing_progress_start = pdf_load_progress
     total_page_progress_ratio = (0.98 - page_processing_progress_start) if total_pages > 0 else 0
 
-    for i, page in enumerate(pages):
+    # Parse page selection
+    page_selection = cfg.get("page_selection")
+    selected_indices = parse_page_selection(page_selection, total_pages)
+
+    if page_selection:
+        logger.info(f"Processing {len(selected_indices)} selected pages out of {total_pages} total pages.")
+    else:
+        logger.info(f"Processing all {total_pages} pages.")
+
+    for i in selected_indices:
+        page = pages[i]
         page_num = i + 1
         current_page_ratio = (page_num / total_pages) if total_pages > 0 else 1.0
 
@@ -306,7 +381,10 @@ def convert_pdf_to_markdown(
     # Generate final markdown
     final_progress = 0.99
     progress_callback(final_progress, "Combining page descriptions into final Markdown...")
-    final_markdown = format_markdown_output(all_descriptions, original_filename)
+
+    actual_page_numbers = [i + 1 for i in selected_indices] if 'selected_indices' in locals() else None
+
+    final_markdown = format_markdown_output(all_descriptions, original_filename, actual_page_numbers)
     logger.info("Final Markdown content assembled.")
 
     # Report completion
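To make the new helper's behavior concrete, a quick usage sketch (assumes the describepdf package is importable, e.g. after an editable install; the outputs follow from the parsing logic above):

# Usage sketch for the parse_page_selection helper added above.
from describepdf.core import parse_page_selection

# 1-based selection string over a 20-page document -> sorted, de-duplicated 0-based indices
print(parse_page_selection("1,3,5-10,15", total_pages=20))  # [0, 2, 4, 5, 6, 7, 8, 9, 14]

# Out-of-range entries are skipped with a warning; if nothing valid remains, all pages are processed
print(parse_page_selection("50", total_pages=5))  # [0, 1, 2, 3, 4]

# An empty selection means "all pages"
print(parse_page_selection(None, total_pages=3))  # [0, 1, 2]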
describepdf/ui.py
CHANGED
@@ -21,7 +21,7 @@ theme = gr.themes.Soft(
     spacing_size="lg",
 )
 
-def generate(
+def convert_pdf_to_descriptive_markdown(
     pdf_file_obj: Optional[gr.File],
     ui_api_key: str,
     ui_vlm_model: str,
@@ -29,25 +29,32 @@ def generate(
     ui_use_md: bool,
     ui_use_sum: bool,
     ui_sum_model: str,
+    ui_page_selection: str,
     progress: gr.Progress = gr.Progress(track_tqdm=True)
 ) -> Tuple[str, gr.update, Optional[str]]:
     """
-    [original one-line docstring summary not preserved in this view]
+    Convert a PDF file to detailed page-by-page Markdown descriptions using Vision-Language Models.
+
+    This function processes the uploaded PDF, analyzing the visual and textual content of each page
+    using OpenRouter's Vision-Language Models (VLMs). It generates rich, contextual descriptions in
+    Markdown format that capture both the visual elements and text content of the document, making
+    the PDF accessible and searchable in contexts where traditional text extraction would fail.
 
     Args:
         pdf_file_obj: Gradio File object for the uploaded PDF
         ui_api_key: OpenRouter API key from UI
-        ui_vlm_model: VLM model name from UI
-        ui_lang: Output language
-        ui_use_md: Whether to use Markitdown
-        ui_use_sum: Whether to generate a summary
-        ui_sum_model: Summary model name from UI
+        ui_vlm_model: VLM model name from UI (e.g., qwen/qwen2.5-vl-72b-instruct)
+        ui_lang: Output language for descriptions (e.g., English, Spanish)
+        ui_use_md: Whether to use Markitdown for enhanced text extraction
+        ui_use_sum: Whether to generate a document summary for context
+        ui_sum_model: Summary model name from UI (e.g., google/gemini-2.5-flash-preview)
+        ui_page_selection: Optional page selection string (e.g., "1,3,5-10")
         progress: Gradio progress tracker
 
     Returns:
         Tuple containing:
-        - str: Status message
-        - gr.update: Download button update
+        - str: Status message indicating success or failure
+        - gr.update: Download button update with the result file
         - Optional[str]: Markdown result content
     """
     # Validate input file
@@ -67,7 +74,8 @@ def generate(
         "output_language": ui_lang,
         "use_markitdown": ui_use_md,
         "use_summary": ui_use_sum,
-        "summary_llm_model": ui_sum_model if ui_sum_model else env_config.get("or_summary_model")
+        "summary_llm_model": ui_sum_model if ui_sum_model else env_config.get("or_summary_model"),
+        "page_selection": ui_page_selection.strip() if ui_page_selection.strip() else None
     }
 
     # Validate API key
@@ -238,6 +246,12 @@ def create_ui() -> gr.Blocks:
            allow_custom_value=True,
            info="Select or type the desired output language (e.g., English, Spanish)"
        )
+       page_selection_input = gr.Textbox(
+           label="Page Selection (Optional)",
+           value="",
+           placeholder="Example: 1,3,5-10,15 (leave empty for all pages)",
+           info="Specify individual pages or ranges to process"
+       )
        with gr.Row():
            use_markitdown_checkbox = gr.Checkbox(
                label="Use Markitdown for extra text context",
@@ -258,13 +272,13 @@
        # Connect UI components
        conversion_inputs = [
            pdf_input, api_key_input, vlm_model_input, output_language_input,
-           use_markitdown_checkbox, use_summary_checkbox, summary_llm_model_input
+           use_markitdown_checkbox, use_summary_checkbox, summary_llm_model_input, page_selection_input
        ]
        conversion_outputs = [
            progress_output, download_button, markdown_output
        ]
        convert_button.click(
-           fn=generate,
+           fn=convert_pdf_to_descriptive_markdown,
            inputs=conversion_inputs,
            outputs=conversion_outputs
        )
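The same empty-string-to-None normalization, shown in a minimal standalone Gradio sketch (hypothetical component names and handler; not the actual DescribePDF UI):

# Minimal Gradio sketch mirroring the page-selection textbox wiring above (hypothetical, standalone).
import gradio as gr

def describe(pdf_path: str, pages: str) -> str:
    # Empty or whitespace-only input means "all pages", matching the change above.
    page_selection = pages.strip() if pages.strip() else None
    return f"Would process: {page_selection or 'all pages'} of {pdf_path or '(no file)'}"

with gr.Blocks() as demo:
    pdf_input = gr.Textbox(label="PDF path")
    page_selection_input = gr.Textbox(
        label="Page Selection (Optional)",
        placeholder="Example: 1,3,5-10,15 (leave empty for all pages)",
    )
    result = gr.Markdown()
    gr.Button("Convert").click(fn=describe, inputs=[pdf_input, page_selection_input], outputs=[result])

# demo.launch()  # uncomment to run locally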
describepdf/ui_ollama.py
CHANGED
@@ -22,7 +22,7 @@ theme = gr.themes.Soft(
     spacing_size="lg",
 )
 
-def generate(
+def convert_pdf_to_descriptive_markdown(
     pdf_file_obj: Optional[gr.File],
     ollama_endpoint: str,
     ui_vlm_model: str,
@@ -30,25 +30,36 @@ def generate(
     ui_use_md: bool,
     ui_use_sum: bool,
     ui_sum_model: str,
+    ui_page_selection: str,
     progress: gr.Progress = gr.Progress(track_tqdm=True)
 ) -> Tuple[str, gr.update, Optional[str]]:
     """
-    [original one-line docstring summary not preserved in this view]
+    Convert a PDF file to detailed page-by-page Markdown descriptions using local Ollama Vision-Language Models.
+
+    This function processes the uploaded PDF, analyzing the visual and textual content of each page
+    using locally hosted Vision-Language Models (VLMs) through Ollama. It generates rich, contextual
+    descriptions in Markdown format that capture both the visual elements and text content of the document,
+    making the PDF accessible and searchable in contexts where traditional text extraction would fail.
+
+    Unlike the OpenRouter version, this function utilizes local models running through Ollama,
+    providing privacy and eliminating the need for API keys, but potentially with different model options
+    and performance characteristics.
 
     Args:
         pdf_file_obj: Gradio File object for the uploaded PDF
-        ollama_endpoint: Ollama server endpoint URL
-        ui_vlm_model: VLM model name from UI
-        ui_lang: Output language
-        ui_use_md: Whether to use Markitdown
-        ui_use_sum: Whether to generate a summary
-        ui_sum_model: Summary model name from UI
+        ollama_endpoint: Ollama server endpoint URL (e.g., http://localhost:11434)
+        ui_vlm_model: VLM model name from UI (e.g., llama3.2-vision)
+        ui_lang: Output language for descriptions (e.g., English, Spanish)
+        ui_use_md: Whether to use Markitdown for enhanced text extraction
+        ui_use_sum: Whether to generate a document summary for context
+        ui_sum_model: Summary model name from UI (e.g., qwen2.5)
+        ui_page_selection: Optional page selection string (e.g., "1,3,5-10")
         progress: Gradio progress tracker
 
     Returns:
         Tuple containing:
-        - str: Status message
-        - gr.update: Download button update
+        - str: Status message indicating success or failure
+        - gr.update: Download button update with the result file
        - Optional[str]: Markdown result content
     """
     # Validate input file
@@ -69,7 +80,8 @@ def generate(
         "output_language": ui_lang,
         "use_markitdown": ui_use_md,
         "use_summary": ui_use_sum,
-        "summary_llm_model": ui_sum_model
+        "summary_llm_model": ui_sum_model,
+        "page_selection": ui_page_selection.strip() if ui_page_selection.strip() else None
     }
 
     # Create progress callback for Gradio
@@ -160,7 +172,7 @@ def create_ui() -> gr.Blocks:
        gr.Markdown("<center><img src='https://davidlms.github.io/DescribePDF/assets/poster.png' alt='Describe PDF Logo' width='600px'/></center>")
        gr.Markdown(
            """<div style="display: flex;align-items: center;justify-content: center">
-           [<a href="https://davidlms.github.io/
+           [<a href="https://davidlms.github.io/DescribePDF/">Project Page</a>] | [<a href="https://github.com/DavidLMS/describepdf">Github</a>]</div>
            """
        )
        gr.Markdown(
@@ -223,6 +235,12 @@
            allow_custom_value=True,
            info="Select or type the desired output language (e.g., English, Spanish)"
        )
+       page_selection_input = gr.Textbox(
+           label="Page Selection (Optional)",
+           value="",
+           placeholder="Example: 1,3,5-10,15 (leave empty for all pages)",
+           info="Specify individual pages or ranges to process"
+       )
        with gr.Row():
            use_markitdown_checkbox = gr.Checkbox(
                label="Use Markitdown for extra text context",
@@ -239,17 +257,16 @@
            allow_custom_value=True,
            info="Select or type the Ollama LLM model name for summaries"
        )
-
        # Connect UI components
        conversion_inputs = [
            pdf_input, ollama_endpoint_input, vlm_model_input, output_language_input,
-           use_markitdown_checkbox, use_summary_checkbox, summary_llm_model_input
+           use_markitdown_checkbox, use_summary_checkbox, summary_llm_model_input, page_selection_input
        ]
        conversion_outputs = [
            progress_output, download_button, markdown_output
        ]
        convert_button.click(
-           fn=generate,
+           fn=convert_pdf_to_descriptive_markdown,
            inputs=conversion_inputs,
            outputs=conversion_outputs
        )