Spaces:
Running
Running
David
commited on
Commit
·
b499397
1
Parent(s):
08073f0
Setup DescribePDF for Hugging Face Space
Browse files- .gitattributes +2 -35
- .github/workflows/jekyll-gh-pages.yml +51 -0
- .gitignore +5 -0
- CITATION.cff +8 -0
- CONTRIBUTING.md +138 -0
- LICENSE +21 -0
- README.md +14 -7
- app.py +13 -0
- describepdf/__init__.py +11 -0
- describepdf/cli.py +252 -0
- describepdf/config.py +215 -0
- describepdf/core.py +328 -0
- describepdf/markitdown_processor.py +81 -0
- describepdf/ollama_client.py +172 -0
- describepdf/openrouter_client.py +208 -0
- describepdf/pdf_processor.py +185 -0
- describepdf/summarizer.py +127 -0
- describepdf/ui.py +291 -0
- describepdf/ui_ollama.py +269 -0
- main.py +90 -0
- prompts/summary_prompt.md +6 -0
- prompts/vlm_prompt_base.md +17 -0
- prompts/vlm_prompt_full.md +26 -0
- prompts/vlm_prompt_with_markdown.md +10 -0
- prompts/vlm_prompt_with_summary.md +20 -0
- pytest.ini +17 -0
- requirements.txt +8 -0
- setup.py +23 -0
.gitattributes
CHANGED
@@ -1,35 +1,2 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
1 |
+
# Auto detect text files and perform LF normalization
|
2 |
+
* text=auto
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.github/workflows/jekyll-gh-pages.yml
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Sample workflow for building and deploying a Jekyll site to GitHub Pages
|
2 |
+
name: Deploy Jekyll with GitHub Pages dependencies preinstalled
|
3 |
+
|
4 |
+
on:
|
5 |
+
# Runs on pushes targeting the default branch
|
6 |
+
push:
|
7 |
+
branches: ["main"]
|
8 |
+
|
9 |
+
# Allows you to run this workflow manually from the Actions tab
|
10 |
+
workflow_dispatch:
|
11 |
+
|
12 |
+
# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
|
13 |
+
permissions:
|
14 |
+
contents: read
|
15 |
+
pages: write
|
16 |
+
id-token: write
|
17 |
+
|
18 |
+
# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
|
19 |
+
# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
|
20 |
+
concurrency:
|
21 |
+
group: "pages"
|
22 |
+
cancel-in-progress: false
|
23 |
+
|
24 |
+
jobs:
|
25 |
+
# Build job
|
26 |
+
build:
|
27 |
+
runs-on: ubuntu-latest
|
28 |
+
steps:
|
29 |
+
- name: Checkout
|
30 |
+
uses: actions/checkout@v4
|
31 |
+
- name: Setup Pages
|
32 |
+
uses: actions/configure-pages@v5
|
33 |
+
- name: Build with Jekyll
|
34 |
+
uses: actions/jekyll-build-pages@v1
|
35 |
+
with:
|
36 |
+
source: ./
|
37 |
+
destination: ./_site
|
38 |
+
- name: Upload artifact
|
39 |
+
uses: actions/upload-pages-artifact@v3
|
40 |
+
|
41 |
+
# Deployment job
|
42 |
+
deploy:
|
43 |
+
environment:
|
44 |
+
name: github-pages
|
45 |
+
url: ${{ steps.deployment.outputs.page_url }}
|
46 |
+
runs-on: ubuntu-latest
|
47 |
+
needs: build
|
48 |
+
steps:
|
49 |
+
- name: Deploy to GitHub Pages
|
50 |
+
id: deployment
|
51 |
+
uses: actions/deploy-pages@v4
|
.gitignore
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.env
|
2 |
+
.DS_Store
|
3 |
+
/describepdf/__pycache__
|
4 |
+
/describepdf.egg-info
|
5 |
+
/tests/__pycache__
|
CITATION.cff
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
cff-version: 1.2.0
|
2 |
+
message: "If you use this software, please cite it as below."
|
3 |
+
authors:
|
4 |
+
- family-names: "Romero Santos"
|
5 |
+
given-names: "David"
|
6 |
+
title: "DescribePDF: A tool to convert visual PDF files to detailed descriptions"
|
7 |
+
date-released: 2025-04-26
|
8 |
+
url: "https://github.com/DavidLMS/DescribePDF"
|
CONTRIBUTING.md
ADDED
@@ -0,0 +1,138 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!-- omit in toc -->
|
2 |
+
# Contributing to DescribePDF
|
3 |
+
|
4 |
+
First off, thanks for taking the time to contribute! Your help is greatly appreciated.
|
5 |
+
|
6 |
+
All types of contributions are encouraged and valued, whether it's code, documentation, suggestions for new features, or bug reports. Please read through the following guidelines before contributing to ensure a smooth process for everyone involved.
|
7 |
+
|
8 |
+
> And if you like the project, but just don't have time to contribute, that's fine. There are other easy ways to support the project and show your appreciation, which we would also be very happy about:
|
9 |
+
> - Star the project
|
10 |
+
> - Tweet about it
|
11 |
+
> - Refer this project in your project's README
|
12 |
+
> - Mention the project at local meetups and tell your friends/colleagues
|
13 |
+
|
14 |
+
<!-- omit in toc -->
|
15 |
+
## Table of Contents
|
16 |
+
|
17 |
+
- [I Have a Question](#i-have-a-question)
|
18 |
+
- [I Want To Contribute](#i-want-to-contribute)
|
19 |
+
- [Reporting Bugs](#reporting-bugs)
|
20 |
+
- [Suggesting Enhancements](#suggesting-enhancements)
|
21 |
+
- [Your First Code Contribution](#your-first-code-contribution)
|
22 |
+
- [Improving The Documentation](#improving-the-documentation)
|
23 |
+
- [Styleguides](#styleguides)
|
24 |
+
- [Commit Messages](#commit-messages)
|
25 |
+
|
26 |
+
## I Have a Question
|
27 |
+
|
28 |
+
If you want to ask a question, we assume that you have read the available [Documentation](https://github.com/DavidLMS/DescribePDF/blob/main/README.md).
|
29 |
+
|
30 |
+
Before you ask a question, it is best to search for existing [Issues](https://github.com/DavidLMS/DescribePDF/issues) that might help you. If you find a relevant issue but still need clarification, feel free to comment on it. Additionally, it’s a good idea to search the web for answers before asking.
|
31 |
+
|
32 |
+
If you still need to ask a question, we recommend the following:
|
33 |
+
|
34 |
+
- Open an [Issue](https://github.com/DavidLMS/DescribePDF/issues/new).
|
35 |
+
- Provide as much context as you can about what you're running into.
|
36 |
+
- Provide project and platform versions (Python, OS, etc.), depending on what seems relevant.
|
37 |
+
|
38 |
+
We (or someone in the community) will then take care of the issue as soon as possible.
|
39 |
+
|
40 |
+
## I Want To Contribute
|
41 |
+
|
42 |
+
> ### Legal Notice <!-- omit in toc -->
|
43 |
+
> When contributing to this project, you must agree that you have authored 100% of the content, that you have the necessary rights to the content, and that the content you contribute may be provided under the project license.
|
44 |
+
|
45 |
+
### Reporting Bugs
|
46 |
+
|
47 |
+
#### Before Submitting a Bug Report
|
48 |
+
|
49 |
+
A good bug report shouldn't leave others needing to chase you up for more information. Please investigate carefully, collect information, and describe the issue in detail in your report. Follow these steps to help us fix any potential bugs as quickly as possible:
|
50 |
+
|
51 |
+
- Ensure you are using the latest version.
|
52 |
+
- Verify that your issue is not due to misconfiguration or environmental issues. Make sure you have read the [documentation](https://github.com/DavidLMS/DescribePDF/blob/main/README.md).
|
53 |
+
- Check if the issue has already been reported by searching the [bug tracker](https://github.com/DavidLMS/DescribePDF/issues?q=label%3Abug).
|
54 |
+
- Gather as much information as possible about the bug:
|
55 |
+
- Stack trace (if applicable)
|
56 |
+
- OS, platform, and version (Windows, Linux, macOS, etc.)
|
57 |
+
- Python version and any relevant package versions
|
58 |
+
- Steps to reliably reproduce the issue
|
59 |
+
|
60 |
+
#### How Do I Submit a Good Bug Report?
|
61 |
+
|
62 |
+
> Do not report security-related issues, vulnerabilities, or bugs with sensitive information in public forums. Instead, report these issues privately by emailing hola_at_davidlms.com.
|
63 |
+
|
64 |
+
We use GitHub issues to track bugs and errors. If you run into an issue with the project:
|
65 |
+
|
66 |
+
- Open an [Issue](https://github.com/DavidLMS/DescribePDF/issues/new). (Since we can't be sure yet if it’s a bug, avoid labeling it as such until confirmed.)
|
67 |
+
- Explain the behavior you expected and what actually happened.
|
68 |
+
- Provide as much context as possible and describe the steps someone else can follow to recreate the issue. This usually includes a code snippet or an example project.
|
69 |
+
|
70 |
+
Once it's filed:
|
71 |
+
|
72 |
+
- The project team will label the issue accordingly.
|
73 |
+
- A team member will try to reproduce the issue. If the issue cannot be reproduced, the team will ask for more information and label the issue as `needs-repro`.
|
74 |
+
- If the issue is reproducible, it will be labeled `needs-fix` and potentially other relevant tags.
|
75 |
+
|
76 |
+
### Suggesting Enhancements
|
77 |
+
|
78 |
+
This section guides you through submitting an enhancement suggestion for DescribePDF, whether it's a new feature or an improvement to existing functionality.
|
79 |
+
|
80 |
+
#### Before Submitting an Enhancement
|
81 |
+
|
82 |
+
- Ensure you are using the latest version.
|
83 |
+
- Check the [documentation](https://github.com/DavidLMS/DescribePDF/blob/main/README.md) to see if your suggestion is already supported.
|
84 |
+
- Search the [issue tracker](https://github.com/DavidLMS/DescribePDF/issues) to see if the enhancement has already been suggested. If so, add a comment to the existing issue instead of opening a new one.
|
85 |
+
- Make sure your suggestion aligns with the scope and aims of the project. It's important to suggest features that will be beneficial to the majority of users.
|
86 |
+
|
87 |
+
#### How Do I Submit a Good Enhancement Suggestion?
|
88 |
+
|
89 |
+
Enhancement suggestions are tracked as [GitHub issues](https://github.com/DavidLMS/DescribePDF/issues).
|
90 |
+
|
91 |
+
- Use a **clear and descriptive title** for the suggestion.
|
92 |
+
- Provide a **detailed description** of the enhancement, including any relevant context.
|
93 |
+
- **Describe the current behavior** and **explain what you would expect instead**, along with reasons why the enhancement would be beneficial.
|
94 |
+
- Include **screenshots or diagrams** if applicable to help illustrate the suggestion.
|
95 |
+
- Explain why this enhancement would be useful to most `DescribePDF` users.
|
96 |
+
|
97 |
+
### Your First Code Contribution
|
98 |
+
|
99 |
+
#### Pre-requisites
|
100 |
+
|
101 |
+
You should first [fork](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo) the `DescribePDF` repository and then clone your forked repository:
|
102 |
+
|
103 |
+
```bash
|
104 |
+
git clone https://github.com/<YOUR_GITHUB_USER>/DescribePDF.git
|
105 |
+
````
|
106 |
+
|
107 |
+
Once in the cloned repository directory, create a new branch for your contribution:
|
108 |
+
|
109 |
+
```bash
|
110 |
+
git checkout -B <feature-description>
|
111 |
+
````
|
112 |
+
|
113 |
+
### Contributing Workflow
|
114 |
+
|
115 |
+
1. Make sure your code follows the style guide and passes linting with `pylint`.
|
116 |
+
2. Write tests for any new functionality you add.
|
117 |
+
3. Ensure all tests pass before submitting a pull request.
|
118 |
+
4. Document any changes to APIs or core functionality.
|
119 |
+
5. Submit your pull request, providing a clear and descriptive title and description of your changes.
|
120 |
+
|
121 |
+
### Improving The Documentation
|
122 |
+
|
123 |
+
Contributions to documentation are welcome! Well-documented code is easier to understand and maintain. If you see areas where documentation can be improved, feel free to submit your suggestions.
|
124 |
+
|
125 |
+
## Styleguides
|
126 |
+
|
127 |
+
### Commit Messages
|
128 |
+
|
129 |
+
- Use clear and descriptive commit messages.
|
130 |
+
- Follow the general format: `Short summary (50 characters or less)` followed by an optional detailed explanation.
|
131 |
+
|
132 |
+
### Code Style
|
133 |
+
|
134 |
+
- Ensure your code adheres to the project's coding standards and passes all linting checks with `pylint`.
|
135 |
+
|
136 |
+
## License
|
137 |
+
|
138 |
+
By contributing to DescribePDF, you agree that your contributions will be licensed under the MIT License.
|
LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2025 David Romero
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
README.md
CHANGED
@@ -1,14 +1,21 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: gradio
|
7 |
-
sdk_version: 5.
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
11 |
-
short_description: A tool to convert PDF files to detailed Markdown description
|
12 |
---
|
13 |
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
title: DescribePDF
|
3 |
+
emoji: 📄
|
4 |
+
colorFrom: red
|
5 |
+
colorTo: pink
|
6 |
sdk: gradio
|
7 |
+
sdk_version: 5.20.1
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: mit
|
|
|
11 |
---
|
12 |
|
13 |
+
# DescribePDF
|
14 |
+
|
15 |
+
DescribePDF is an open-source tool designed to convert PDF files into detailed page-by-page descriptions in Markdown format using Vision-Language Models (VLMs).
|
16 |
+
|
17 |
+
This Space demonstrates the OpenRouter web interface of DescribePDF. You'll need an OpenRouter API key to use it. For the full version including local Ollama support, please install the package locally.
|
18 |
+
|
19 |
+

|
20 |
+
|
21 |
+
[GitHub Repository](https://github.com/DavidLMS/DescribePDF)
|
app.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Entry point for DescribePDF Hugging Face Space.
|
3 |
+
This file imports and launches the Gradio UI from the original package.
|
4 |
+
"""
|
5 |
+
|
6 |
+
from describepdf.ui import create_ui, launch_app
|
7 |
+
|
8 |
+
# Create the Gradio interface
|
9 |
+
app = create_ui()
|
10 |
+
|
11 |
+
# This will be used by Gradio when deployed
|
12 |
+
if __name__ == "__main__":
|
13 |
+
app.launch()
|
describepdf/__init__.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
DescribePDF - A tool to convert PDF files to detailed Markdown descriptions using VLMs.
|
3 |
+
|
4 |
+
This package provides functionality to analyze PDF files and generate detailed
|
5 |
+
Markdown descriptions using Vision-Language Models (VLMs) from either OpenRouter
|
6 |
+
or local Ollama instances.
|
7 |
+
"""
|
8 |
+
|
9 |
+
__version__ = "0.1.0"
|
10 |
+
__author__ = "David Romero"
|
11 |
+
__license__ = "MIT"
|
describepdf/cli.py
ADDED
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Command-line interface for DescribePDF.
|
3 |
+
|
4 |
+
This module provides the CLI functionality for converting PDF files to markdown descriptions.
|
5 |
+
"""
|
6 |
+
|
7 |
+
import argparse
|
8 |
+
import os
|
9 |
+
import sys
|
10 |
+
import logging
|
11 |
+
from typing import Dict, Any, Callable, Optional
|
12 |
+
from tqdm import tqdm
|
13 |
+
|
14 |
+
from . import config
|
15 |
+
from . import core
|
16 |
+
from . import ollama_client
|
17 |
+
|
18 |
+
# Get logger from config module
|
19 |
+
logger = logging.getLogger('describepdf')
|
20 |
+
|
21 |
+
def setup_cli_parser() -> argparse.ArgumentParser:
|
22 |
+
"""
|
23 |
+
Set up the command line argument parser.
|
24 |
+
|
25 |
+
Returns:
|
26 |
+
argparse.ArgumentParser: Configured parser for command line arguments
|
27 |
+
"""
|
28 |
+
parser = argparse.ArgumentParser(
|
29 |
+
description="DescribePDF - Convert PDF files to detailed Markdown descriptions",
|
30 |
+
epilog="Example: describepdf input.pdf -o output.md -l Spanish"
|
31 |
+
)
|
32 |
+
|
33 |
+
parser.add_argument(
|
34 |
+
"pdf_file",
|
35 |
+
help="Path to the PDF file to process"
|
36 |
+
)
|
37 |
+
|
38 |
+
parser.add_argument(
|
39 |
+
"-o", "--output",
|
40 |
+
help="Path to the output Markdown file (default: [pdf_name]_description.md)"
|
41 |
+
)
|
42 |
+
|
43 |
+
parser.add_argument(
|
44 |
+
"-k", "--api-key",
|
45 |
+
help="OpenRouter API Key (overrides the one in .env file)"
|
46 |
+
)
|
47 |
+
|
48 |
+
parser.add_argument(
|
49 |
+
"--local",
|
50 |
+
action="store_true",
|
51 |
+
help="Use local Ollama instead of OpenRouter"
|
52 |
+
)
|
53 |
+
|
54 |
+
parser.add_argument(
|
55 |
+
"--endpoint",
|
56 |
+
help="Ollama endpoint URL (default: http://localhost:11434)"
|
57 |
+
)
|
58 |
+
|
59 |
+
parser.add_argument(
|
60 |
+
"-m", "--vlm-model",
|
61 |
+
help="VLM model to use (default: configured in .env)"
|
62 |
+
)
|
63 |
+
|
64 |
+
parser.add_argument(
|
65 |
+
"-l", "--language",
|
66 |
+
help="Output language (default: configured in .env)"
|
67 |
+
)
|
68 |
+
|
69 |
+
parser.add_argument(
|
70 |
+
"--use-markitdown",
|
71 |
+
action="store_true",
|
72 |
+
help="Use Markitdown for enhanced text extraction"
|
73 |
+
)
|
74 |
+
|
75 |
+
parser.add_argument(
|
76 |
+
"--use-summary",
|
77 |
+
action="store_true",
|
78 |
+
help="Generate and use a PDF summary"
|
79 |
+
)
|
80 |
+
|
81 |
+
parser.add_argument(
|
82 |
+
"--summary-model",
|
83 |
+
help="Model to generate the summary (default: configured in .env)"
|
84 |
+
)
|
85 |
+
|
86 |
+
parser.add_argument(
|
87 |
+
"-v", "--verbose",
|
88 |
+
action="store_true",
|
89 |
+
help="Verbose mode (show debug messages)"
|
90 |
+
)
|
91 |
+
|
92 |
+
return parser
|
93 |
+
|
94 |
+
def create_progress_callback() -> Callable[[float, str], None]:
|
95 |
+
"""
|
96 |
+
Create a progress callback function that displays progress with tqdm.
|
97 |
+
|
98 |
+
Returns:
|
99 |
+
Callable[[float, str], None]: Progress callback function
|
100 |
+
"""
|
101 |
+
progress_bar = tqdm(total=100, desc="Processing", unit="%")
|
102 |
+
|
103 |
+
def callback(progress_value: float, status: str) -> None:
|
104 |
+
"""
|
105 |
+
Display progress in the command line.
|
106 |
+
|
107 |
+
Args:
|
108 |
+
progress_value (float): Progress value between 0.0 and 1.0
|
109 |
+
status (str): Current status message
|
110 |
+
"""
|
111 |
+
nonlocal progress_bar
|
112 |
+
|
113 |
+
current_progress = int(progress_value * 100)
|
114 |
+
last_progress = progress_bar.n
|
115 |
+
progress_diff = current_progress - last_progress
|
116 |
+
|
117 |
+
if progress_diff > 0:
|
118 |
+
progress_bar.update(progress_diff)
|
119 |
+
|
120 |
+
progress_bar.set_description(status)
|
121 |
+
|
122 |
+
if progress_value >= 1.0:
|
123 |
+
progress_bar.close()
|
124 |
+
|
125 |
+
return callback
|
126 |
+
|
127 |
+
def run_cli() -> None:
|
128 |
+
"""
|
129 |
+
Main function for the command line interface.
|
130 |
+
|
131 |
+
This function parses arguments, configures the application based on
|
132 |
+
provided parameters, and runs the PDF to Markdown conversion.
|
133 |
+
"""
|
134 |
+
# Parse command line arguments
|
135 |
+
parser = setup_cli_parser()
|
136 |
+
args = parser.parse_args()
|
137 |
+
|
138 |
+
# Configure logging based on verbosity
|
139 |
+
if args.verbose:
|
140 |
+
logger.setLevel(logging.DEBUG)
|
141 |
+
|
142 |
+
# Validate input file exists
|
143 |
+
if not os.path.exists(args.pdf_file) or not os.path.isfile(args.pdf_file):
|
144 |
+
logger.error(f"The PDF file '{args.pdf_file}' does not exist or is not a valid file.")
|
145 |
+
logger.info("Exiting with error code 1")
|
146 |
+
sys.exit(1)
|
147 |
+
|
148 |
+
# Load configuration from environment
|
149 |
+
env_config = config.get_config()
|
150 |
+
|
151 |
+
# Determine provider
|
152 |
+
provider = "ollama" if args.local else "openrouter"
|
153 |
+
|
154 |
+
# Prepare run configuration by merging environment config and CLI args
|
155 |
+
run_config: Dict[str, Any] = {
|
156 |
+
"provider": provider,
|
157 |
+
"output_language": args.language if args.language else env_config.get("output_language"),
|
158 |
+
"use_markitdown": args.use_markitdown if args.use_markitdown is not None else env_config.get("use_markitdown"),
|
159 |
+
"use_summary": args.use_summary if args.use_summary is not None else env_config.get("use_summary"),
|
160 |
+
}
|
161 |
+
|
162 |
+
# Configure provider-specific settings
|
163 |
+
vlm_model: Optional[str] = args.vlm_model
|
164 |
+
summary_model: Optional[str] = args.summary_model
|
165 |
+
|
166 |
+
if provider == "openrouter":
|
167 |
+
run_config["openrouter_api_key"] = args.api_key if args.api_key else env_config.get("openrouter_api_key")
|
168 |
+
|
169 |
+
if not vlm_model:
|
170 |
+
vlm_model = env_config.get("or_vlm_model")
|
171 |
+
|
172 |
+
if not summary_model and run_config["use_summary"]:
|
173 |
+
summary_model = env_config.get("or_summary_model")
|
174 |
+
|
175 |
+
if not run_config.get("openrouter_api_key"):
|
176 |
+
logger.error("An OpenRouter API key is required. Provide one with --api-key or configure it in the .env file")
|
177 |
+
logger.info("Exiting with error code 1")
|
178 |
+
sys.exit(1)
|
179 |
+
|
180 |
+
elif provider == "ollama":
|
181 |
+
run_config["ollama_endpoint"] = args.endpoint if args.endpoint else env_config.get("ollama_endpoint")
|
182 |
+
|
183 |
+
if not vlm_model:
|
184 |
+
vlm_model = env_config.get("ollama_vlm_model")
|
185 |
+
|
186 |
+
if not summary_model and run_config["use_summary"]:
|
187 |
+
summary_model = env_config.get("ollama_summary_model")
|
188 |
+
|
189 |
+
if not ollama_client.OLLAMA_AVAILABLE:
|
190 |
+
logger.error("Ollama Python client not installed. Install with 'pip install ollama'")
|
191 |
+
logger.info("Exiting with error code 1")
|
192 |
+
sys.exit(1)
|
193 |
+
|
194 |
+
if not ollama_client.check_ollama_availability(run_config["ollama_endpoint"]):
|
195 |
+
logger.error(f"Could not connect to Ollama at {run_config['ollama_endpoint']}. Make sure it is running.")
|
196 |
+
logger.info("Exiting with error code 1")
|
197 |
+
sys.exit(1)
|
198 |
+
|
199 |
+
run_config["vlm_model"] = vlm_model
|
200 |
+
if run_config["use_summary"]:
|
201 |
+
run_config["summary_llm_model"] = summary_model
|
202 |
+
|
203 |
+
# Print configuration summary
|
204 |
+
logger.info(f"Processing PDF: {os.path.basename(args.pdf_file)}")
|
205 |
+
logger.info(f"Provider: {run_config['provider']}")
|
206 |
+
|
207 |
+
if run_config['provider'] == 'openrouter':
|
208 |
+
if run_config.get('openrouter_api_key'):
|
209 |
+
masked_key = '*' * 8 + run_config['openrouter_api_key'][-5:] if len(run_config['openrouter_api_key']) > 5 else '*****'
|
210 |
+
logger.info(f"OpenRouter API Key: {masked_key}")
|
211 |
+
else:
|
212 |
+
logger.info("OpenRouter API Key: Not provided")
|
213 |
+
else:
|
214 |
+
logger.info(f"Ollama Endpoint: {run_config['ollama_endpoint']}")
|
215 |
+
|
216 |
+
logger.info(f"VLM Model: {run_config['vlm_model']}")
|
217 |
+
logger.info(f"Language: {run_config['output_language']}")
|
218 |
+
logger.info(f"Markitdown: {'Yes' if run_config['use_markitdown'] else 'No'}")
|
219 |
+
logger.info(f"Summary: {'Yes' if run_config['use_summary'] else 'No'}")
|
220 |
+
if run_config.get('use_summary') and run_config.get('summary_llm_model'):
|
221 |
+
logger.info(f"Summary model: {run_config['summary_llm_model']}")
|
222 |
+
|
223 |
+
# Create progress callback
|
224 |
+
progress_callback = create_progress_callback()
|
225 |
+
|
226 |
+
# Run conversion
|
227 |
+
status, markdown_result = core.convert_pdf_to_markdown(
|
228 |
+
args.pdf_file,
|
229 |
+
run_config,
|
230 |
+
progress_callback
|
231 |
+
)
|
232 |
+
|
233 |
+
if not markdown_result:
|
234 |
+
logger.error(f"Error: {status}")
|
235 |
+
logger.info("Exiting with error code 1")
|
236 |
+
sys.exit(1)
|
237 |
+
|
238 |
+
# Determine output filename
|
239 |
+
output_filename = args.output
|
240 |
+
if not output_filename:
|
241 |
+
base_name = os.path.splitext(os.path.basename(args.pdf_file))[0]
|
242 |
+
output_filename = f"{base_name}_description.md"
|
243 |
+
|
244 |
+
# Save output file
|
245 |
+
try:
|
246 |
+
with open(output_filename, "w", encoding="utf-8") as md_file:
|
247 |
+
md_file.write(markdown_result)
|
248 |
+
logger.info(f"Conversion completed. Result saved to: {output_filename}")
|
249 |
+
except Exception as e:
|
250 |
+
logger.error(f"Error saving output file: {e}")
|
251 |
+
logger.info("Exiting with error code 1")
|
252 |
+
sys.exit(1)
|
describepdf/config.py
ADDED
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Configuration module for DescribePDF.
|
3 |
+
|
4 |
+
This module manages loading configuration from environment variables
|
5 |
+
and prompt templates from files.
|
6 |
+
"""
|
7 |
+
import os
|
8 |
+
import logging
|
9 |
+
from typing import Dict, Any, Optional
|
10 |
+
from dotenv import load_dotenv
|
11 |
+
import pathlib
|
12 |
+
|
13 |
+
# Setup central logging configuration
|
14 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(module)s] - %(message)s')
|
15 |
+
logger = logging.getLogger('describepdf')
|
16 |
+
|
17 |
+
# Directory containing prompt templates (making path absolute by using current file location)
|
18 |
+
SCRIPT_DIR = pathlib.Path(__file__).parent.parent.absolute()
|
19 |
+
PROMPTS_DIR = pathlib.Path(SCRIPT_DIR) / "prompts"
|
20 |
+
|
21 |
+
# Default configuration values
|
22 |
+
DEFAULT_CONFIG: Dict[str, Any] = {
|
23 |
+
"openrouter_api_key": None,
|
24 |
+
"or_vlm_model": "qwen/qwen2.5-vl-72b-instruct",
|
25 |
+
"or_summary_model": "google/gemini-2.5-flash-preview",
|
26 |
+
|
27 |
+
"ollama_endpoint": "http://localhost:11434",
|
28 |
+
"ollama_vlm_model": "llama3.2-vision",
|
29 |
+
"ollama_summary_model": "qwen2.5",
|
30 |
+
|
31 |
+
"output_language": "English",
|
32 |
+
"use_markitdown": False,
|
33 |
+
"use_summary": False
|
34 |
+
}
|
35 |
+
|
36 |
+
# Mapping of prompt template identifiers to their file names
|
37 |
+
PROMPT_FILES: Dict[str, str] = {
|
38 |
+
"summary": "summary_prompt.md",
|
39 |
+
"vlm_base": "vlm_prompt_base.md",
|
40 |
+
"vlm_markdown": "vlm_prompt_with_markdown.md",
|
41 |
+
"vlm_summary": "vlm_prompt_with_summary.md",
|
42 |
+
"vlm_full": "vlm_prompt_full.md"
|
43 |
+
}
|
44 |
+
|
45 |
+
# Cache for loaded configuration
|
46 |
+
_CONFIG_CACHE: Optional[Dict[str, Any]] = None
|
47 |
+
|
48 |
+
# Cache for loaded prompts
|
49 |
+
_PROMPTS_CACHE: Optional[Dict[str, str]] = None
|
50 |
+
|
51 |
+
def load_env_config() -> Dict[str, Any]:
    """
    Load configuration from environment variables (.env file).

    Reads configuration values from the environment (after loading a .env
    file if present), falling back to DEFAULT_CONFIG values when a variable
    is unset or empty.

    Returns:
        Dict[str, Any]: Dictionary with the effective configuration
    """
    load_dotenv()

    # Start with the default config and layer env overrides on top.
    loaded_config = DEFAULT_CONFIG.copy()

    # Environment variable -> config key, for plain string overrides.
    # An unset or empty variable keeps the default (truthy check, matching
    # the original per-variable `if os.getenv(...)` behavior).
    string_overrides = {
        "OPENROUTER_API_KEY": "openrouter_api_key",
        "DEFAULT_OR_VLM_MODEL": "or_vlm_model",
        "DEFAULT_OR_SUMMARY_MODEL": "or_summary_model",
        "OLLAMA_ENDPOINT": "ollama_endpoint",
        "DEFAULT_OLLAMA_VLM_MODEL": "ollama_vlm_model",
        "DEFAULT_OLLAMA_SUMMARY_MODEL": "ollama_summary_model",
        "DEFAULT_LANGUAGE": "output_language",
    }
    for env_var, config_key in string_overrides.items():
        value = os.getenv(env_var)
        if value:
            loaded_config[config_key] = value

    # Boolean flags: only the literal string 'true' (case-insensitive)
    # enables the flag; any other non-empty value disables it.
    bool_overrides = {
        "DEFAULT_USE_MARKITDOWN": "use_markitdown",
        "DEFAULT_USE_SUMMARY": "use_summary",
    }
    for env_var, config_key in bool_overrides.items():
        value = os.getenv(env_var)
        if value:
            loaded_config[config_key] = value.lower() == 'true'

    logger.info("Configuration loaded from environment variables.")

    # Log configuration without leaking the full API key.
    log_config = loaded_config.copy()
    api_key = log_config.get("openrouter_api_key")
    if api_key:
        log_config["openrouter_api_key"] = f"***{api_key[-5:]}" if len(api_key) > 5 else "*****"
    logger.debug(f"Effective configuration: {log_config}")

    return loaded_config
|
103 |
+
|
104 |
+
def load_prompt_templates() -> Dict[str, str]:
    """
    Load prompt templates from the prompts directory.

    Reads every file listed in PROMPT_FILES from PROMPTS_DIR and maps its
    contents to the corresponding template key. Missing or unreadable files
    are logged and skipped; the remaining templates are still returned.

    Returns:
        Dict[str, str]: Dictionary with loaded prompt templates
    """
    templates: Dict[str, str] = {}

    if not PROMPTS_DIR.is_dir():
        logger.error(f"Prompts directory '{PROMPTS_DIR}' not found.")
        return templates

    for key, filename in PROMPT_FILES.items():
        prompt_path = PROMPTS_DIR / filename
        try:
            templates[key] = prompt_path.read_text(encoding='utf-8')
        except FileNotFoundError:
            logger.error(f"Prompt file not found: {prompt_path}")
        except Exception as e:
            logger.error(f"Error reading prompt file {prompt_path}: {e}")

    logger.info(f"Loaded {len(templates)} prompt templates.")
    return templates
|
132 |
+
|
133 |
+
def get_config() -> Dict[str, Any]:
    """
    Get the configuration from .env.

    The configuration is loaded once and memoized; later calls return the
    same cached dictionary so all callers share a consistent view.

    Returns:
        Dict[str, Any]: Current configuration dictionary
    """
    global _CONFIG_CACHE
    if _CONFIG_CACHE is None:
        _CONFIG_CACHE = load_env_config()
    return _CONFIG_CACHE
|
149 |
+
|
150 |
+
def reload_config() -> Dict[str, Any]:
    """
    Force reload of configuration from .env.

    Discards any cached configuration and re-reads the environment; useful
    when settings must be refreshed at runtime.

    Returns:
        Dict[str, Any]: Updated configuration dictionary
    """
    global _CONFIG_CACHE
    _CONFIG_CACHE = load_env_config()
    return _CONFIG_CACHE
|
162 |
+
|
163 |
+
def get_prompts() -> Dict[str, str]:
    """
    Get the prompt templates.

    Templates are read from disk on the first call and memoized; later
    calls return the cached dictionary.

    Returns:
        Dict[str, str]: Dictionary with loaded prompt templates
    """
    global _PROMPTS_CACHE
    if _PROMPTS_CACHE is None:
        _PROMPTS_CACHE = load_prompt_templates()
    return _PROMPTS_CACHE
|
179 |
+
|
180 |
+
def get_required_prompts_for_config(cfg: Dict[str, Any]) -> Dict[str, str]:
    """
    Get only the prompt templates required for the given configuration.

    Determines which prompt templates are necessary based on the provided
    configuration and returns only those templates. Fails closed: returns
    an empty dict (and logs an error) when any required template is missing.

    Args:
        cfg (Dict[str, Any]): Configuration dictionary

    Returns:
        Dict[str, str]: Dictionary with required prompt templates
    """
    prompts = get_prompts()

    # Fix: the original annotated this as List[str], but 'List' is not
    # imported in this module; use the builtin generic instead.
    required_keys: list[str] = ["vlm_base"]

    has_markdown = cfg.get("use_markitdown", False)
    has_summary = cfg.get("use_summary", False)

    # The VLM prompt variant depends on which context sources are enabled.
    if has_markdown and has_summary:
        required_keys.append("vlm_full")
    elif has_markdown:
        required_keys.append("vlm_markdown")
    elif has_summary:
        required_keys.append("vlm_summary")

    # The summary prompt itself is needed whenever summaries are enabled.
    if has_summary:
        required_keys.append("summary")

    # Check if all required prompts are available.
    missing = [key for key in required_keys if key not in prompts]
    if missing:
        logger.error(f"Missing required prompt templates: {', '.join(missing)}")
        return {}

    return {key: prompts[key] for key in required_keys if key in prompts}
|
describepdf/core.py
ADDED
@@ -0,0 +1,328 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Core module for DescribePDF.
|
3 |
+
|
4 |
+
This module contains the main orchestration logic for converting PDFs to Markdown descriptions.
|
5 |
+
"""
|
6 |
+
|
7 |
+
import os
|
8 |
+
import time
|
9 |
+
from typing import Dict, Any, Callable, Tuple, List, Optional
|
10 |
+
import contextlib
|
11 |
+
import logging
|
12 |
+
|
13 |
+
from . import config
|
14 |
+
from . import pdf_processor
|
15 |
+
from . import markitdown_processor
|
16 |
+
from . import summarizer
|
17 |
+
from . import openrouter_client
|
18 |
+
from . import ollama_client
|
19 |
+
|
20 |
+
# Get logger from config module
|
21 |
+
logger = logging.getLogger('describepdf')
|
22 |
+
|
23 |
+
class ConversionError(Exception):
    """Raised when the PDF conversion pipeline hits an unrecoverable error."""
|
26 |
+
|
27 |
+
def format_markdown_output(descriptions: List[str], original_filename: str) -> str:
    """
    Combine page descriptions into a single Markdown file.

    Produces a document with a title header followed by one "## Page N"
    section per description, each terminated by a horizontal rule. Empty
    descriptions are replaced by a placeholder note.

    Args:
        descriptions: List of strings, each being a description of a page
        original_filename: Name of the original PDF file

    Returns:
        str: Complete Markdown content
    """
    parts = [f"# Description of PDF: {original_filename}\n\n"]
    for page_number, text in enumerate(descriptions, start=1):
        body = text if text else "*No description generated for this page.*"
        parts.append(f"## Page {page_number}\n\n{body}\n\n---\n\n")
    return "".join(parts)
|
44 |
+
|
45 |
+
def convert_pdf_to_markdown(
    pdf_path: str,
    cfg: Dict[str, Any],
    progress_callback: Callable[[float, str], None]
) -> Tuple[str, Optional[str]]:
    """
    Orchestrate the complete PDF to descriptive Markdown conversion process.

    Pipeline: validate the provider and input file, load the prompt
    templates required by `cfg`, optionally generate a whole-document
    summary, then render each page to an image, optionally extract a
    Markitdown text context, and ask the configured VLM for a page
    description. Per-page failures are recorded inline in the output;
    API-level failures abort the whole run via ConversionError.

    Args:
        pdf_path: Path to the PDF file
        cfg: Configuration dictionary for this run. Keys read here include
            "provider", "openrouter_api_key", "ollama_endpoint",
            "use_summary", "summary_llm_model", "use_markitdown",
            "vlm_model" and "output_language". May be mutated: "use_summary"
            is forced to False when summary generation fails.
        progress_callback: Function accepting (float_progress, string_status)

    Returns:
        tuple: (status_message, result_markdown or None on failure)
    """
    start_time = time.time()
    progress_callback(0.0, "Starting conversion process...")
    logger.info("Starting conversion process...")

    # Validate provider: only "openrouter" and "ollama" are supported.
    provider = cfg.get("provider", "openrouter").lower()
    logger.info(f"Using provider: {provider}")

    if provider == "openrouter":
        api_key = cfg.get("openrouter_api_key")
        if not api_key:
            msg = "Error: OpenRouter API Key is missing."
            logger.error(msg)
            progress_callback(0.0, msg)
            return msg, None
    elif provider == "ollama":
        ollama_endpoint = cfg.get("ollama_endpoint", "http://localhost:11434")
        if not ollama_client.OLLAMA_AVAILABLE:
            msg = "Error: Ollama Python client not installed. Install with 'pip install ollama'."
            logger.error(msg)
            progress_callback(0.0, msg)
            return msg, None

        # Fast connectivity probe before doing any heavy work.
        if not ollama_client.check_ollama_availability(ollama_endpoint):
            msg = f"Error: Could not connect to Ollama at {ollama_endpoint}. Make sure it is running."
            logger.error(msg)
            progress_callback(0.0, msg)
            return msg, None
    else:
        msg = f"Error: Unknown provider '{provider}'. Use 'openrouter' or 'ollama'."
        logger.error(msg)
        progress_callback(0.0, msg)
        return msg, None

    # Validate input file exists and is a regular file.
    if not pdf_path or not os.path.exists(pdf_path) or not os.path.isfile(pdf_path):
        msg = "Error: Invalid or missing PDF file."
        logger.error(msg)
        progress_callback(0.0, msg)
        return msg, None

    original_filename = os.path.basename(pdf_path)
    logger.info(f"Processing file: {original_filename}")

    pdf_doc = None

    try:
        # Load only the prompt templates this configuration needs; an empty
        # result means at least one required template file is missing.
        required_prompts = config.get_required_prompts_for_config(cfg)
        if not required_prompts:
            msg = "Error: Could not load all required prompt templates. Check the 'prompts' directory."
            progress_callback(0.0, msg)
            logger.error(msg)
            return msg, None

        # Generate a whole-document summary first if requested. Summary
        # failure is non-fatal: the run continues with use_summary disabled.
        pdf_summary = None
        summary_progress = 0.05
        if cfg.get("use_summary"):
            summary_model = cfg.get("summary_llm_model")
            progress_callback(summary_progress, f"Generating summary using {summary_model}...")
            try:
                pdf_summary = summarizer.generate_summary(
                    pdf_path,
                    provider=provider,
                    api_key=cfg.get("openrouter_api_key"),
                    ollama_endpoint=cfg.get("ollama_endpoint"),
                    model=summary_model
                )

                if pdf_summary:
                    progress_callback(summary_progress, "Summary generated.")
                    logger.info("PDF summary generated.")
                else:
                    progress_callback(summary_progress, "Warning: Could not generate summary (LLM might have returned empty).")
                    logger.warning("Failed to generate PDF summary or summary was empty.")
                    # Set use_summary to False since we don't have a summary.
                    cfg["use_summary"] = False
            except Exception as e:
                error_msg = f"Warning: Summary generation failed: {e}"
                progress_callback(summary_progress, error_msg)
                logger.warning(error_msg)
                # Set use_summary to False since summary generation failed.
                cfg["use_summary"] = False
        else:
            summary_progress = 0.0

        # Load PDF and process pages.
        pdf_load_progress = summary_progress + 0.05
        progress_callback(pdf_load_progress, "Analyzing PDF structure...")

        # ExitStack guarantees the PDF handle and every per-page temp file
        # are cleaned up when this block exits, success or failure.
        with contextlib.ExitStack() as stack:
            pdf_doc, pages, total_pages = pdf_processor.get_pdf_pages(pdf_path)

            # Register PDF document for cleanup only if it was successfully opened.
            if pdf_doc is not None:
                stack.callback(pdf_doc.close)
            else:
                msg = f"Error: Could not process PDF file: {original_filename}"
                progress_callback(pdf_load_progress, msg)
                logger.error(msg)
                return msg, None

            if not pages or total_pages == 0:
                msg = f"Error: PDF file is empty: {original_filename}"
                progress_callback(pdf_load_progress, msg)
                logger.error(msg)
                return msg, None

            progress_callback(pdf_load_progress, f"PDF has {total_pages} pages. Starting page processing...")

            # Process each page; progress advances linearly from the current
            # point to 0.98 across all pages.
            all_descriptions = []
            page_processing_progress_start = pdf_load_progress
            total_page_progress_ratio = (0.98 - page_processing_progress_start) if total_pages > 0 else 0

            for i, page in enumerate(pages):
                page_num = i + 1
                current_page_ratio = (page_num / total_pages) if total_pages > 0 else 1.0

                # Calculate progress for this specific page.
                current_progress = page_processing_progress_start + (current_page_ratio * total_page_progress_ratio)

                progress_callback(current_progress, f"Processing page {page_num}/{total_pages}...")
                logger.info(f"Processing page {page_num}/{total_pages}")

                page_description = None
                temp_page_pdf_path = None

                try:
                    # Render the page to a JPEG image for the VLM.
                    render_progress_message = f"Page {page_num}: Rendering image..."
                    progress_callback(current_progress, render_progress_message)
                    image_bytes, mime_type = pdf_processor.render_page_to_image_bytes(page, image_format="jpeg")
                    if not image_bytes:
                        logger.warning(f"Could not render image for page {page_num}. Skipping VLM call.")
                        all_descriptions.append(f"*Error: Could not render image for page {page_num}.*")
                        continue

                    # Optionally extract a Markdown text context for this page
                    # by saving it as a one-page temp PDF and running Markitdown.
                    markdown_context = None
                    if cfg.get("use_markitdown"):
                        markitdown_progress_message = f"Page {page_num}: Extracting text (Markitdown)..."
                        progress_callback(current_progress, markitdown_progress_message)

                        # Verify Markitdown availability; extraction is optional.
                        if not markitdown_processor.MARKITDOWN_AVAILABLE:
                            logger.warning(f"Markitdown not available for page {page_num}. Proceeding without it.")
                            progress_callback(current_progress, f"Page {page_num}: Markitdown not available, skipping extraction.")
                        else:
                            temp_page_pdf_path = pdf_processor.save_page_as_temp_pdf(pdf_doc, i)

                            if temp_page_pdf_path:
                                # Register temp file for cleanup; default arg binds
                                # the current path (avoids late-binding closure bug).
                                stack.callback(lambda p=temp_page_pdf_path: os.remove(p) if os.path.exists(p) else None)

                                try:
                                    markdown_context = markitdown_processor.get_markdown_for_page_via_temp_pdf(temp_page_pdf_path)
                                    if markdown_context is None:
                                        logger.warning(f"Markitdown failed for page {page_num}. Proceeding without it.")
                                        progress_callback(current_progress, f"Page {page_num}: Markitdown extraction failed.")
                                    else:
                                        logger.info(f"Markitdown context extracted for page {page_num}.")
                                except Exception as markdown_err:
                                    logger.warning(f"Error extracting Markitdown for page {page_num}: {markdown_err}")
                                    progress_callback(current_progress, f"Page {page_num}: Markitdown extraction error.")
                            else:
                                logger.warning(f"Could not create temporary PDF for Markitdown on page {page_num}.")
                                progress_callback(current_progress, f"Page {page_num}: Failed to prepare for Markitdown.")

                    # Select the prompt variant matching the context we actually
                    # have (not just what was requested in cfg).
                    prompt_key = "vlm_base"
                    has_markdown = cfg.get("use_markitdown") and markdown_context is not None
                    has_summary = cfg.get("use_summary") and pdf_summary is not None

                    if has_markdown and has_summary:
                        prompt_key = "vlm_full"
                    elif has_markdown:
                        prompt_key = "vlm_markdown"
                    elif has_summary:
                        prompt_key = "vlm_summary"

                    vlm_prompt_template = required_prompts.get(prompt_key)
                    if not vlm_prompt_template:
                        error_msg = f"Missing required prompt template: {prompt_key}"
                        progress_callback(current_progress, error_msg)
                        logger.error(error_msg)
                        all_descriptions.append(f"*Error: Could not generate description for page {page_num} due to missing prompt template.*")
                        continue

                    # Fill the template's placeholder tokens.
                    prompt_text = vlm_prompt_template.replace("[PAGE_NUM]", str(page_num))
                    prompt_text = prompt_text.replace("[TOTAL_PAGES]", str(total_pages))
                    prompt_text = prompt_text.replace("[LANGUAGE]", cfg.get("output_language", "English"))
                    if "[MARKDOWN_CONTEXT]" in prompt_text:
                        prompt_text = prompt_text.replace("[MARKDOWN_CONTEXT]", markdown_context if markdown_context else "N/A")
                    if "[SUMMARY_CONTEXT]" in prompt_text:
                        prompt_text = prompt_text.replace("[SUMMARY_CONTEXT]", pdf_summary if pdf_summary else "N/A")

                    # Call the VLM through the selected provider.
                    vlm_model = cfg.get("vlm_model")
                    vlm_progress_message = f"Page {page_num}: Calling VLM ({vlm_model})..."
                    progress_callback(current_progress, vlm_progress_message)
                    try:
                        if provider == "openrouter":
                            page_description = openrouter_client.get_vlm_description(
                                cfg.get("openrouter_api_key"), vlm_model, prompt_text, image_bytes, mime_type
                            )
                        elif provider == "ollama":
                            page_description = ollama_client.get_vlm_description(
                                cfg.get("ollama_endpoint"), vlm_model, prompt_text, image_bytes, mime_type
                            )

                        if page_description:
                            logger.info(f"VLM description received for page {page_num}.")
                        else:
                            page_description = f"*Warning: VLM did not return a description for page {page_num}.*"
                            progress_callback(current_progress, f"Page {page_num}: VLM returned no description.")
                            logger.warning(f"VLM returned no description for page {page_num}.")

                    # These exception types indicate a provider/credentials
                    # problem likely to affect every remaining page: abort.
                    except (ValueError, ConnectionError, TimeoutError, ImportError) as api_err:
                        error_msg = f"API Error on page {page_num}: {api_err}. Aborting."
                        progress_callback(current_progress, error_msg)
                        logger.error(error_msg)
                        raise ConversionError(error_msg)

                    # Anything else is treated as page-local: record and continue.
                    except Exception as vlm_err:
                        error_msg = f"Unexpected error during VLM call for page {page_num}: {vlm_err}. Skipping page."
                        progress_callback(current_progress, error_msg)
                        logger.exception(error_msg)
                        page_description = f"*Error: Failed to get VLM description for page {page_num} due to an unexpected error.*"

                    all_descriptions.append(page_description if page_description else "*No description available.*")

                except ConversionError:
                    # Let critical errors propagate up to the outer handler.
                    raise
                except Exception as page_err:
                    error_msg = f"Unexpected error processing page {page_num}: {page_err}. Skipping page."
                    progress_callback(current_progress, error_msg)
                    logger.exception(error_msg)
                    all_descriptions.append(f"*Error: An unexpected error occurred while processing page {page_num}.*")

        # Generate final markdown (the ExitStack has closed the PDF and
        # removed temp files by this point).
        final_progress = 0.99
        progress_callback(final_progress, "Combining page descriptions into final Markdown...")
        final_markdown = format_markdown_output(all_descriptions, original_filename)
        logger.info("Final Markdown content assembled.")

        # Report completion.
        end_time = time.time()
        duration = end_time - start_time
        final_status = f"Conversion completed successfully in {duration:.2f} seconds."
        progress_callback(1.0, final_status)
        logger.info(final_status)

        return final_status, final_markdown

    except ConversionError as critical_err:
        # Already logged and reported via progress_callback where raised.
        return str(critical_err), None

    except Exception as e:
        error_msg = f"Critical Error during conversion: {e}"
        progress_callback(0.0, error_msg)
        logger.exception(error_msg)
        return error_msg, None
|
describepdf/markitdown_processor.py
ADDED
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
MarkItDown processor module for DescribePDF.
|
3 |
+
|
4 |
+
This module handles the enhanced text extraction functionality using the
|
5 |
+
MarkItDown library to convert PDF content to markdown format.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import os
|
9 |
+
import logging
|
10 |
+
from typing import Optional
|
11 |
+
|
12 |
+
logger = logging.getLogger('describepdf')
|
13 |
+
|
14 |
+
# Check if MarkItDown is available
|
15 |
+
try:
|
16 |
+
from markitdown import MarkItDown
|
17 |
+
MARKITDOWN_AVAILABLE = True
|
18 |
+
logger.info("MarkItDown library is available and successfully imported.")
|
19 |
+
except ImportError:
|
20 |
+
logger.warning("MarkItDown library not installed. Install with 'pip install markitdown[pdf]'")
|
21 |
+
MARKITDOWN_AVAILABLE = False
|
22 |
+
except Exception as e:
|
23 |
+
logger.error(f"Failed to initialize MarkItDown: {e}")
|
24 |
+
MARKITDOWN_AVAILABLE = False
|
25 |
+
|
26 |
+
def _get_markdown_converter() -> Optional['MarkItDown']:
    """
    Initialize and return a MarkItDown converter instance.

    Returns:
        MarkItDown: A fresh converter, or None when the library is missing
        or construction fails.
    """
    if not MARKITDOWN_AVAILABLE:
        logger.error("Cannot initialize MarkItDown converter - library not available.")
        return None

    try:
        return MarkItDown()
    except Exception as e:
        logger.error(f"Failed to initialize MarkItDown converter: {e}")
        return None
|
43 |
+
|
44 |
+
def get_markdown_for_page_via_temp_pdf(temp_pdf_path: str) -> Optional[str]:
    """
    Use MarkItDown to extract Markdown from a PDF file (single page).

    Args:
        temp_pdf_path: Path to the temporary single-page PDF file

    Returns:
        str: Extracted Markdown content (empty string when the converter
        yields no result), or None if MarkItDown is unavailable, the file
        is missing, or conversion fails.
    """
    if not MARKITDOWN_AVAILABLE:
        logger.error("MarkItDown converter is not available.")
        return None

    if not os.path.exists(temp_pdf_path):
        logger.error(f"Temporary PDF file not found: {temp_pdf_path}")
        return None

    md_converter = _get_markdown_converter()
    if md_converter is None:
        return None

    try:
        result = md_converter.convert(temp_pdf_path)
    except Exception as e:
        logger.error(f"MarkItDown failed to process {temp_pdf_path}: {e}")
        return None

    logger.debug(f"Extracted Markdown from temporary PDF: {temp_pdf_path}")
    return result.text_content if result else ""
|
73 |
+
|
74 |
+
def is_available() -> bool:
    """
    Check if MarkItDown functionality is available.

    Returns:
        bool: True when the MarkItDown library was imported successfully,
        False otherwise.
    """
    return MARKITDOWN_AVAILABLE
|
describepdf/ollama_client.py
ADDED
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Ollama client module for DescribePDF.
|
3 |
+
|
4 |
+
This module handles all interactions with local Ollama API for
|
5 |
+
VLM (Vision Language Model) image description and LLM text summarization.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import logging
|
9 |
+
import base64
|
10 |
+
import requests
|
11 |
+
from typing import Any, Dict, List
|
12 |
+
|
13 |
+
# Try to import Ollama, but handle gracefully if it's not available
|
14 |
+
try:
|
15 |
+
import ollama
|
16 |
+
from ollama import Client
|
17 |
+
OLLAMA_AVAILABLE = True
|
18 |
+
except ImportError:
|
19 |
+
OLLAMA_AVAILABLE = False
|
20 |
+
logging.warning("Ollama Python client not available. Install with 'pip install ollama'")
|
21 |
+
|
22 |
+
# Get logger
|
23 |
+
logger = logging.getLogger('describepdf')
|
24 |
+
|
25 |
+
def check_ollama_availability(endpoint: str) -> bool:
    """
    Check if Ollama is available at the specified endpoint.

    Args:
        endpoint: URL of the Ollama endpoint

    Returns:
        bool: True if Ollama is available, False otherwise
    """
    if not OLLAMA_AVAILABLE:
        logger.error("Ollama Python client not installed.")
        return False

    # Normalize endpoint URL by removing trailing slashes.
    base_url = endpoint.rstrip('/')
    try:
        # A plain HTTP probe of the version endpoint is cheaper than
        # constructing a full Ollama client.
        response = requests.get(f"{base_url}/api/version", timeout=5)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logger.error(f"Could not connect to Ollama at {base_url}: {e}")
        return False
    except Exception as e:
        logger.error(f"Unexpected error checking Ollama availability: {e}")
        return False

    logger.info(f"Ollama is available at {base_url}. Response status: {response.status_code}")
    return True
|
55 |
+
|
56 |
+
def get_vlm_description(endpoint: str, model: str, prompt_text: str, image_bytes: bytes, mime_type: str) -> str:
    """
    Get a page description using a VLM through Ollama.

    Args:
        endpoint: URL of the Ollama endpoint
        model: Ollama VLM model name
        prompt_text: Text prompt
        image_bytes: Bytes of the page image
        mime_type: MIME type of the image ('image/png' or 'image/jpeg')

    Returns:
        str: Generated description

    Raises:
        ImportError: If Ollama Python client is not installed
        ConnectionError: If communication with Ollama fails
        ValueError: If there's an issue with the request parameters
    """
    if not OLLAMA_AVAILABLE:
        raise ImportError("Ollama Python client not installed. Install with 'pip install ollama'")

    try:
        client: Client = Client(host=endpoint.rstrip('/'))

        # The chat API expects images as base64 strings attached to the message.
        encoded_image = base64.b64encode(image_bytes).decode('utf-8')
        chat_messages: List[Dict[str, Any]] = [{
            'role': 'user',
            'content': prompt_text,
            'images': [encoded_image],
        }]

        logger.info(f"Calling Ollama VLM model: {model}")
        response: Dict[str, Any] = client.chat(model=model, messages=chat_messages)

        # Extract and validate the response payload.
        if response and 'message' in response and 'content' in response['message']:
            logger.info(f"Received VLM description from Ollama (model: {model}).")
            return str(response['message']['content'])

        logger.warning(f"Ollama VLM response structure unexpected: {response}")
        raise ValueError("Ollama returned unexpected response structure or empty content")

    except ollama.ResponseError as e:
        logger.error(f"Ollama API error: {e}")
        raise ConnectionError(f"Ollama API error: {e}")
    except Exception as e:
        logger.error(f"Error getting VLM description from Ollama: {e}")
        raise
|
117 |
+
|
118 |
+
def get_llm_summary(endpoint: str, model: str, prompt_text: str) -> str:
    """
    Get a summary using an LLM through Ollama.

    Args:
        endpoint: URL of the Ollama endpoint
        model: Ollama LLM model for summary
        prompt_text: Prompt including the text to summarize

    Returns:
        str: Generated summary

    Raises:
        ImportError: If Ollama Python client is not installed
        ConnectionError: If communication with Ollama fails
        ValueError: If there's an issue with the request parameters
    """
    if not OLLAMA_AVAILABLE:
        raise ImportError("Ollama Python client not installed. Install with 'pip install ollama'")

    try:
        client: Client = Client(host=endpoint.rstrip('/'))

        # Text-only request: a single user message, no images.
        chat_messages: List[Dict[str, Any]] = [{
            'role': 'user',
            'content': prompt_text,
        }]

        logger.info(f"Calling Ollama LLM model for summary: {model}")
        response: Dict[str, Any] = client.chat(model=model, messages=chat_messages)

        # Extract and validate the response payload.
        if response and 'message' in response and 'content' in response['message']:
            logger.info(f"Received summary from Ollama (model: {model}).")
            return str(response['message']['content'])

        logger.warning(f"Ollama LLM summary response structure unexpected: {response}")
        raise ValueError("Ollama returned unexpected response structure or empty content")

    except ollama.ResponseError as e:
        logger.error(f"Ollama API error: {e}")
        raise ConnectionError(f"Ollama API error: {e}")
    except Exception as e:
        logger.error(f"Error getting LLM summary from Ollama: {e}")
        raise
|
describepdf/openrouter_client.py
ADDED
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
OpenRouter client module for DescribePDF.
|
3 |
+
|
4 |
+
This module handles all interactions with the OpenRouter API for
|
5 |
+
VLM (Vision Language Model) image description and LLM text summarization.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import requests
|
9 |
+
import base64
|
10 |
+
import json
|
11 |
+
import logging
|
12 |
+
from typing import Dict, Any, List
|
13 |
+
|
14 |
+
# Get logger from config module
|
15 |
+
logger = logging.getLogger('describepdf')
|
16 |
+
|
17 |
+
# Constants
|
18 |
+
OPENROUTER_API_URL = "https://openrouter.ai/api/v1/chat/completions"
|
19 |
+
DEFAULT_TIMEOUT = 300 # 5 minutes
|
20 |
+
|
21 |
+
def encode_image_to_base64(image_bytes: bytes, mime_type: str) -> str:
    """
    Build a data-URI string containing the Base64-encoded image.

    Args:
        image_bytes: Raw image bytes
        mime_type: MIME type of the image ('image/png' or 'image/jpeg')

    Returns:
        str: Data URI of the form "data:<mime_type>;base64,<payload>"

    Raises:
        ValueError: If image encoding fails
    """
    try:
        payload = base64.b64encode(image_bytes).decode('utf-8')
        return f"data:{mime_type};base64,{payload}"
    except Exception as e:
        logger.error(f"Error encoding image to Base64: {e}")
        raise ValueError(f"Failed to encode image: {e}")
|
41 |
+
|
42 |
+
def call_openrouter_api(api_key: str, model: str, messages: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Make a call to the OpenRouter Chat Completions API.

    Args:
        api_key: OpenRouter API key
        model: Model name to use
        messages: List of messages in API format

    Returns:
        Dict: The JSON response from the API

    Raises:
        ValueError: If API key is missing
        ConnectionError: If API call fails with error response
        TimeoutError: If API call times out
    """
    if not api_key:
        logger.error("OpenRouter API Key is missing.")
        raise ValueError("OpenRouter API Key is missing.")

    headers: Dict[str, str] = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    payload: Dict[str, Any] = {
        "model": model,
        "messages": messages
    }

    try:
        # Log a truncated snapshot of the request (avoid logging full message
        # content for privacy/size). Serialize once instead of twice.
        serialized = json.dumps(messages)
        msg_log = serialized[:200] + ("..." if len(serialized) > 200 else "")
        logger.debug(f"Calling OpenRouter API. Model: {model}. Messages: {msg_log}")

        # Make API request
        response = requests.post(
            OPENROUTER_API_URL,
            headers=headers,
            json=payload,
            timeout=DEFAULT_TIMEOUT
        )
        response.raise_for_status()

        logger.debug(f"API call successful. Status: {response.status_code}.")
        return response.json()

    except requests.exceptions.Timeout:
        logger.error(f"API call timed out for model {model}.")
        raise TimeoutError(f"API call timed out for model {model}.")

    except requests.exceptions.RequestException as e:
        # BUGFIX: compare the response against None explicitly.
        # requests.Response defines __bool__ as "status is ok", so an error
        # response (4xx/5xx) is falsy and a plain truthiness check would skip
        # the error-detail extraction for exactly the cases it is meant for.
        err_response = getattr(e, 'response', None)
        status_code = err_response.status_code if err_response is not None else 'N/A'
        response_text = err_response.text if err_response is not None else 'No response'
        logger.error(f"API call failed for model {model}. Status: {status_code}. Response: {response_text}")

        # Prefer the structured error message from the JSON body when present.
        error_message = f"API Error: {e}"
        if err_response is not None:
            try:
                error_details = err_response.json()
                if 'error' in error_details and 'message' in error_details['error']:
                    error_message = f"API Error ({err_response.status_code}): {error_details['error']['message']}"
                else:
                    error_message = f"API Error ({err_response.status_code}): {err_response.text[:200]}"
            except json.JSONDecodeError:
                error_message = f"API Error ({err_response.status_code}): {err_response.text[:200]}"

        raise ConnectionError(error_message)
|
113 |
+
|
114 |
+
def get_vlm_description(api_key: str, model: str, prompt_text: str, image_bytes: bytes, mime_type: str) -> str:
    """
    Get a page description using a VLM through OpenRouter.

    Args:
        api_key: OpenRouter API key
        model: VLM model name
        prompt_text: Text prompt
        image_bytes: Bytes of the page image
        mime_type: MIME type of the image ('image/png' or 'image/jpeg')

    Returns:
        str: Generated description

    Raises:
        ValueError: If API key is missing, image encoding fails, or the
            response contains no usable content
        ConnectionError: If API call fails with error response
        TimeoutError: If API call times out
    """
    # Build a multimodal user message: the text prompt plus the page image
    # embedded as a Base64 data URI.
    image_uri = encode_image_to_base64(image_bytes, mime_type)
    request_messages: List[Dict[str, Any]] = [{
        "role": "user",
        "content": [
            {"type": "text", "text": prompt_text},
            {"type": "image_url", "image_url": {"url": image_uri}}
        ]
    }]

    api_response = call_openrouter_api(api_key, model, request_messages)

    # Validate the response shape with guard clauses instead of nesting.
    choices = (api_response or {}).get('choices')
    if not choices:
        logger.warning(f"VLM response JSON structure unexpected: {api_response}")
        raise ValueError("VLM returned unexpected response structure")

    message = choices[0].get('message') or {}
    description = message.get('content') if 'content' in message else None
    if description:
        logger.info(f"Received VLM description for page (model: {model}).")
        return str(description)

    logger.warning("VLM response structure unexpected or content empty.")
    raise ValueError("VLM returned no usable content")
|
168 |
+
|
169 |
+
def get_llm_summary(api_key: str, model: str, prompt_text: str) -> str:
    """
    Get a summary using an LLM through OpenRouter.

    Args:
        api_key: OpenRouter API key
        model: LLM model for summary
        prompt_text: Prompt including the text to summarize

    Returns:
        str: Generated summary

    Raises:
        ValueError: If API key is missing or the response has no usable content
        ConnectionError: If API call fails with error response
        TimeoutError: If API call times out
    """
    # Single plain-text user turn; no images for summarization.
    chat_messages: List[Dict[str, Any]] = [
        {"role": "user", "content": prompt_text}
    ]

    api_response = call_openrouter_api(api_key, model, chat_messages)

    # Validate the response shape with guard clauses instead of nesting.
    choices = (api_response or {}).get('choices')
    if not choices:
        logger.warning(f"LLM summary response JSON structure unexpected: {api_response}")
        raise ValueError("LLM returned unexpected response structure")

    message = choices[0].get('message') or {}
    summary = message.get('content') if 'content' in message else None
    if summary:
        logger.info(f"Received summary (model: {model}).")
        return str(summary)

    logger.warning("LLM summary response structure unexpected or content empty.")
    raise ValueError("LLM returned no usable content")
|
describepdf/pdf_processor.py
ADDED
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
PDF processor module for DescribePDF.
|
3 |
+
|
4 |
+
This module handles all PDF file operations using PyMuPDF,
|
5 |
+
including rendering, text extraction, and file manipulation.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import io
|
9 |
+
import os
|
10 |
+
import tempfile
|
11 |
+
from typing import Tuple, List, Optional
|
12 |
+
|
13 |
+
# Get logger from config module
|
14 |
+
from .config import logger
|
15 |
+
|
16 |
+
try:
|
17 |
+
import pymupdf
|
18 |
+
PYMUPDF_AVAILABLE = True
|
19 |
+
except ImportError:
|
20 |
+
PYMUPDF_AVAILABLE = False
|
21 |
+
logger.error("PyMuPDF not installed. Install with 'pip install pymupdf'")
|
22 |
+
|
23 |
+
# Import PIL for image processing
|
24 |
+
try:
|
25 |
+
from PIL import Image
|
26 |
+
PIL_AVAILABLE = True
|
27 |
+
except ImportError:
|
28 |
+
PIL_AVAILABLE = False
|
29 |
+
logger.error("Pillow not installed. Install with 'pip install pillow'")
|
30 |
+
|
31 |
+
def get_pdf_pages(pdf_path: str) -> Tuple[Optional["pymupdf.Document"], Optional[List["pymupdf.Page"]], int]:
    """
    Open a PDF and return a list of page objects and the total number of pages.

    NOTE: The caller is responsible for calling close() on the returned document
    when done with it.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        Tuple containing:
            - pymupdf.Document: The open PDF document (caller must close)
            - List[pymupdf.Page]: List of page objects
            - int: Total number of pages (0 if error)
    """
    # BUGFIX: the pymupdf annotations are quoted (forward references).
    # Without `from __future__ import annotations`, unquoted annotations are
    # evaluated at def time, so a failed `import pymupdf` would crash module
    # import with NameError despite the PYMUPDF_AVAILABLE fallback below.
    if not PYMUPDF_AVAILABLE:
        logger.error("PyMuPDF is required for PDF processing but is not installed.")
        return None, None, 0

    try:
        doc = pymupdf.open(pdf_path)
        total_pages = len(doc)
        pages = [doc.load_page(i) for i in range(total_pages)]
        logger.info(f"Opened PDF '{os.path.basename(pdf_path)}' with {total_pages} pages.")
        return doc, pages, total_pages
    except Exception as e:
        logger.error(f"Error opening or reading PDF {pdf_path}: {e}")
        return None, None, 0
|
60 |
+
|
61 |
+
def render_page_to_image_bytes(page: "pymupdf.Page", image_format: str = "jpeg", dpi: int = 150) -> Tuple[Optional[bytes], Optional[str]]:
    """
    Render a PDF page to image bytes in memory.

    Args:
        page: PyMuPDF Page object
        image_format: Desired format ('png' or 'jpeg')
        dpi: Image resolution

    Returns:
        Tuple containing:
            - bytes: Image bytes
            - str: MIME type ('image/png' or 'image/jpeg')
        Returns (None, None) on error
    """
    # BUGFIX: the pymupdf annotation is quoted (forward reference) so a
    # failed `import pymupdf` does not crash module import with NameError
    # when this function is defined.
    if not PYMUPDF_AVAILABLE or not PIL_AVAILABLE:
        logger.error("PyMuPDF and Pillow are required for image rendering but are not installed.")
        return None, None

    # Validate image format up front (guard clause).
    fmt = image_format.lower()
    if fmt not in ("png", "jpeg"):
        logger.error(f"Unsupported image format: {image_format}")
        return None, None

    try:
        # Render page to a raster pixmap at the requested resolution.
        pix = page.get_pixmap(dpi=dpi)

        if fmt == "png":
            # Use PyMuPDF's built-in PNG conversion.
            img_bytes = pix.tobytes("png")
            mime_type = "image/png"
        else:
            # Convert via PIL for JPEG output.
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            buffer = io.BytesIO()
            img.save(buffer, format="JPEG", quality=85)
            img_bytes = buffer.getvalue()
            mime_type = "image/jpeg"

        logger.debug(f"Rendered page {page.number + 1} to {image_format.upper()} bytes.")
        return img_bytes, mime_type

    except Exception as e:
        logger.error(f"Error rendering page {page.number + 1} to image: {e}")
        return None, None
|
108 |
+
|
109 |
+
def extract_all_text(pdf_path: str) -> Optional[str]:
    """
    Extract all text from a PDF file.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        str: Concatenated text from all pages (each page followed by a blank
        line), or None if there was an error
    """
    if not PYMUPDF_AVAILABLE:
        logger.error("PyMuPDF is required for text extraction but is not installed.")
        return None

    doc = None
    try:
        doc = pymupdf.open(pdf_path)
        # Collect per-page text and join once instead of repeated string
        # concatenation (which is quadratic for large documents).
        pieces = [doc.load_page(page_num).get_text("text") + "\n\n" for page_num in range(len(doc))]
        logger.info(f"Extracted text from all pages of '{os.path.basename(pdf_path)}'.")
        return "".join(pieces)
    except Exception as e:
        logger.error(f"Error extracting text from PDF {pdf_path}: {e}")
        return None
    finally:
        # Always close the document if we opened it
        if doc is not None:
            doc.close()
|
139 |
+
|
140 |
+
def save_page_as_temp_pdf(original_doc: "pymupdf.Document", page_num: int) -> Optional[str]:
    """
    Save a specific page as a temporary PDF file.

    Args:
        original_doc: The open original PDF document
        page_num: The page number (zero-based)

    Returns:
        str: Path to the temporary PDF file (caller is responsible for
        deleting it), or None if there was an error
    """
    # BUGFIX: the pymupdf annotation is quoted (forward reference) so a
    # failed `import pymupdf` does not crash module import with NameError
    # when this function is defined.
    if not PYMUPDF_AVAILABLE:
        logger.error("PyMuPDF is required for PDF processing but is not installed.")
        return None

    new_doc = None
    temp_pdf_path = None

    try:
        # Reserve a uniquely named temp file; delete=False so the path
        # survives the context manager and can be handed to the caller.
        with tempfile.NamedTemporaryFile(suffix=".pdf", prefix="describepdf_page_", delete=False) as tmp_file:
            temp_pdf_path = tmp_file.name

        # Copy the single requested page into a fresh document and save it.
        new_doc = pymupdf.open()
        new_doc.insert_pdf(original_doc, from_page=page_num, to_page=page_num)
        new_doc.save(temp_pdf_path)

        logger.debug(f"Saved page {page_num + 1} to temporary PDF: {temp_pdf_path}")
        return temp_pdf_path

    except Exception as e:
        logger.error(f"Error saving page {page_num + 1} as temporary PDF: {e}")
        # Clean up the reserved temp file so failures do not leak files.
        if temp_pdf_path and os.path.exists(temp_pdf_path):
            try:
                os.remove(temp_pdf_path)
                logger.debug(f"Cleaned up temporary PDF due to error: {temp_pdf_path}")
            except OSError as os_err:
                logger.warning(f"Failed to remove temporary PDF after error: {os_err}")
        return None

    finally:
        # Always close the single-page document if we created it
        if new_doc is not None:
            new_doc.close()
|
describepdf/summarizer.py
ADDED
@@ -0,0 +1,127 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Summarizer module for DescribePDF.
|
3 |
+
|
4 |
+
This module handles the generation of document summaries from PDF text content
|
5 |
+
using either OpenRouter or Ollama LLM models.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import logging
|
9 |
+
from typing import Optional
|
10 |
+
|
11 |
+
from . import pdf_processor
|
12 |
+
from . import openrouter_client
|
13 |
+
from . import ollama_client
|
14 |
+
from .config import get_prompts
|
15 |
+
|
16 |
+
# Get logger from config module
|
17 |
+
logger = logging.getLogger('describepdf')
|
18 |
+
|
19 |
+
# Constants
|
20 |
+
MAX_CHARS_FOR_PROMPT = 512000 # Maximum characters to include in prompt (128K tokens approx.)
|
21 |
+
|
22 |
+
def generate_summary(
    pdf_path: str,
    provider: str = "openrouter",
    api_key: Optional[str] = None,
    ollama_endpoint: Optional[str] = None,
    model: Optional[str] = None
) -> Optional[str]:
    """
    Generate a summary of the complete textual content of a PDF using specified provider.

    Args:
        pdf_path: Path to the PDF file
        provider: Provider to use ("openrouter" or "ollama")
        api_key: OpenRouter API key (required for openrouter provider)
        ollama_endpoint: Ollama endpoint URL (required for ollama provider)
        model: LLM model to use for the summary

    Returns:
        str: The generated summary, or None if any step fails
    """
    logger.info(f"Starting summary generation for '{pdf_path}' using provider {provider} with model {model}.")

    # Step 1: pull the full text out of the PDF.
    logger.info("Extracting full text from PDF...")
    full_text = pdf_processor.extract_all_text(pdf_path)

    if full_text is None:
        logger.error("Failed to extract text for summary.")
        return None

    if not full_text.strip():
        logger.warning("PDF contains no extractable text for summary.")
        return "Document contains no extractable text."

    logger.info(f"Text extracted ({len(full_text)} characters). Preparing summary prompt...")

    # Step 2: load the summary prompt template.
    summary_prompt_template = get_prompts().get("summary")
    if not summary_prompt_template:
        logger.error("Summary prompt template not found.")
        return None

    # Step 3: truncate overly long documents before filling the template.
    if len(full_text) > MAX_CHARS_FOR_PROMPT:
        logger.warning(
            f"PDF text ({len(full_text)} chars) exceeds limit ({MAX_CHARS_FOR_PROMPT}), truncating for summary."
        )
        full_text = full_text[:MAX_CHARS_FOR_PROMPT] + "\n\n[... text truncated ...]"

    prompt_text = summary_prompt_template.replace("[FULL_PDF_TEXT]", full_text)

    # Step 4: dispatch to the selected LLM provider.
    try:
        if provider == "openrouter":
            if not api_key:
                logger.error("OpenRouter API key is required for OpenRouter provider.")
                return None
            logger.info(f"Calling OpenRouter LLM for summary (model: {model})...")
            summary = openrouter_client.get_llm_summary(api_key, model, prompt_text)
            if not summary:
                logger.error("OpenRouter LLM call for summary returned no content.")
                return None
            logger.info("Summary generated successfully via OpenRouter.")
            return summary

        if provider == "ollama":
            if not ollama_endpoint:
                logger.error("Ollama endpoint URL is required for Ollama provider.")
                return None
            logger.info(f"Calling Ollama LLM for summary (model: {model})...")
            summary = ollama_client.get_llm_summary(ollama_endpoint, model, prompt_text)
            if not summary:
                logger.error("Ollama LLM call for summary returned no content.")
                return None
            logger.info("Summary generated successfully via Ollama.")
            return summary

        logger.error(f"Unsupported provider: {provider}")
        return None

    except ValueError as e:
        logger.error(f"Value error during summary generation: {e}")
        return None
    except ConnectionError as e:
        logger.error(f"Connection error during summary generation: {e}")
        return None
    except TimeoutError as e:
        logger.error(f"Timeout error during summary generation: {e}")
        return None
    except ImportError as e:
        logger.error(f"Import error during summary generation: {e}")
        return None
    except Exception as e:
        logger.critical(f"Critical unexpected error during summary generation: {e}", exc_info=True)
        raise
|
describepdf/ui.py
ADDED
@@ -0,0 +1,291 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Web UI module for DescribePDF with OpenRouter.
|
3 |
+
|
4 |
+
This module implements the Gradio-based web interface for the OpenRouter
|
5 |
+
provider version of DescribePDF.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import gradio as gr
|
9 |
+
import os
|
10 |
+
import tempfile
|
11 |
+
import logging
|
12 |
+
import secrets
|
13 |
+
from typing import Tuple, Optional, Dict, Any, List
|
14 |
+
|
15 |
+
from . import config
|
16 |
+
from . import core
|
17 |
+
|
18 |
+
theme = gr.themes.Soft(
|
19 |
+
primary_hue="red",
|
20 |
+
secondary_hue="rose",
|
21 |
+
spacing_size="lg",
|
22 |
+
)
|
23 |
+
|
24 |
+
def generate(
    pdf_file_obj: Optional[gr.File],
    ui_api_key: str,
    ui_vlm_model: str,
    ui_lang: str,
    ui_use_md: bool,
    ui_use_sum: bool,
    ui_sum_model: str,
    progress: gr.Progress = gr.Progress(track_tqdm=True)
) -> Tuple[str, gr.update, Optional[str]]:
    """
    Wrapper function to call the core conversion process and handle the Gradio UI.

    Args:
        pdf_file_obj: Gradio File object for the uploaded PDF
        ui_api_key: OpenRouter API key from UI
        ui_vlm_model: VLM model name from UI
        ui_lang: Output language from UI
        ui_use_md: Whether to use Markitdown from UI
        ui_use_sum: Whether to generate a summary from UI
        ui_sum_model: Summary model name from UI
        progress: Gradio progress tracker

    Returns:
        Tuple containing:
            - str: Status message
            - gr.update: Download button update
            - Optional[str]: Markdown result content
    """
    # Guard clause: nothing to do without an uploaded file.
    if pdf_file_obj is None:
        return "Please upload a PDF file.", gr.update(value=None, visible=False), None

    env_settings = config.get_config()

    # A key typed in the UI overrides the .env key for this run only.
    effective_key = ui_api_key.strip() or env_settings.get("openrouter_api_key")

    run_settings: Dict[str, Any] = {
        "provider": "openrouter",
        "openrouter_api_key": effective_key,
        "vlm_model": ui_vlm_model,
        "output_language": ui_lang,
        "use_markitdown": ui_use_md,
        "use_summary": ui_use_sum,
        "summary_llm_model": ui_sum_model if ui_sum_model else env_settings.get("or_summary_model")
    }

    if not run_settings.get("openrouter_api_key"):
        error_msg = "Error: OpenRouter API Key is missing. Provide it in the UI or set OPENROUTER_API_KEY in the .env file."
        logging.error(error_msg)
        return error_msg, gr.update(value=None, visible=False), None

    def report_progress(progress_value: float, status: str) -> None:
        """Forward core progress updates to the Gradio progress bar."""
        clamped_progress = max(0.0, min(1.0, progress_value))
        progress(clamped_progress, desc=status)
        logging.info(f"Progress: {status} ({clamped_progress*100:.1f}%)")

    # Run the conversion through the core pipeline.
    status_message, result_markdown = core.convert_pdf_to_markdown(
        pdf_file_obj.name,
        run_settings,
        report_progress
    )

    # Prepare the download file; stay hidden unless we have a result.
    download_update = gr.update(value=None, visible=False)
    if result_markdown:
        try:
            base_name = os.path.splitext(os.path.basename(pdf_file_obj.name))[0]
            download_filename = f"{base_name}_description.md"

            # Random suffix avoids collisions between concurrent runs.
            target_path = os.path.join(
                tempfile.gettempdir(),
                f"{base_name}_{secrets.token_hex(4)}.md"
            )
            with open(target_path, "w", encoding="utf-8") as md_file:
                md_file.write(result_markdown)

            logging.info(f"Markdown result saved to temporary file for download: {target_path}")
            download_update = gr.update(value=target_path, visible=True, label=f"Download '{download_filename}'")

        except Exception as e:
            logging.error(f"Error creating temporary file for download: {e}")
            status_message += " (Error creating download file)"
            download_update = gr.update(value=None, visible=False)

    return (
        status_message,
        download_update,
        result_markdown if result_markdown else ""
    )
|
130 |
+
|
131 |
+
def create_ui() -> gr.Blocks:
    """
    Create and return the Gradio interface for OpenRouter.

    This function sets up a Gradio web interface with tabs for PDF conversion
    and configuration. It loads initial settings from the environment config
    and provides UI components for adjusting settings for each conversion run.

    Returns:
        gr.Blocks: Configured Gradio interface ready to be launched
    """
    # Load initial config from environment
    initial_env_config = config.get_config()

    # Define suggested model lists and languages
    # (free-text entry is still allowed: allow_custom_value=True below)
    suggested_vlms: List[str] = [
        "qwen/qwen2.5-vl-72b-instruct",
        "google/gemini-2.5-pro-preview-03-25",
        "openai/chatgpt-4o-latest"
    ]

    suggested_llms: List[str] = [
        "google/gemini-2.5-flash-preview",
        "openai/chatgpt-4o-latest",
        "anthropic/claude-3.5-sonnet"
    ]

    suggested_languages: List[str] = [
        "English", "Spanish", "French", "German",
        "Chinese", "Japanese", "Italian",
        "Portuguese", "Russian", "Korean"
    ]

    # Set initial values from config (these seed the widgets; per-run
    # overrides in the Settings tab are never persisted back to .env)
    initial_vlm = initial_env_config.get("or_vlm_model")
    initial_llm = initial_env_config.get("or_summary_model")
    initial_lang = initial_env_config.get("output_language")
    initial_use_md = initial_env_config.get("use_markitdown")
    initial_use_sum = initial_env_config.get("use_summary")

    # Used only to adapt the API-key field's label/placeholder text
    has_env_api_key = bool(initial_env_config.get("openrouter_api_key"))

    # Create the Gradio interface
    with gr.Blocks(title="DescribePDF", theme=theme) as iface:
        gr.Markdown("<center><img src='https://davidlms.github.io/DescribePDF/assets/poster.png' alt='Describe PDF Logo' width='600px'/></center>")
        gr.Markdown(
            """<div style="display: flex;align-items: center;justify-content: center">
            [<a href="https://davidlms.github.io/describepdf/">Project Page</a>] | [<a href="https://github.com/DavidLMS/describepdf">Github</a>]</div>
            """
        )
        gr.Markdown(
            "DescribePDF is an open-source tool designed to convert PDF files into detailed page-by-page descriptions in Markdown format using Vision-Language Models (VLMs). Unlike traditional PDF extraction tools that focus on replicating the text layout, DescribePDF generates rich, contextual descriptions of each page's content, making it perfect for visually complex documents like catalogs, scanned documents, and presentations."
            "\n\n"
            "Upload a PDF, adjust settings, and click 'Describe'. "
        )

        with gr.Tabs():
            # Generate tab: file upload, trigger button, progress and results
            with gr.TabItem("Generate", id=0):
                with gr.Row():
                    with gr.Column(scale=1):
                        pdf_input = gr.File(
                            label="Upload PDF",
                            file_types=['.pdf'],
                            type="filepath"
                        )
                        convert_button = gr.Button(
                            "Describe",
                            variant="primary"
                        )
                        progress_output = gr.Textbox(
                            label="Progress",
                            interactive=False,
                            lines=2
                        )
                        # Hidden until a result file exists (toggled by generate())
                        download_button = gr.File(
                            label="Download Markdown",
                            visible=False,
                            interactive=False
                        )

                    with gr.Column(scale=2):
                        markdown_output = gr.Markdown(label="Result (Markdown)")

            # Configuration tab: per-run overrides of the .env defaults
            with gr.TabItem("Settings", id=1):
                gr.Markdown(
                    "Adjust settings for the *next* generation. These settings are **not** saved. "
                    "Defaults are controlled by the `.env` file."
                )
                api_key_input = gr.Textbox(
                    label="OpenRouter API Key" + (" (set in .env)" if has_env_api_key else ""),
                    type="password",
                    placeholder="Enter an API key here to override the one in .env" if has_env_api_key else "Enter your OpenRouter API key",
                    value=""
                )
                vlm_model_input = gr.Dropdown(
                    label="VLM Model",
                    choices=suggested_vlms,
                    value=initial_vlm,
                    allow_custom_value=True,
                    info="Select or type the OpenRouter VLM model name"
                )
                output_language_input = gr.Dropdown(
                    label="Output Language",
                    choices=suggested_languages,
                    value=initial_lang,
                    allow_custom_value=True,
                    info="Select or type the desired output language (e.g., English, Spanish)"
                )
                with gr.Row():
                    use_markitdown_checkbox = gr.Checkbox(
                        label="Use Markitdown for extra text context",
                        value=initial_use_md
                    )
                    use_summary_checkbox = gr.Checkbox(
                        label="Use PDF summary for augmented context (requires extra LLM call)",
                        value=initial_use_sum
                    )
                summary_llm_model_input = gr.Dropdown(
                    label="LLM Model for Summary",
                    choices=suggested_llms,
                    value=initial_llm,
                    allow_custom_value=True,
                    info="Select or type the OpenRouter LLM model name for summaries"
                )

        # Connect UI components: the input order here must match the
        # parameter order of generate()
        conversion_inputs = [
            pdf_input, api_key_input, vlm_model_input, output_language_input,
            use_markitdown_checkbox, use_summary_checkbox, summary_llm_model_input
        ]
        conversion_outputs = [
            progress_output, download_button, markdown_output
        ]
        convert_button.click(
            fn=generate,
            inputs=conversion_inputs,
            outputs=conversion_outputs
        )

    return iface
|
273 |
+
|
274 |
+
def launch_app() -> gr.Blocks:
    """
    Start the application from the command line.

    Builds the Gradio UI. On Hugging Face Spaces (detected via the
    SPACE_ID environment variable) the platform serves the returned app
    itself, so launch() is only called when running locally.

    Returns:
        gr.Blocks: The Gradio application object, in all cases.
    """
    app: gr.Blocks = create_ui()

    # On Hugging Face Spaces the runtime launches the app; calling
    # launch() here would conflict with the platform's own server.
    if "SPACE_ID" not in os.environ:
        app.launch()

    return app
|
289 |
+
|
290 |
+
if __name__ == "__main__":
    # Allow running the OpenRouter web UI module directly.
    launch_app()
|
describepdf/ui_ollama.py
ADDED
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Web UI module for DescribePDF with Ollama.
|
3 |
+
|
4 |
+
This module implements the Gradio-based web interface for the Ollama
|
5 |
+
provider version of DescribePDF.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import gradio as gr
|
9 |
+
import os
|
10 |
+
import tempfile
|
11 |
+
import logging
|
12 |
+
import secrets
|
13 |
+
from typing import Tuple, Optional, Dict, Any, List
|
14 |
+
|
15 |
+
from . import config
|
16 |
+
from . import core
|
17 |
+
from . import ollama_client
|
18 |
+
|
19 |
+
# Shared Gradio theme for the Ollama UI (red/rose accents, large spacing).
theme = gr.themes.Soft(
    primary_hue="red",
    secondary_hue="rose",
    spacing_size="lg",
)
|
24 |
+
|
25 |
+
def generate(
    pdf_file_obj: Optional[gr.File],
    ollama_endpoint: str,
    ui_vlm_model: str,
    ui_lang: str,
    ui_use_md: bool,
    ui_use_sum: bool,
    ui_sum_model: str,
    progress: gr.Progress = gr.Progress(track_tqdm=True)
) -> Tuple[str, gr.update, Optional[str]]:
    """
    Run a PDF-to-Markdown conversion via the Ollama provider for the Gradio UI.

    Args:
        pdf_file_obj: Gradio File object for the uploaded PDF
        ollama_endpoint: Ollama server endpoint URL
        ui_vlm_model: VLM model name from UI
        ui_lang: Output language from UI
        ui_use_md: Whether to use Markitdown from UI
        ui_use_sum: Whether to generate a summary from UI
        ui_sum_model: Summary model name from UI
        progress: Gradio progress tracker

    Returns:
        Tuple containing:
        - str: Status message
        - gr.update: Download button update
        - Optional[str]: Markdown result content
    """
    hidden_download = gr.update(value=None, visible=False)

    # Guard clause: a PDF must be uploaded before anything else.
    if pdf_file_obj is None:
        return "Please upload a PDF file.", hidden_download, None

    # Guard clause: fail fast if the Ollama server is unreachable.
    if not ollama_client.check_ollama_availability(ollama_endpoint):
        error_msg = f"Error: Could not connect to Ollama at {ollama_endpoint}. Make sure it is running."
        logging.error(error_msg)
        return error_msg, hidden_download, None

    # Per-run configuration assembled from the UI controls.
    run_settings: Dict[str, Any] = {
        "provider": "ollama",
        "ollama_endpoint": ollama_endpoint,
        "vlm_model": ui_vlm_model,
        "output_language": ui_lang,
        "use_markitdown": ui_use_md,
        "use_summary": ui_use_sum,
        "summary_llm_model": ui_sum_model,
    }

    def report_progress(progress_value: float, status: str) -> None:
        """Forward core progress updates (clamped to [0, 1]) to the Gradio bar."""
        clamped_progress = max(0.0, min(1.0, progress_value))
        progress(clamped_progress, desc=status)
        logging.info(f"Progress: {status} ({clamped_progress*100:.1f}%)")

    status_message, result_markdown = core.convert_pdf_to_markdown(
        pdf_file_obj.name,
        run_settings,
        report_progress,
    )

    download_update = hidden_download
    if result_markdown:
        try:
            # Name the download after the uploaded PDF; store it under a
            # randomized temp path so concurrent runs cannot collide.
            base_name = os.path.splitext(os.path.basename(pdf_file_obj.name))[0]
            download_filename = f"{base_name}_description.md"
            target_path = os.path.join(
                tempfile.gettempdir(),
                f"{base_name}_{secrets.token_hex(4)}.md",
            )

            with open(target_path, "w", encoding="utf-8") as md_file:
                md_file.write(result_markdown)

            logging.info(f"Markdown result saved to temporary file for download: {target_path}")
            download_update = gr.update(value=target_path, visible=True, label=f"Download '{download_filename}'")

        except Exception as e:
            logging.error(f"Error creating temporary file for download: {e}")
            status_message += " (Error creating download file)"
            download_update = hidden_download

    return status_message, download_update, result_markdown if result_markdown else ""
|
126 |
+
|
127 |
+
def create_ui() -> gr.Blocks:
    """
    Create and return the Gradio interface for Ollama.

    This function sets up a Gradio web interface with tabs for PDF conversion
    and configuration. It loads initial settings from the environment config
    and provides UI components for adjusting settings for each conversion run.

    Returns:
        gr.Blocks: Configured Gradio interface ready to be launched
    """
    # Load initial config from environment (.env values via config.get_config()).
    initial_env_config = config.get_config()

    # Define suggested model lists and languages shown in the dropdowns;
    # allow_custom_value=True below lets the user type any other name.
    suggested_vlms: List[str] = ["llama3.2-vision"]
    suggested_llms: List[str] = ["qwen2.5", "llama3.2"]
    suggested_languages: List[str] = [
        "English", "Spanish", "French", "German",
        "Chinese", "Japanese", "Italian",
        "Portuguese", "Russian", "Korean"
    ]

    # Set initial values from config, with hard-coded fallbacks if a key is absent.
    initial_endpoint = initial_env_config.get("ollama_endpoint", "http://localhost:11434")
    initial_vlm = initial_env_config.get("ollama_vlm_model", "llama3.2-vision")
    initial_llm = initial_env_config.get("ollama_summary_model", "qwen2.5")
    initial_lang = initial_env_config.get("output_language", "English")
    initial_use_md = initial_env_config.get("use_markitdown", False)
    initial_use_sum = initial_env_config.get("use_summary", False)

    # Create the Gradio interface (uses the module-level `theme`).
    with gr.Blocks(title="DescribePDF", theme=theme) as iface:
        gr.Markdown("<center><img src='https://davidlms.github.io/DescribePDF/assets/poster.png' alt='Describe PDF Logo' width='600px'/></center>")
        gr.Markdown(
            """<div style="display: flex;align-items: center;justify-content: center">
            [<a href="https://davidlms.github.io/describepdf/">Project Page</a>] | [<a href="https://github.com/DavidLMS/describepdf">Github</a>]</div>
            """
        )
        gr.Markdown(
            "DescribePDF is an open-source tool designed to convert PDF files into detailed page-by-page descriptions in Markdown format using Vision-Language Models (VLMs). Unlike traditional PDF extraction tools that focus on replicating the text layout, DescribePDF generates rich, contextual descriptions of each page's content, making it perfect for visually complex documents like catalogs, scanned documents, and presentations."
            "\n\n"
            "Upload a PDF, adjust settings, and click 'Describe'. "
        )

        with gr.Tabs():
            # Generate tab: file upload, run button, progress, download, result view.
            with gr.TabItem("Generate", id=0):
                with gr.Row():
                    with gr.Column(scale=1):
                        pdf_input = gr.File(
                            label="Upload PDF",
                            file_types=['.pdf'],
                            type="filepath"
                        )
                        convert_button = gr.Button(
                            "Describe",
                            variant="primary"
                        )
                        progress_output = gr.Textbox(
                            label="Progress",
                            interactive=False,
                            lines=2
                        )
                        # Hidden until generate() produces a file to download.
                        download_button = gr.File(
                            label="Download Markdown",
                            visible=False,
                            interactive=False
                        )

                    with gr.Column(scale=2):
                        markdown_output = gr.Markdown(label="Result (Markdown)")

            # Configuration tab: per-run settings; not persisted anywhere.
            with gr.TabItem("Settings", id=1):
                gr.Markdown(
                    "Adjust settings for the *next* generation. These settings are **not** saved. "
                    "Defaults are controlled by the `.env` file."
                )
                ollama_endpoint_input = gr.Textbox(
                    label="Ollama Endpoint",
                    value=initial_endpoint,
                    placeholder="http://localhost:11434",
                    info="URL of your Ollama server"
                )
                vlm_model_input = gr.Dropdown(
                    label="VLM Model",
                    choices=suggested_vlms,
                    value=initial_vlm,
                    allow_custom_value=True,
                    info="Select or type the Ollama vision model name"
                )
                output_language_input = gr.Dropdown(
                    label="Output Language",
                    choices=suggested_languages,
                    value=initial_lang,
                    allow_custom_value=True,
                    info="Select or type the desired output language (e.g., English, Spanish)"
                )
                with gr.Row():
                    use_markitdown_checkbox = gr.Checkbox(
                        label="Use Markitdown for extra text context",
                        value=initial_use_md
                    )
                    use_summary_checkbox = gr.Checkbox(
                        label="Use PDF summary for augmented context (requires extra LLM call)",
                        value=initial_use_sum
                    )
                summary_llm_model_input = gr.Dropdown(
                    label="LLM Model for Summary",
                    choices=suggested_llms,
                    value=initial_llm,
                    allow_custom_value=True,
                    info="Select or type the Ollama LLM model name for summaries"
                )

        # Connect UI components: the input order here must match generate()'s
        # parameter order exactly.
        conversion_inputs = [
            pdf_input, ollama_endpoint_input, vlm_model_input, output_language_input,
            use_markitdown_checkbox, use_summary_checkbox, summary_llm_model_input
        ]
        conversion_outputs = [
            progress_output, download_button, markdown_output
        ]
        convert_button.click(
            fn=generate,
            inputs=conversion_inputs,
            outputs=conversion_outputs
        )

    return iface
|
258 |
+
|
259 |
+
def launch_app() -> None:
    """
    Entry point for running the Ollama web UI standalone.

    Builds the Gradio interface and starts its local server.
    """
    create_ui().launch()
|
267 |
+
|
268 |
+
if __name__ == "__main__":
    # Allow running the Ollama web UI module directly.
    launch_app()
|
main.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Main entry point for DescribePDF application.
|
3 |
+
|
4 |
+
This module handles command-line argument parsing and routes to the appropriate
|
5 |
+
UI or CLI functionality based on the provided arguments.
|
6 |
+
"""
|
7 |
+
|
8 |
+
import argparse
|
9 |
+
import sys
|
10 |
+
from typing import List, Optional
|
11 |
+
|
12 |
+
from describepdf.config import logger
|
13 |
+
|
14 |
+
def parse_arguments(args: Optional[List[str]] = None) -> argparse.Namespace:
    """
    Parse the top-level mode flags from the command line.

    Args:
        args: List of command line arguments (default: sys.argv[1:])

    Returns:
        argparse.Namespace: Namespace with boolean ``web`` and ``web_ollama`` flags.
    """
    # Help is disabled and unknown arguments are tolerated so everything
    # not handled here can be processed later by the CLI parser.
    mode_parser = argparse.ArgumentParser(add_help=False)
    mode_parser.add_argument('--web', action='store_true', help='Start in web mode with Gradio (OpenRouter)')
    mode_parser.add_argument('--web-ollama', action='store_true', help='Start in web mode with Gradio (Ollama local)')

    parsed, _unknown = mode_parser.parse_known_args(args)
    return parsed
|
31 |
+
|
32 |
+
def main(args: Optional[List[str]] = None) -> int:
    """
    Main function that starts the appropriate application mode.

    Routes to the OpenRouter web UI (--web), the Ollama web UI
    (--web-ollama), or the CLI (no mode flag).

    Args:
        args: List of command line arguments (default: sys.argv[1:])

    Returns:
        int: Exit code (0 for success, non-zero for error)
    """
    # Logging is already configured in config.py, we just need to use the logger
    logger.info("Starting DescribePDF...")

    parsed_args = parse_arguments(args)

    try:
        if parsed_args.web:
            # Start web UI with OpenRouter
            from describepdf import ui
            logger.info("Starting in WEB mode with Gradio interface for OpenRouter...")
            app_ui = ui.create_ui()
            app_ui.launch()
            logger.info("Web UI stopped.")
            return 0

        if parsed_args.web_ollama:
            # Start web UI with Ollama
            from describepdf import ui_ollama
            logger.info("Starting in WEB mode with Gradio interface for Ollama...")
            app_ui = ui_ollama.create_ui()
            app_ui.launch()
            logger.info("Web UI (Ollama) stopped.")
            return 0

        # Default: command-line interface.
        from describepdf import cli
        logger.info("Starting in CLI mode...")
        cli.run_cli()
        return 0

    except ImportError as e:
        # Single log entry with traceback (previously the same exception was
        # logged twice with duplicate messages).
        logger.error(f"Failed to start, likely a missing dependency: {e}", exc_info=True)
        return 1

    except KeyboardInterrupt:
        # Ctrl-C is a normal way to stop the app, not an error.
        logger.info("Application stopped by user.")
        return 0

    except Exception as e:
        logger.error(f"An unexpected error occurred: {e}", exc_info=True)
        return 1
|
87 |
+
|
88 |
+
if __name__ == "__main__":
    # Run the application and propagate its exit code to the shell.
    exit_code = main()
    sys.exit(exit_code)
|
prompts/summary_prompt.md
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Please provide a concise summary of the following document content. The summary should capture the main topics and purpose of the document.
|
2 |
+
|
3 |
+
Document Text:
|
4 |
+
```markdown
|
5 |
+
[FULL_PDF_TEXT]
|
6 |
+
```
|
prompts/vlm_prompt_base.md
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
IMPORTANT: THE ENTIRE RESPONSE MUST BE WRITTEN IN [LANGUAGE]. DO NOT USE ANY OTHER LANGUAGE.
|
2 |
+
|
3 |
+
Describe the content of this page for a visually impaired person.
|
4 |
+
This is page [PAGE_NUM] of [TOTAL_PAGES].
|
5 |
+
|
6 |
+
YOUR TASK:
|
7 |
+
1. Describe all visual elements (images, layout, charts, tables) in [LANGUAGE]
|
8 |
+
2. Include all text content in [LANGUAGE]
|
9 |
+
3. Organize the information in a structured way
|
10 |
+
4. If text appears in another language in the document, translate it to [LANGUAGE]
|
11 |
+
|
12 |
+
RESPONSE FORMAT:
|
13 |
+
- Start directly with your description
|
14 |
+
- Write EVERYTHING in [LANGUAGE] only
|
15 |
+
- Be thorough but clear
|
16 |
+
|
17 |
+
Start your response now in [LANGUAGE]:
|
prompts/vlm_prompt_full.md
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
IMPORTANT: THE ENTIRE RESPONSE MUST BE WRITTEN IN [LANGUAGE]. DO NOT USE ANY OTHER LANGUAGE.
|
2 |
+
|
3 |
+
Describe the content of this page for a visually impaired person.
|
4 |
+
This is page [PAGE_NUM] of [TOTAL_PAGES].
|
5 |
+
|
6 |
+
YOUR TASK:
|
7 |
+
1. Describe all visual elements (images, layout, charts, tables) in [LANGUAGE]
|
8 |
+
2. Include all text content in [LANGUAGE]
|
9 |
+
3. Organize the information in a structured way
|
10 |
+
4. If text appears in another language in the document, translate it to [LANGUAGE]
|
11 |
+
|
12 |
+
RESPONSE FORMAT:
|
13 |
+
- Start directly with your description
|
14 |
+
- Write EVERYTHING in [LANGUAGE] only
|
15 |
+
- Be thorough but clear
|
16 |
+
|
17 |
+
Start your response now in [LANGUAGE]:
|
18 |
+
|
19 |
+
As additional context, here is a preliminary text extraction from the page:
|
20 |
+
```markdown
|
21 |
+
[MARKDOWN_CONTEXT]
|
22 |
+
```
|
23 |
+
|
24 |
+
This page is part of a document with the following summary:
|
25 |
+
[SUMMARY_CONTEXT]
|
26 |
+
Start your response directly with the description for page [PAGE_NUM] in [LANGUAGE]:
|
prompts/vlm_prompt_with_markdown.md
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Describe the content of this page for a visually impaired person.
|
2 |
+
This is page [PAGE_NUM] of [TOTAL_PAGES].
|
3 |
+
The description must be in [LANGUAGE].
|
4 |
+
Focus on describing visual elements (images, layout, charts, tables) and the text content in a structured way. All text content must be in the description too.
|
5 |
+
|
6 |
+
As additional context, here is a preliminary text extraction from the page:
|
7 |
+
```markdown
|
8 |
+
[MARKDOWN_CONTEXT]
|
9 |
+
```
|
10 |
+
Start your response directly with the description for page [PAGE_NUM]:
|
prompts/vlm_prompt_with_summary.md
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
IMPORTANT: THE ENTIRE RESPONSE MUST BE WRITTEN IN [LANGUAGE]. DO NOT USE ANY OTHER LANGUAGE.
|
2 |
+
|
3 |
+
Describe the content of this page for a visually impaired person.
|
4 |
+
This is page [PAGE_NUM] of [TOTAL_PAGES].
|
5 |
+
|
6 |
+
YOUR TASK:
|
7 |
+
1. Describe all visual elements (images, layout, charts, tables) in [LANGUAGE]
|
8 |
+
2. Include all text content in [LANGUAGE]
|
9 |
+
3. Organize the information in a structured way
|
10 |
+
4. If text appears in another language in the document, translate it to [LANGUAGE]
|
11 |
+
|
12 |
+
RESPONSE FORMAT:
|
13 |
+
- Start directly with your description
|
14 |
+
- Write EVERYTHING in [LANGUAGE] only
|
15 |
+
- Be thorough but clear
|
16 |
+
|
17 |
+
This page is part of a document with the following summary:
|
18 |
+
[SUMMARY_CONTEXT]
|
19 |
+
|
20 |
+
Start your response directly with the description for page [PAGE_NUM] in [LANGUAGE]:
|
pytest.ini
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[pytest]
|
2 |
+
testpaths = tests
|
3 |
+
python_files = test_*.py
|
4 |
+
python_classes = Test*
|
5 |
+
python_functions = test_*
|
6 |
+
|
7 |
+
markers =
|
8 |
+
unit: marks tests as unit tests
|
9 |
+
integration: marks tests as integration tests
|
10 |
+
|
11 |
+
# Configuration for pytest-cov
|
12 |
+
addopts = --cov=describepdf --cov-report=term-missing
|
13 |
+
|
14 |
+
# Logging configuration
|
15 |
+
log_cli = true
|
16 |
+
log_cli_level = INFO
|
17 |
+
log_cli_format = %(asctime)s - %(levelname)s - [%(module)s] - %(message)s
|
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio>=5.20.1
|
2 |
+
pymupdf>=1.24.10
|
3 |
+
requests>=2.32.3
|
4 |
+
python-dotenv>=1.1.0
|
5 |
+
markitdown[pdf]>=0.1.1
|
6 |
+
pillow>=10.4.0
|
7 |
+
ollama>=0.4.7
|
8 |
+
tqdm>=4.67.0
|
setup.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from setuptools import setup, find_packages
|
2 |
+
|
3 |
+
# Read requirements from requirements.txt
|
4 |
+
with open('requirements.txt') as f:
|
5 |
+
requirements = f.read().splitlines()
|
6 |
+
|
7 |
+
setup(
|
8 |
+
name="describepdf",
|
9 |
+
version="0.1.0",
|
10 |
+
description="Convert PDFs to detailed Markdown descriptions using Vision-Language Models",
|
11 |
+
author="David Romero",
|
12 |
+
packages=find_packages(),
|
13 |
+
include_package_data=True,
|
14 |
+
install_requires=requirements,
|
15 |
+
entry_points={
|
16 |
+
'console_scripts': [
|
17 |
+
'describepdf=describepdf.cli:run_cli',
|
18 |
+
'describepdf-web=describepdf.ui:launch_app',
|
19 |
+
'describepdf-web-ollama=describepdf.ui_ollama:launch_app',
|
20 |
+
],
|
21 |
+
},
|
22 |
+
python_requires='>=3.8',
|
23 |
+
)
|