"""PDF-to-Markdown conversion built on marker, with a shared pre-loaded converter.

Call initialize_converter() once at application startup; convert_pdf() then
reuses the loaded models for every subsequent conversion.
"""

import gc
import os
import sys

import torch

import marker
from marker.config.parser import ConfigParser
from marker.models import create_model_dict

# Module-level singleton holding the pre-loaded converter.
# Stays None until initialize_converter() completes successfully.
_converter = None


def initialize_converter():
    """Initialize the marker converter models once and store them globally.

    Idempotent: if the converter is already loaded this is a no-op. On
    failure the global converter is reset to None, CUDA memory is released,
    and the original exception is re-raised to the caller.

    Raises:
        Exception: whatever model loading or converter construction raised.
    """
    global _converter
    if _converter is None:
        print("Initializing marker models...")
        try:
            # Clear any existing CUDA cache before loading models.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                gc.collect()
                print(f"CUDA memory before initialization: {torch.cuda.memory_allocated()/1024**2:.2f} MB allocated, {torch.cuda.memory_reserved()/1024**2:.2f} MB reserved")

            # Set custom font path from environment variable if available.
            font_path = os.environ.get('MARKER_FONT_PATH')
            if font_path:
                try:
                    # Import marker settings and override font path.
                    # NOTE(review): this only points marker at
                    # <font_path>/NotoSans-Regular.ttf — the font file itself
                    # is never created here; confirm it is provisioned
                    # externally (e.g. baked into the image or downloaded).
                    from marker import settings
                    os.makedirs(font_path, exist_ok=True)
                    custom_font_path = os.path.join(font_path, 'NotoSans-Regular.ttf')
                    settings.FONT_PATH = custom_font_path
                    print(f"Using custom font path: {custom_font_path}")
                except ImportError:
                    print("Could not import marker settings, using default font path")
                except Exception as e:
                    print(f"Error setting custom font path: {e}", file=sys.stderr)

            # Create configuration, explicitly setting output format and
            # batch multiplier.
            config_parser = ConfigParser({
                'output_format': 'markdown',
                'batch_multiplier': 4,  # Increased from default 2
                # Add any device-specific configuration here.
                'device': 'cuda' if torch.cuda.is_available() else 'cpu',
            })

            # Load models with explicit device mapping.
            models = create_model_dict()

            # Get converter class and create converter.
            converter_cls = config_parser.get_converter_cls()
            _converter = converter_cls(
                config=config_parser.generate_config_dict(),
                artifact_dict=models,
                processor_list=config_parser.get_processors(),
                renderer=config_parser.get_renderer(),
                llm_service=config_parser.get_llm_service(),
            )

            # Force another garbage collection after model load.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                gc.collect()
                print(f"CUDA memory after initialization: {torch.cuda.memory_allocated()/1024**2:.2f} MB allocated, {torch.cuda.memory_reserved()/1024**2:.2f} MB reserved")

            print("Marker models initialized successfully with batch_multiplier=4.")
        except Exception as e:
            print(f"Failed to initialize marker models: {e}", file=sys.stderr)
            _converter = None  # Ensure it's None if init fails
            # Attempt to clean up GPU memory in case of initialization failure.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                gc.collect()
            raise
    else:
        print("Marker models already initialized.")


def convert_pdf(pdf_input_path, output_md_path=None):
    """
    Convert PDF file to Markdown using the pre-loaded marker converter.

    Args:
        pdf_input_path (str): Path to the input PDF file
        output_md_path (str, optional): Path where to save the output Markdown
            file. If None, markdown is only returned.

    Returns:
        str: The markdown text

    Raises:
        FileNotFoundError: if pdf_input_path does not exist.
        RuntimeError: if initialize_converter() has not been run successfully.
        Exception: any error raised by the underlying converter (re-raised
            after a CUDA cache cleanup attempt).
    """
    # Check if the input PDF exists.
    if not os.path.exists(pdf_input_path):
        raise FileNotFoundError(f"Input PDF file not found at '{pdf_input_path}'")

    # Check if converter is initialized.
    if _converter is None:
        raise RuntimeError("Marker converter has not been initialized. Call initialize_converter() during application startup.")

    print(f"Starting conversion of '{pdf_input_path}' using pre-loaded models...")
    try:
        # Free up any temporary memory before conversion.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Convert the PDF to markdown using the pre-loaded converter.
        result = _converter(pdf_input_path)

        # Access the markdown content directly from the result object.
        # NOTE(review): assumes the markdown renderer returns an object
        # exposing a `.markdown` attribute — confirm against the installed
        # marker version.
        markdown_text = result.markdown

        # If output path is provided, save the markdown.
        if output_md_path:
            output_dir = os.path.dirname(output_md_path)
            if output_dir:
                # exist_ok=True already tolerates an existing directory, so
                # no separate os.path.exists() check is needed.
                os.makedirs(output_dir, exist_ok=True)
            with open(output_md_path, "w", encoding="utf-8") as f:
                f.write(markdown_text)
            print(f"Successfully saved markdown to '{output_md_path}'")

        # Clean up temporary GPU memory after conversion.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return markdown_text
    except Exception as e:
        print(f"An error occurred during conversion: {e}", file=sys.stderr)
        print(f"Error details: {str(type(e))}", file=sys.stderr)
        # Try to clean up GPU memory on error.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        raise