""" Configuration settings for Data Extractor Using Gemini Optimized for Gemini-only model usage with robust directory management """ import os from pathlib import Path from dotenv import load_dotenv import logging # Load environment variables load_dotenv() logger = logging.getLogger(__name__) class Settings: """Configuration settings with Gemini-only model support and robust directory management.""" # === GEMINI MODEL CONFIGURATION === GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY") # Gemini model specifications - using gemini-2.5-flash (supports thinking budget) DATA_EXTRACTOR_MODEL = os.getenv("DATA_EXTRACTOR_MODEL", "gemini-2.5-pro") DATA_ARRANGER_MODEL = os.getenv("DATA_ARRANGER_MODEL", "gemini-2.5-pro") CODE_GENERATOR_MODEL = os.getenv("CODE_GENERATOR_MODEL", "gemini-2.5-flash") # Thinking budgets optimized for each task type DATA_EXTRACTOR_MODEL_THINKING_BUDGET = int(os.getenv("DATA_EXTRACTOR_THINKING_BUDGET", "4096")) DATA_ARRANGER_MODEL_THINKING_BUDGET = int(os.getenv("DATA_ARRANGER_THINKING_BUDGET", "4096")) CODE_GENERATOR_MODEL_THINKING_BUDGET = int(os.getenv("CODE_GENERATOR_THINKING_BUDGET", "4096")) # === FILE PROCESSING CONFIGURATION === MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "50")) SUPPORTED_FILE_TYPES = [ "pdf", "txt", "docx", "xlsx", "csv", "md", "json", "xml", "html", "png", "jpg", "jpeg", "doc", "xls", "ppt", "pptx" ] # === DIRECTORY MANAGEMENT === # Centralized working directory - all operations happen within this directory WORKING_DIR = Path(os.getenv("WORKING_DIR", "/tmp/data_extractor_gemini")) # Subdirectories within working directory TEMP_DIR = WORKING_DIR / "temp" INPUT_DIR = WORKING_DIR / "input" OUTPUT_DIR = WORKING_DIR / "output" CACHE_DIR = WORKING_DIR / "cache" LOGS_DIR = WORKING_DIR / "logs" # === WORKFLOW CONFIGURATION === # Retry and timeout settings MAX_RETRIES = int(os.getenv("MAX_RETRIES", "3")) RETRY_DELAY_SECONDS = int(os.getenv("RETRY_DELAY_SECONDS", "5")) AGENT_TIMEOUT_SECONDS = int(os.getenv("AGENT_TIMEOUT_SECONDS", "300")) # Cache settings ENABLE_CACHING = os.getenv("ENABLE_CACHING", "true").lower() == "true" CACHE_TTL_HOURS = int(os.getenv("CACHE_TTL_HOURS", "24")) @classmethod def initialize_directories(cls): """Initialize all required directories with proper permissions.""" directories = [ cls.WORKING_DIR, cls.TEMP_DIR, cls.INPUT_DIR, cls.OUTPUT_DIR, cls.CACHE_DIR, cls.LOGS_DIR ] created_dirs = [] for directory in directories: try: directory.mkdir(parents=True, exist_ok=True) # Test write permissions test_file = directory / ".write_test" test_file.write_text("test") test_file.unlink() created_dirs.append(str(directory)) logger.debug(f"Directory initialized: {directory}") except Exception as e: logger.error(f"Failed to initialize directory {directory}: {e}") raise RuntimeError(f"Cannot create or write to directory {directory}: {e}") logger.info(f"Successfully initialized {len(created_dirs)} directories") return created_dirs @classmethod def validate_config(cls): """Comprehensive configuration validation with detailed error reporting.""" errors = [] warnings = [] # === CRITICAL VALIDATIONS === # Google API Key validation if not cls.GOOGLE_API_KEY: errors.append("GOOGLE_API_KEY is required. 
class Settings:
    """Configuration settings with Gemini-only model support and robust directory management."""

    # === GEMINI MODEL CONFIGURATION ===
    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

    # Gemini model specifications - all of these models accept a thinking budget
    DATA_EXTRACTOR_MODEL = os.getenv("DATA_EXTRACTOR_MODEL", "gemini-2.5-pro")
    DATA_ARRANGER_MODEL = os.getenv("DATA_ARRANGER_MODEL", "gemini-2.5-pro")
    CODE_GENERATOR_MODEL = os.getenv("CODE_GENERATOR_MODEL", "gemini-2.5-flash")

    # Thinking budgets optimized for each task type
    DATA_EXTRACTOR_MODEL_THINKING_BUDGET = int(os.getenv("DATA_EXTRACTOR_THINKING_BUDGET", "4096"))
    DATA_ARRANGER_MODEL_THINKING_BUDGET = int(os.getenv("DATA_ARRANGER_THINKING_BUDGET", "4096"))
    CODE_GENERATOR_MODEL_THINKING_BUDGET = int(os.getenv("CODE_GENERATOR_THINKING_BUDGET", "4096"))

    # === FILE PROCESSING CONFIGURATION ===
    MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "50"))
    SUPPORTED_FILE_TYPES = [
        "pdf", "txt", "docx", "xlsx", "csv", "md", "json", "xml", "html",
        "png", "jpg", "jpeg", "doc", "xls", "ppt", "pptx",
    ]

    # === DIRECTORY MANAGEMENT ===
    # Centralized working directory - all operations happen within this directory
    WORKING_DIR = Path(os.getenv("WORKING_DIR", "/tmp/data_extractor_gemini"))

    # Subdirectories within the working directory
    TEMP_DIR = WORKING_DIR / "temp"
    INPUT_DIR = WORKING_DIR / "input"
    OUTPUT_DIR = WORKING_DIR / "output"
    CACHE_DIR = WORKING_DIR / "cache"
    LOGS_DIR = WORKING_DIR / "logs"

    # === WORKFLOW CONFIGURATION ===
    # Retry and timeout settings
    MAX_RETRIES = int(os.getenv("MAX_RETRIES", "3"))
    RETRY_DELAY_SECONDS = int(os.getenv("RETRY_DELAY_SECONDS", "5"))
    AGENT_TIMEOUT_SECONDS = int(os.getenv("AGENT_TIMEOUT_SECONDS", "300"))

    # Cache settings
    ENABLE_CACHING = os.getenv("ENABLE_CACHING", "true").lower() == "true"
    CACHE_TTL_HOURS = int(os.getenv("CACHE_TTL_HOURS", "24"))

    @classmethod
    def initialize_directories(cls):
        """Initialize all required directories and verify write permissions."""
        directories = [
            cls.WORKING_DIR,
            cls.TEMP_DIR,
            cls.INPUT_DIR,
            cls.OUTPUT_DIR,
            cls.CACHE_DIR,
            cls.LOGS_DIR,
        ]

        created_dirs = []
        for directory in directories:
            try:
                directory.mkdir(parents=True, exist_ok=True)
                # Test write permissions
                test_file = directory / ".write_test"
                test_file.write_text("test")
                test_file.unlink()
                created_dirs.append(str(directory))
                logger.debug(f"Directory initialized: {directory}")
            except Exception as e:
                logger.error(f"Failed to initialize directory {directory}: {e}")
                raise RuntimeError(f"Cannot create or write to directory {directory}: {e}")

        logger.info(f"Successfully initialized {len(created_dirs)} directories")
        return created_dirs

    @classmethod
    def validate_config(cls):
        """Comprehensive configuration validation with detailed error reporting."""
        errors = []
        warnings = []

        # === CRITICAL VALIDATIONS ===

        # Google API key validation
        if not cls.GOOGLE_API_KEY:
            errors.append("GOOGLE_API_KEY is required. Get it from https://aistudio.google.com/app/apikey")
        elif len(cls.GOOGLE_API_KEY) < 30:
            warnings.append("GOOGLE_API_KEY appears to be too short - verify it's correct")

        # Model name validation
        model_settings = [
            ("DATA_EXTRACTOR_MODEL", cls.DATA_EXTRACTOR_MODEL),
            ("DATA_ARRANGER_MODEL", cls.DATA_ARRANGER_MODEL),
            ("CODE_GENERATOR_MODEL", cls.CODE_GENERATOR_MODEL),
        ]
        for name, model in model_settings:
            if not model:
                errors.append(f"{name} cannot be empty")
            elif not model.startswith("gemini-"):
                errors.append(f"{name} must be a Gemini model (starting with 'gemini-'), got: {model}")

        # Directory validation
        try:
            cls.initialize_directories()
        except Exception as e:
            errors.append(f"Directory initialization failed: {e}")

        # === MODERATE VALIDATIONS ===

        # File size validation
        if cls.MAX_FILE_SIZE_MB <= 0:
            errors.append("MAX_FILE_SIZE_MB must be positive")
        elif cls.MAX_FILE_SIZE_MB > 100:
            warnings.append(f"MAX_FILE_SIZE_MB ({cls.MAX_FILE_SIZE_MB}) is very large - may cause memory issues")

        # Supported file types validation
        if not cls.SUPPORTED_FILE_TYPES:
            errors.append("SUPPORTED_FILE_TYPES cannot be empty")

        # Thinking budget validation
        budgets = [
            (cls.DATA_EXTRACTOR_MODEL_THINKING_BUDGET, "DATA_EXTRACTOR_MODEL_THINKING_BUDGET"),
            (cls.DATA_ARRANGER_MODEL_THINKING_BUDGET, "DATA_ARRANGER_MODEL_THINKING_BUDGET"),
            (cls.CODE_GENERATOR_MODEL_THINKING_BUDGET, "CODE_GENERATOR_MODEL_THINKING_BUDGET"),
        ]
        for budget, name in budgets:
            if budget < 1024:
                warnings.append(f"{name} ({budget}) is quite low - may affect model performance")
            elif budget > 8192:
                warnings.append(f"{name} ({budget}) is very high - may be unnecessary")

        # Retry configuration validation
        if cls.MAX_RETRIES < 1:
            warnings.append("MAX_RETRIES should be at least 1")
        elif cls.MAX_RETRIES > 10:
            warnings.append("MAX_RETRIES is very high - may cause long delays")

        # === RESULT PROCESSING ===
        if errors:
            error_msg = "❌ Configuration validation failed:\n"
            error_msg += "\n".join(f"  • {error}" for error in errors)
            if warnings:
                error_msg += "\n\n⚠️ Warnings:\n"
                error_msg += "\n".join(f"  • {warning}" for warning in warnings)
            raise ValueError(error_msg)

        if warnings:
            logger.warning("Configuration warnings detected:")
            for warning in warnings:
                logger.warning(f"  • {warning}")

        logger.info("✅ Configuration validation successful")
        return True

    @classmethod
    def get_session_directories(cls, session_id: str):
        """Get the session-specific directory structure."""
        session_base = cls.WORKING_DIR / session_id
        return {
            "base": session_base,
            "input": session_base / "input",
            "output": session_base / "output",
            "temp": session_base / "temp",
            "cache": session_base / "cache",
        }

    @classmethod
    def create_session_directories(cls, session_id: str):
        """Create and validate session-specific directories."""
        session_dirs = cls.get_session_directories(session_id)
        created = []

        for name, directory in session_dirs.items():
            try:
                directory.mkdir(parents=True, exist_ok=True)
                # Test write permissions
                test_file = directory / ".write_test"
                test_file.write_text("test")
                test_file.unlink()
                created.append(str(directory))
            except Exception as e:
                logger.error(f"Failed to create session directory {name}: {e}")
                raise RuntimeError(f"Cannot create session directory {directory}: {e}")

        logger.info(f"Created {len(created)} session directories for {session_id}")
        return session_dirs

    @classmethod
    def cleanup_session(cls, session_id: str, keep_output: bool = True):
        """Clean up session directories with an option to preserve output."""
        session_dirs = cls.get_session_directories(session_id)
        cleaned = []
        for name, directory in session_dirs.items():
            # Skip the base directory whenever output is kept: removing it
            # would also delete the preserved "output" subdirectory.
            if keep_output and name in ("base", "output"):
                continue
            if directory.exists():
                try:
                    shutil.rmtree(directory)
                    cleaned.append(str(directory))
                except Exception as e:
                    logger.warning(f"Could not clean {name} directory: {e}")

        logger.info(f"Cleaned {len(cleaned)} session directories for {session_id}")
        return cleaned

    @classmethod
    def get_debug_info(cls):
        """Get comprehensive debug information about the current configuration."""
        return {
            "python_version": sys.version,
            "platform": platform.platform(),
            "temp_dir": str(cls.TEMP_DIR),
            "temp_dir_exists": cls.TEMP_DIR.exists(),
            "models": {
                "data_extractor": cls.DATA_EXTRACTOR_MODEL,
                "data_arranger": cls.DATA_ARRANGER_MODEL,
                "code_generator": cls.CODE_GENERATOR_MODEL,
            },
            "api_keys": {
                "google_api_key_present": bool(cls.GOOGLE_API_KEY),
                "google_api_key_length": len(cls.GOOGLE_API_KEY) if cls.GOOGLE_API_KEY else 0,
            },
        }


# Global settings instance
settings = Settings()

# Auto-initialize directories on import
try:
    settings.initialize_directories()
    logger.debug("Settings initialized successfully")
except Exception as e:
    logger.error(f"Failed to initialize settings: {e}")
    # Don't raise here to allow import to succeed
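
# Usage sketch: a minimal, illustrative way to exercise this configuration
# from the command line. The session id "demo-session" is a made-up example
# value, and it assumes GOOGLE_API_KEY is set in the environment or .env file.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    Settings.validate_config()
    dirs = Settings.create_session_directories("demo-session")
    print("Session directories:")
    for dir_name, dir_path in dirs.items():
        print(f"  {dir_name}: {dir_path}")
    # Keep the output directory, remove the rest of the session tree
    Settings.cleanup_session("demo-session", keep_output=True)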