""" Utility functions for Smart Auto-Complete Provides common functionality for text processing, logging, and validation """ import html import logging import re import sys import unicodedata from typing import Dict, List, Optional, Tuple def setup_logging(level: str = "INFO") -> logging.Logger: """ Set up logging configuration for the application Args: level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL) Returns: Configured logger instance """ # Create logger logger = logging.getLogger("smart_autocomplete") logger.setLevel(getattr(logging, level.upper())) # Remove existing handlers to avoid duplicates for handler in logger.handlers[:]: logger.removeHandler(handler) # Create console handler with formatting console_handler = logging.StreamHandler(sys.stdout) console_handler.setLevel(getattr(logging, level.upper())) # Create formatter formatter = logging.Formatter( "%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) console_handler.setFormatter(formatter) # Add handler to logger logger.addHandler(console_handler) return logger def sanitize_input(text: str) -> str: """ Sanitize and clean input text for processing Args: text: Raw input text Returns: Cleaned and sanitized text """ if not text: return "" # Convert to string if not already text = str(text) # HTML escape to prevent injection text = html.escape(text) # Normalize unicode characters text = unicodedata.normalize("NFKC", text) # Remove excessive whitespace but preserve structure text = re.sub(r"\n\s*\n\s*\n", "\n\n", text) # Max 2 consecutive newlines text = re.sub(r"[ \t]+", " ", text) # Multiple spaces/tabs to single space # Remove control characters except newlines and tabs text = "".join(char for char in text if ord(char) >= 32 or char in "\n\t") # Trim leading/trailing whitespace text = text.strip() return text def extract_context_hints(text: str) -> Dict[str, any]: """ Extract contextual hints from the input text to improve suggestions Args: text: Input text to analyze Returns: Dictionary containing context hints """ hints = { "length": len(text), "word_count": len(text.split()), "has_greeting": False, "has_signature": False, "has_code_markers": False, "has_questions": False, "tone": "neutral", "language_style": "linkedin", } text_lower = text.lower() # Check for email patterns email_greetings = [ "dear", "hello", "hi", "greetings", "good morning", "good afternoon", ] email_signatures = [ "sincerely", "best regards", "thank you", "yours truly", "kind regards", ] hints["has_greeting"] = any(greeting in text_lower for greeting in email_greetings) hints["has_signature"] = any( signature in text_lower for signature in email_signatures ) # Check for code patterns code_markers = [ "//", "/*", "*/", "#", "def ", "function", "class ", "import ", "from ", ] hints["has_code_markers"] = any(marker in text_lower for marker in code_markers) # Check for questions hints["has_questions"] = "?" 

    # Determine tone
    formal_words = ["please", "kindly", "respectfully", "sincerely", "professional"]
    casual_words = ["hey", "yeah", "cool", "awesome", "thanks"]

    formal_count = sum(1 for word in formal_words if word in text_lower)
    casual_count = sum(1 for word in casual_words if word in text_lower)

    if formal_count > casual_count:
        hints["tone"] = "formal"
    elif casual_count > formal_count:
        hints["tone"] = "casual"

    # Determine language style
    if hints["has_code_markers"]:
        hints["language_style"] = "technical"
    elif hints["has_greeting"] or hints["has_signature"]:
        hints["language_style"] = "business"
    elif any(
        creative in text_lower
        for creative in ["once upon", "story", "character", "plot"]
    ):
        hints["language_style"] = "creative"

    return hints


def validate_api_key(api_key: str, provider: str) -> bool:
    """
    Validate API key format for different providers

    Args:
        api_key: The API key to validate
        provider: The provider name (openai, anthropic)

    Returns:
        True if the key format is valid, False otherwise
    """
    if not api_key or not isinstance(api_key, str):
        return False

    api_key = api_key.strip()

    if provider.lower() == "openai":
        # OpenAI keys start with 'sk-' and are typically 51 characters
        return api_key.startswith("sk-") and len(api_key) >= 40
    elif provider.lower() == "anthropic":
        # Anthropic keys start with 'sk-ant-'
        return api_key.startswith("sk-ant-") and len(api_key) >= 40

    return False


def truncate_text(text: str, max_length: int, preserve_words: bool = True) -> str:
    """
    Truncate text to a maximum length while optionally preserving word boundaries

    Args:
        text: Text to truncate
        max_length: Maximum allowed length
        preserve_words: Whether to preserve word boundaries

    Returns:
        Truncated text
    """
    if len(text) <= max_length:
        return text

    if not preserve_words:
        return text[:max_length].rstrip() + "..."

    # Find the last space before the max_length
    truncated = text[:max_length]
    last_space = truncated.rfind(" ")

    # Only use the word boundary if it's not too far back
    if last_space > max_length * 0.8:
        return text[:last_space].rstrip() + "..."
    else:
        return text[:max_length].rstrip() + "..."


def format_suggestions_for_display(
    suggestions: List[str], max_display_length: int = 100
) -> List[Dict[str, Any]]:
    """
    Format suggestions for display in the UI

    Args:
        suggestions: List of suggestion strings
        max_display_length: Maximum length for display

    Returns:
        List of formatted suggestion dictionaries
    """
    formatted = []

    for i, suggestion in enumerate(suggestions, 1):
        # Clean the suggestion
        clean_suggestion = sanitize_input(suggestion)

        # Create display version (truncated if needed)
        display_text = truncate_text(clean_suggestion, max_display_length)

        formatted.append(
            {
                "id": i,
                "text": clean_suggestion,
                "display_text": display_text,
                "length": len(clean_suggestion),
                "word_count": len(clean_suggestion.split()),
            }
        )

    return formatted


def calculate_text_similarity(text1: str, text2: str) -> float:
    """
    Calculate similarity between two texts using simple word overlap

    Args:
        text1: First text
        text2: Second text

    Returns:
        Similarity score between 0 and 1
    """
    if not text1 or not text2:
        return 0.0

    # Convert to lowercase and split into words
    words1 = set(text1.lower().split())
    words2 = set(text2.lower().split())

    # Calculate Jaccard similarity
    intersection = len(words1.intersection(words2))
    union = len(words1.union(words2))

    return intersection / union if union > 0 else 0.0


def get_text_stats(text: str) -> Dict[str, int]:
    """
    Get basic statistics about the text

    Args:
        text: Text to analyze

    Returns:
        Dictionary with text statistics
    """
    if not text:
        return {"characters": 0, "words": 0, "sentences": 0, "paragraphs": 0}

    # Count characters (excluding whitespace)
    char_count = len(text.replace(" ", "").replace("\n", "").replace("\t", ""))

    # Count words
    word_count = len(text.split())

    # Count sentences (rough estimate)
    sentence_count = len(re.findall(r"[.!?]+", text))

    # Count paragraphs
    paragraph_count = len([p for p in text.split("\n\n") if p.strip()])

    return {
        "characters": char_count,
        "words": word_count,
        "sentences": max(1, sentence_count),  # At least 1 sentence
        "paragraphs": max(1, paragraph_count),  # At least 1 paragraph
    }
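

# Illustrative usage sketch (not part of the original module): shows how the
# helpers above might be combined in practice. The sample text, log level, and
# suggestion strings are assumptions chosen purely for demonstration.
if __name__ == "__main__":
    demo_logger = setup_logging("DEBUG")

    raw_text = "Hello team,\n\n\n\nPlease   review the attached   draft.\n\nBest regards,\nSam"
    cleaned = sanitize_input(raw_text)
    demo_logger.info("Sanitized text: %r", cleaned)

    # Context hints and basic statistics for the cleaned text
    demo_logger.info("Context hints: %s", extract_context_hints(cleaned))
    demo_logger.info("Text stats: %s", get_text_stats(cleaned))

    # Prepare a couple of suggestions for UI display
    formatted = format_suggestions_for_display(
        ["Thank you for your time.", "Looking forward to your feedback."]
    )
    demo_logger.info("Formatted suggestions: %s", formatted)

    # Word-overlap (Jaccard) similarity between two short phrases
    demo_logger.info(
        "Similarity: %.2f",
        calculate_text_similarity("review the draft", "please review this draft"),
    )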