import re
import html
import json

# Matches exactly the backslash escapes that clean_text() decodes. Using one
# regex pass (instead of chained str.replace calls) means an escaped
# backslash is consumed before the character after it is looked at.
_ESCAPE_RE = re.compile(r"""\\(["'nt\\])""")
_ESCAPE_CHARS = {'"': '"', "'": "'", "n": "\n", "t": "\t", "\\": "\\"}

# Typographic quotes, backticks and acute accents -> plain ASCII quotes.
# Written as escapes so the table survives any re-encoding of this file.
_QUOTE_TABLE = str.maketrans({
    "\u201c": '"',   # left double quotation mark
    "\u201d": '"',   # right double quotation mark
    "\u2018": "'",   # left single quotation mark
    "\u2019": "'",   # right single quotation mark
    "`": "'",        # backtick
    "\u00b4": "'",   # acute accent
})


def clean_text(text):
    """Clean text with common issues like HTML entities and escaped quotes.

    The cleanup runs in this order:

    1. HTML character references are decoded with ``html.unescape``. That
       function follows the HTML5 rules, so references that are missing
       their trailing semicolon (``&amp``, ``&quot``, ``&#39`` ...) are
       decoded as well, and longer references that merely start with those
       letters or digits (``&ltimes;``, ``&#390;``) are decoded as
       themselves.
    2. The literal two-character escapes ``\\"``, ``\\'``, ``\\n``, ``\\t``
       and ``\\\\`` are decoded in a single left-to-right pass. Any other
       backslash sequence is left untouched.
    3. Curly quotes, backticks and acute accents are replaced with plain
       ``"`` / ``'``.
    4. If the text, ignoring trailing whitespace, ends with backslashes,
       that trailing whitespace and those backslashes are removed.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string. Falsy values and non-string values are returned
        unchanged.
    """
    if not text or not isinstance(text, str):
        return text

    try:
        text = html.unescape(text)
    except ValueError:
        # Best effort: a pathological numeric reference must not abort the
        # rest of the cleanup, so the entities are simply left undecoded.
        pass

    text = _ESCAPE_RE.sub(lambda match: _ESCAPE_CHARS[match.group(1)], text)
    text = text.translate(_QUOTE_TABLE)

    # Drop dangling backslashes (and the whitespace after them) at the end.
    stripped = text.rstrip()
    if stripped.endswith("\\"):
        text = stripped.rstrip("\\")

    return text
{insufficient_reason}
" if insufficient_reason else "The context may not contain enough information to fully answer the question, or the question might be ambiguous. Models should ideally indicate this limitation or refuse to answer.
" html_output += f"""