import re
import html
import json


# Entity names that may appear without their terminating ';'.
# They are stored as bare names and assembled at run time so this table
# survives tools that decode HTML entities found inside source text.
# NOTE(review): the first entry is assumed to be the numeric apostrophe
# reference ("#39"); confirm against the upstream file.
_INCOMPLETE_ENTITY_NAMES = ("#39", "quot", "lt", "gt", "amp")

# Applied strictly in this order with plain substring replacement.
# NOTE(review): because "\n" is handled before "\\", an escaped backslash
# followed by "n" becomes a backslash plus a newline. That matches the
# sequential behaviour of the original table and is kept as-is.
_TEXT_REPLACEMENTS = (
    # Backslash-escaped characters left over from serialisation.
    (r'\"', '"'),
    (r"\'", "'"),
    (r"\n", "\n"),
    (r"\t", "\t"),
    (r"\\", "\\"),
    # Typographic quotes, written as escapes so they cannot be flattened
    # into their ASCII look-alikes by text tooling.
    ("\u201c", '"'),   # left double quotation mark
    ("\u201d", '"'),   # right double quotation mark
    ("\u2018", "'"),   # left single quotation mark
    ("\u2019", "'"),   # right single quotation mark
    ("`", "'"),        # backtick
    ("\u00b4", "'"),   # acute accent
)

_DEFAULT_INSUFFICIENT_TEXT = (
    "The context may not contain enough information to fully answer the "
    "question, or the question might be ambiguous. Models should ideally "
    "indicate this limitation or refuse to answer."
)


def clean_text(text):
    """Clean text with common issues like HTML entities and escaped quotes.

    Steps, in order:
      1. Append the missing ';' to a small set of known entities.
      2. Decode HTML entities with ``html.unescape``.
      3. Replace backslash escapes and typographic quotes with plain
         characters.
      4. If the text (ignoring trailing whitespace) ends with a backslash,
         drop the trailing whitespace and the trailing backslashes.

    Args:
        text: Value to clean.

    Returns:
        The cleaned string. Falsy values and non-string values are returned
        unchanged.
    """
    if not text or not isinstance(text, str):
        return text

    # Fix incomplete HTML entities (only when not already followed by ';').
    for name in _INCOMPLETE_ENTITY_NAMES:
        text = re.sub(f"&{re.escape(name)}(?!;)", f"&{name};", text)

    # Convert HTML entities to characters. No try/except is needed:
    # html.unescape does not raise for str input, which the guard above
    # guarantees.
    text = html.unescape(text)

    # Handle escaped quotes and other special characters.
    for pattern, replacement in _TEXT_REPLACEMENTS:
        text = text.replace(pattern, replacement)

    # Remove trailing backslash if present.
    stripped = text.rstrip()
    if stripped.endswith("\\"):
        text = stripped.rstrip("\\")

    return text


def get_context_html(example, show_full=False):
    """Format context chunks into HTML for display.

    NOTE(review): apart from the ``primary-context`` class, the wrapper
    tags and class names below were reconstructed after the original markup
    was lost; confirm them against the stylesheet that consumes this HTML.

    Args:
        example: Mapping read with ``.get``/``in``. Keys used here:
            ``insufficient``, ``insufficient_reason``, ``full_contexts``
            and ``contexts``.
        show_full: When true, render ``full_contexts`` with their content
            HTML-escaped. Otherwise render ``contexts``, whose content is
            inserted verbatim.

    Returns:
        A single HTML string.
    """
    parts = []

    # Process insufficient context warning if needed.
    if example.get("insufficient", False):
        insufficient_reason = example.get("insufficient_reason", "")
        # NOTE(review): the reason is interpolated without escaping, as in
        # the original; escape it if it can come from untrusted input.
        if insufficient_reason:
            reason_html = f"<p>{insufficient_reason}</p>"
        else:
            reason_html = f"<p>{_DEFAULT_INSUFFICIENT_TEXT}</p>"

        parts.append(f"""
        <div class="insufficient-alert">
            <strong>Insufficient Context</strong>
            {reason_html}
        </div>
        """)

    parts.append('<div class="context-items-container">')

    # Display full contexts or highlighted contexts based on toggle.
    if show_full:
        # Show full context - use the entries of full_contexts directly.
        for context_item in example.get("full_contexts") or ():
            if isinstance(context_item, dict) and "content" in context_item:
                content = context_item.get("content", "")
            elif isinstance(context_item, str):
                content = context_item
            else:
                content = str(context_item)

            # Escape HTML entities for safe display.
            escaped_content = html.escape(content)

            # Create the context item box - no headers.
            parts.append(f'<div class="context-item">{escaped_content}</div>')
    else:
        # Show highlighted contexts.
        contexts = example.get("contexts")
        if contexts:
            for context_item in contexts:
                if isinstance(context_item, dict):
                    content = context_item.get("content", "")
                    is_primary = context_item.get("is_primary", False)

                    # Extra class for primary context styling.
                    extra_class = " primary-context" if is_primary else ""

                    # Content is used directly: per the original comment it
                    # already carries HTML highlighting.
                    parts.append(
                        f'<div class="context-item{extra_class}">{content}</div>'
                    )
                elif isinstance(context_item, str):
                    # For direct string contexts (also inserted verbatim).
                    parts.append(f'<div class="context-item">{context_item}</div>')
                # Items of any other type are skipped, as before.
        else:
            parts.append(
                '<div class="context-item">'
                "No context available. Try toggling to full context view."
                "</div>"
            )

    parts.append("</div>")

    return "".join(parts)