SLM-RAG-Arena

Running on Zero

App Files Files Community

SLM-RAG-Arena / utils /context_processor.py

oliver-aizip

update data pipeline

347797e 2 months ago

raw

history blame contribute delete

4.45 kB

	import re
	import html
	import json

	def clean_text(text):
	    """Clean text with common issues like HTML entities and escaped quotes.

	    The steps run in this order:

	    1. Five entities that lost their trailing semicolon (apostrophe, quote,
	       less-than, greater-than, ampersand) are replaced by their characters.
	    2. All remaining HTML entities are decoded with ``html.unescape``.
	    3. Backslash escapes for a double quote, single quote, newline, tab and
	       backslash are decoded in a single left-to-right pass.
	    4. Typographic quotes, the backtick and the acute accent are normalized
	       to plain ASCII quotes.
	    5. If the text (ignoring trailing whitespace) ends with a backslash, the
	       trailing whitespace and every trailing backslash are removed.

	    Args:
	        text: The value to clean.

	    Returns:
	        The cleaned string. Falsy values and non-string values are returned
	        unchanged.
	    """
	    if not text or not isinstance(text, str):
	        return text

	    # Fix incomplete HTML entities (entity name not followed by ';').
	    incomplete_entities = {
	        "&#x27": "'",
	        "&quot": '"',
	        "&lt": "<",
	        "&gt": ">",
	        "&amp": "&",
	    }
	    for incomplete, complete in incomplete_entities.items():
	        text = re.sub(f"{re.escape(incomplete)}(?!;)", complete, text)

	    # Convert HTML entities to characters. `text` is guaranteed to be a str
	    # here, so no exception guard is needed around the call.
	    text = html.unescape(text)

	    # Decode backslash escapes in ONE pass. Chained str.replace calls would
	    # see an escaped backslash followed by 'n' as a newline escape and emit
	    # a backslash plus a real newline; a single scan consumes the escaped
	    # backslash first and leaves the 'n' alone.
	    escapes = {'"': '"', "'": "'", "n": "\n", "t": "\t", "\\": "\\"}
	    text = re.sub(r"\\([\"'nt\\])", lambda m: escapes[m.group(1)], text)

	    # Also normalize fancy quotes.
	    # NOTE(review): the curly-quote code points below are reconstructed; the
	    # copy of the file under review showed them flattened to ASCII quotes.
	    # Confirm against the repository version.
	    fancy_quotes = str.maketrans({
	        "\u201c": '"',  # left double quotation mark
	        "\u201d": '"',  # right double quotation mark
	        "\u2018": "'",  # left single quotation mark
	        "\u2019": "'",  # right single quotation mark
	        "`": "'",
	        "\u00b4": "'",  # acute accent
	    })
	    text = text.translate(fancy_quotes)

	    # Remove trailing backslash if present (along with trailing whitespace).
	    stripped = text.rstrip()
	    if stripped.endswith("\\"):
	        text = stripped.rstrip("\\")

	    return text

	def get_context_html(example, show_full=False):
	    """Format context chunks into HTML for display.

	    Args:
	        example: Mapping describing one example. Keys read here:
	            ``insufficient`` (truthy adds a warning box),
	            ``insufficient_reason`` (optional text for that box),
	            ``full_contexts`` (items shown when ``show_full`` is true) and
	            ``contexts`` (items shown otherwise). Items may be dicts with a
	            ``content`` key (and, for ``contexts``, ``is_primary``) or
	            plain strings.
	        show_full: When true, render ``full_contexts`` with HTML escaping;
	            otherwise render ``contexts`` verbatim.

	    Returns:
	        An HTML fragment: the optional warning box followed by a
	        ``context-items-container`` div holding one ``context-item`` div per
	        rendered item.
	    """
	    parts = []

	    # Process insufficient context warning if needed.
	    if example.get("insufficient", False):
	        insufficient_reason = example.get("insufficient_reason", "")
	        # NOTE(review): the reason is inserted without escaping; this
	        # assumes it is trusted or pre-sanitized -- confirm with the caller.
	        reason_html = f"<p>{insufficient_reason}</p>" if insufficient_reason else "<p>The context may not contain enough information to fully answer the question, or the question might be ambiguous. Models should ideally indicate this limitation or refuse to answer.</p>"

	        parts.append(f"""
	<div class="insufficient-alert">
	<strong>
	<svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" style="vertical-align: middle; margin-right: 5px;">
	<path d="m21.73 18-8-14a2 2 0 0 0-3.48 0l-8 14A2 2 0 0 0 4 21h16a2 2 0 0 0 1.73-3Z"></path>
	<line x1="12" y1="9" x2="12" y2="13"></line>
	<line x1="12" y1="17" x2="12.01" y2="17"></line>
	</svg>
	Insufficient Context
	</strong>
	{reason_html}
	</div>
	""")

	    parts.append('<div class="context-items-container">')

	    # Display full contexts or highlighted contexts based on toggle.
	    if show_full:
	        # Full view: every item is HTML-escaped so it shows up as plain text.
	        for context_item in example.get("full_contexts") or []:
	            if isinstance(context_item, dict) and "content" in context_item:
	                # Coerce so a None / non-string content cannot crash
	                # html.escape, which only accepts strings.
	                content = _content_text(context_item["content"])
	            elif isinstance(context_item, str):
	                content = context_item
	            else:
	                content = str(context_item)

	            # Create the context item box - no headers.
	            parts.append(f'<div class="context-item">{html.escape(content)}</div>')
	    else:
	        # Highlighted view: content is emitted verbatim because it is
	        # expected to carry its own highlighting markup.
	        contexts = example.get("contexts")
	        if contexts:
	            for context_item in contexts:
	                if isinstance(context_item, dict):
	                    content = _content_text(context_item.get("content", ""))
	                    # Extra class for primary context styling.
	                    extra_class = " primary-context" if context_item.get("is_primary", False) else ""
	                    parts.append(f'<div class="context-item{extra_class}">{content}</div>')
	                elif isinstance(context_item, str):
	                    # For direct string contexts.
	                    parts.append(f'<div class="context-item">{context_item}</div>')
	        else:
	            # NOTE(review): this fallback is attached to the "no contexts"
	            # case based on its wording; the reviewed copy had lost its
	            # indentation, so confirm against the repository version.
	            parts.append('<div class="context-item">No context available. Try toggling to full context view.</div>')

	    parts.append('</div>')

	    return "".join(parts)


	def _content_text(value):
	    """Return a context's content as a string, mapping None to an empty string."""
	    if value is None:
	        return ""
	    return value if isinstance(value, str) else str(value)