SLM-RAG-Arena

Running on Zero

App Files Files Community

SLM-RAG-Arena / utils /data_loader.py

oliver-aizip

update data pipeline

347797e 1 day ago

raw

history blame contribute delete

7.64 kB

	import os
	import json
	import pandas as pd
	import random
	import re

	# Module-level cache for the arena DataFrame. It starts out empty and is
	# filled lazily by the first successful load_arena_data() call (nothing is
	# read at import time).
	_ARENA_DATA = None


	def load_arena_data():
	    """
	    Load the arena data from arena_df.csv in the utils directory.

	    The CSV is looked up at 'utils/arena_df.csv' relative to the current
	    working directory first. If that path does not exist, the directory
	    containing this module is tried, so loading does not depend on where
	    the application was started from.

	    Returns:
	        The loaded pandas DataFrame. A successful load is cached in
	        _ARENA_DATA and returned as-is on later calls. If loading fails,
	        the error is printed and an empty DataFrame is returned; failures
	        are not cached, so the next call tries again.
	    """
	    global _ARENA_DATA

	    # Serve from the cache once a load has succeeded
	    if _ARENA_DATA is not None:
	        return _ARENA_DATA

	    try:
	        # Preferred location: relative to the current working directory
	        csv_path = os.path.join('utils', 'arena_df.csv')

	        if not os.path.exists(csv_path):
	            # Fallback: the CSV sitting next to this module
	            module_dir = os.path.dirname(os.path.abspath(__file__))
	            module_csv = os.path.join(module_dir, 'arena_df.csv')
	            if os.path.exists(module_csv):
	                csv_path = module_csv

	        # Read the CSV file
	        df = pd.read_csv(csv_path)
	        print(f"Loaded arena data with {len(df)} examples")

	        # Store the data globally
	        _ARENA_DATA = df
	        return df
	    except Exception as e:
	        # Deliberate best-effort boundary: callers check `.empty` instead of
	        # handling exceptions, so report the problem and hand back no rows.
	        print(f"Error loading arena data: {e}")
	        return pd.DataFrame()

	def create_dummy_example():
	    """
	    Build the placeholder example used when no arena data could be loaded.

	    Returns:
	        A new dict with the same keys that get_random_example() produces
	        for a real row (including "sample_id", which uses that function's
	        default of 0), so code consuming an example can read the same keys
	        in both cases. The question and description carry error messages
	        and both context lists are empty.
	    """
	    return {
	        "question": "Could not load questions from the dataset. Please check the data file.",
	        "processed_context_desc": "Error: Data not available",
	        "contexts": [],
	        "full_contexts": [],
	        "Answerable": False,
	        "insufficient": True,
	        "insufficient_reason": "Data loading error",
	        # Same default get_random_example() uses when a row has no sample_id
	        "sample_id": 0,
	    }

	def _collect_context_texts(items):
	    """Return the text of every recognised entry in a list of contexts."""
	    texts = []
	    for ctx in items:
	        if isinstance(ctx, str):
	            texts.append(ctx)
	        elif isinstance(ctx, dict) and 'content' in ctx:
	            texts.append(ctx['content'])
	        # Entries of any other shape are skipped
	    return texts


	def _parse_full_contexts(raw):
	    """Turn the raw 'contexts' cell (string or list) into a list of texts."""
	    if isinstance(raw, list):
	        # Already a list, process directly
	        return _collect_context_texts(raw)
	    if not isinstance(raw, str):
	        # e.g. NaN from an empty CSV cell
	        return []

	    stripped = raw.strip()
	    if not (stripped.startswith('[') and stripped.endswith(']')):
	        # Single context string (not a JSON array or list literal)
	        return [raw]

	    # Imported here because the module does not import ast at the top
	    import ast

	    try:
	        # Handles Python list literals like "['string1', 'string2']".
	        # Many JSON arrays are also valid literals and end up here too.
	        parsed = ast.literal_eval(raw)
	    except (SyntaxError, ValueError):
	        try:
	            parsed = json.loads(raw)
	        except json.JSONDecodeError:
	            # Neither a literal nor JSON: treat as a single context
	            return [raw]

	    # Same normalisation whichever parser succeeded
	    return _collect_context_texts(parsed)


	def _collect_highlighted(items):
	    """Convert highlighted-context dicts to {'is_primary', 'content'} dicts."""
	    return [
	        {
	            'is_primary': ctx.get('type', 'secondary') == 'primary',
	            # Passed through unchanged; per the original author this text
	            # already carries HTML span tags for the highlights
	            'content': ctx.get('abbreviatedContent', ''),
	        }
	        for ctx in items
	        if isinstance(ctx, dict)
	    ]


	def _parse_highlighted_contexts(raw):
	    """Turn the raw 'contexts_highlighted' cell into highlighted contexts."""
	    if isinstance(raw, list):
	        # Already a list, process directly
	        return _collect_highlighted(raw)
	    if not isinstance(raw, str):
	        # e.g. NaN from an empty CSV cell
	        return []

	    try:
	        parsed = json.loads(raw)
	    except json.JSONDecodeError:
	        print(f"Error parsing contexts_highlighted JSON: {raw[:100]}...")
	        return []
	    return _collect_highlighted(parsed)


	def get_random_example():
	    """
	    Select a random example from the loaded arena data.

	    Returns:
	        A dict with the keys "question", "Answerable", "insufficient",
	        "insufficient_reason", "sample_id", "processed_context_desc",
	        "contexts" and "full_contexts".

	        "full_contexts" is a list of context texts parsed from the row's
	        'contexts' value. That value may be a plain string, a Python list
	        literal, a JSON array, or a list; list entries may be strings or
	        dicts with a 'content' key.

	        "contexts" is a list of {'is_primary': bool, 'content': ...} dicts
	        parsed from 'contexts_highlighted'. If that yields nothing, every
	        full context is used instead, marked as non-primary.

	        If no data is loaded, the result of create_dummy_example() is
	        returned instead.
	    """
	    # Get the globally stored data - won't reload from disk once cached
	    df = load_arena_data()

	    if df.empty:
	        # If no data is loaded, return a dummy example
	        return create_dummy_example()

	    # Select a random row
	    example = df.sample(1).iloc[0]

	    # An empty CSV cell is read back as NaN; present it as an empty string,
	    # the same value used when the column is missing altogether
	    insufficient_reason = example.get('insufficient_reason', '')
	    if pd.isna(insufficient_reason):
	        insufficient_reason = ""

	    processed_example = {
	        "question": example['question'],
	        "Answerable": not example.get('insufficient', False),
	        "insufficient": example.get('insufficient', False),
	        "insufficient_reason": insufficient_reason,
	        "sample_id": example.get('sample_id', 0),
	    }

	    # Context description: replace a missing value with an empty string
	    context_desc = example.get('processed_context_desc', '')
	    if pd.isna(context_desc):
	        context_desc = ""
	    processed_example["processed_context_desc"] = context_desc

	    # Full contexts - from the 'contexts' column
	    full_contexts = []
	    try:
	        if 'contexts' in example and example['contexts']:
	            full_contexts = _parse_full_contexts(example['contexts'])
	    except Exception as e:
	        # Best effort: a malformed cell must not break example selection
	        print(f"Error processing full contexts: {e}")

	    # Highlighted contexts - from the 'contexts_highlighted' column
	    contexts_highlighted = []
	    try:
	        if 'contexts_highlighted' in example and example['contexts_highlighted']:
	            contexts_highlighted = _parse_highlighted_contexts(
	                example['contexts_highlighted']
	            )
	    except Exception as e:
	        # Best effort, as above
	        print(f"Error processing highlighted contexts: {e}")

	    # Fall back to the full contexts when nothing highlighted is available
	    if not contexts_highlighted and full_contexts:
	        contexts_highlighted = [
	            {'is_primary': False, 'content': content}
	            for content in full_contexts
	        ]

	    processed_example["contexts"] = contexts_highlighted
	    processed_example["full_contexts"] = full_contexts

	    return processed_example

	def get_random_example_and_models(model_names):
	    """
	    Pick a random arena example together with two models to compare.

	    Args:
	        model_names: Sequence of model names to draw from.

	    Returns:
	        A tuple (example, model_a_name, model_b_name). The example comes
	        from get_random_example(); the two names are drawn without
	        replacement, so they occupy different positions in model_names.
	    """
	    chosen_example = get_random_example()
	    # Sampling two items at once means position A and B never share a pick
	    picked = random.sample(model_names, 2)
	    return chosen_example, picked[0], picked[1]