# Repository path: SLM-RAG-Arena / utils / data_loader.py
# Latest commit: "update data pipeline" (347797e), oliver-aizip
import os
import json
import pandas as pd
import random
import re
# Module-level cache for the arena DataFrame. It starts out as None and is
# filled by the first successful load_arena_data() call.
_ARENA_DATA = None


def load_arena_data():
    """
    Return the arena dataset as a pandas DataFrame, reading the CSV at most once.

    The file is looked up at ``utils/arena_df.csv`` relative to the current
    working directory. A successful read is stored in ``_ARENA_DATA`` and
    handed back unchanged by every later call. If anything goes wrong while
    reading, the error is printed and an empty DataFrame is returned; nothing
    is cached in that case, so the next call tries the file again.
    """
    global _ARENA_DATA

    if _ARENA_DATA is None:
        try:
            df = pd.read_csv(os.path.join('utils', 'arena_df.csv'))
            print(f"Loaded arena data with {len(df)} examples")
        except Exception as e:
            print(f"Error loading arena data: {e}")
            # Callers detect the failure through the frame's `.empty` flag.
            return pd.DataFrame()
        # Keep the frame so later calls skip the disk read.
        _ARENA_DATA = df

    return _ARENA_DATA
def create_dummy_example():
    """Build the placeholder example handed out when no arena data is loaded."""
    placeholder = dict(
        question="Could not load questions from the dataset. Please check the data file.",
        processed_context_desc="Error: Data not available",
        contexts=[],
        full_contexts=[],
        Answerable=False,
        insufficient=True,
        insufficient_reason="Data loading error",
    )
    return placeholder
def _context_texts(items):
    """Collect the text of every context entry in *items*.

    Plain strings are kept as they are and dicts contribute their 'content'
    value. Entries of any other shape are skipped.
    """
    texts = []
    for item in items:
        if isinstance(item, str):
            texts.append(item)
        elif isinstance(item, dict) and 'content' in item:
            texts.append(item['content'])
    return texts


def _parse_full_contexts(raw_contexts):
    """Turn a raw 'contexts' value into a list of context strings.

    Accepts an already-parsed list, or a string holding either a Python list
    literal or a JSON array. A string that is not bracketed, or that neither
    parser accepts, is treated as one single context. Any other value (for
    example a float NaN) produces an empty list.
    """
    if isinstance(raw_contexts, list):
        return _context_texts(raw_contexts)
    if not isinstance(raw_contexts, str):
        return []

    trimmed = raw_contexts.strip()
    if not (trimmed.startswith('[') and trimmed.endswith(']')):
        # Not a serialized list: the whole string is the context.
        return [raw_contexts]

    # Imported here because ast is not among this module's top-level imports.
    import ast

    try:
        # Handles Python list reprs such as "['string1', 'string2']".
        parsed = ast.literal_eval(raw_contexts)
    except (SyntaxError, ValueError):
        # Not a Python literal (e.g. JSON true/false/null inside): try JSON.
        try:
            parsed = json.loads(raw_contexts)
        except json.JSONDecodeError:
            return [raw_contexts]

    # Both parsers go through the same normalization, so a list of
    # {'content': ...} dicts yields strings whichever parser accepted it.
    return _context_texts(parsed)


def _parse_highlighted_contexts(raw_highlights):
    """Turn a raw 'contexts_highlighted' value into display entries.

    Accepts a JSON string or an already-parsed list. Every dict entry becomes
    ``{'is_primary': bool, 'content': str}``, where 'content' is the entry's
    'abbreviatedContent' passed through untouched. Non-dict entries are
    skipped. A string that is not valid JSON is reported and gives an empty
    list.
    """
    if isinstance(raw_highlights, str):
        try:
            entries = json.loads(raw_highlights)
        except json.JSONDecodeError:
            print(f"Error parsing contexts_highlighted JSON: {raw_highlights[:100]}...")
            return []
    elif isinstance(raw_highlights, list):
        entries = raw_highlights
    else:
        return []

    return [
        {
            'is_primary': entry.get('type', 'secondary') == 'primary',
            'content': entry.get('abbreviatedContent', ''),
        }
        for entry in entries
        if isinstance(entry, dict)
    ]


def get_random_example():
    """
    Select a random example from the loaded arena data.

    Returns:
        dict: the example in the shape the application consumes, with keys
        "question", "Answerable", "insufficient", "insufficient_reason",
        "sample_id", "processed_context_desc", "contexts" (a list of
        ``{'is_primary', 'content'}`` dicts) and "full_contexts" (a list of
        context strings). When no data is available, the dict produced by
        create_dummy_example() is returned instead.

    Problems while parsing the context columns are printed and leave the
    corresponding list empty; they are never raised to the caller.
    """
    # load_arena_data() serves the cached frame after the first successful read.
    df = load_arena_data()
    if df.empty:
        return create_dummy_example()

    example = df.sample(1).iloc[0]

    # A missing flag is read as "not insufficient", matching the default used
    # when the column does not exist. Without this, NaN (which is truthy)
    # would mark the example as unanswerable.
    insufficient = example.get('insufficient', False)
    if pd.isna(insufficient):
        insufficient = False
    insufficient = bool(insufficient)

    # Keep the free-text fields as strings even when the value is NaN.
    insufficient_reason = example.get('insufficient_reason', '')
    if pd.isna(insufficient_reason):
        insufficient_reason = ""

    context_desc = example.get('processed_context_desc', '')
    if pd.isna(context_desc):
        context_desc = ""

    processed_example = {
        "question": example['question'],
        "Answerable": not insufficient,
        "insufficient": insufficient,
        "insufficient_reason": insufficient_reason,
        "sample_id": example.get('sample_id', 0),
        "processed_context_desc": context_desc,
    }

    # Full contexts come from the 'contexts' column.
    full_contexts = []
    try:
        if 'contexts' in example and example['contexts']:
            full_contexts = _parse_full_contexts(example['contexts'])
    except Exception as e:
        # Best effort: a malformed value must not break example selection.
        print(f"Error processing full contexts: {e}")

    # Highlighted contexts come from the 'contexts_highlighted' column.
    contexts_highlighted = []
    try:
        if 'contexts_highlighted' in example and example['contexts_highlighted']:
            contexts_highlighted = _parse_highlighted_contexts(
                example['contexts_highlighted']
            )
    except Exception as e:
        print(f"Error processing highlighted contexts: {e}")

    # With no usable highlights, fall back to the full contexts so that
    # "contexts" is still populated.
    if not contexts_highlighted and full_contexts:
        contexts_highlighted = [
            {'is_primary': False, 'content': text} for text in full_contexts
        ]

    processed_example["contexts"] = contexts_highlighted
    processed_example["full_contexts"] = full_contexts
    return processed_example
def get_random_example_and_models(model_names):
    """
    Draw one arena example together with the two models that will answer it.

    The example comes from get_random_example(). Two entries are then sampled
    without replacement from ``model_names`` and returned as the models for
    positions A and B, in that order.
    """
    picked_example = get_random_example()
    # Sampling without replacement means A and B are different entries
    # of model_names.
    contenders = random.sample(model_names, 2)
    return picked_example, contenders[0], contenders[1]