import ast
import json
import os
import random
import re

import pandas as pd

# Global data store - filled lazily by the first successful load_arena_data() call
_ARENA_DATA = None


def load_arena_data():
    """
    Loads the arena data from the arena_df.csv file in the utils directory.

    The DataFrame is cached in the module-level ``_ARENA_DATA`` after the first
    successful read, so later calls do not touch the disk. The path is relative
    ('utils/arena_df.csv'), so it is resolved against the current working
    directory.

    Returns:
        The cached/loaded DataFrame, or an empty DataFrame if reading failed.
        A failed read is not cached, so the next call tries the file again.
    """
    global _ARENA_DATA

    # If data is already loaded, return it
    if _ARENA_DATA is not None:
        return _ARENA_DATA

    try:
        csv_path = os.path.join('utils', 'arena_df.csv')
        df = pd.read_csv(csv_path)
        print(f"Loaded arena data with {len(df)} examples")

        # Store the data globally
        _ARENA_DATA = df
        return df
    except Exception as e:
        # Deliberate best-effort: callers fall back to a dummy example
        print(f"Error loading arena data: {e}")
        return pd.DataFrame()


def create_dummy_example():
    """Creates a placeholder example used when no data is loaded."""
    return {
        "question": "Could not load questions from the dataset. Please check the data file.",
        "processed_context_desc": "Error: Data not available",
        "contexts": [],
        "full_contexts": [],
        "Answerable": False,
        "insufficient": True,
        "insufficient_reason": "Data loading error"
    }


def _is_missing(value):
    """Returns True when value is a scalar null marker (None, NaN, pd.NA, ...)."""
    return pd.api.types.is_scalar(value) and bool(pd.isna(value))


def _normalize_contexts(items):
    """Reduces parsed context entries to their text.

    Strings are kept as they are, dicts that have a 'content' key contribute
    that value, and every other entry is skipped.
    """
    texts = []
    for item in items:
        if isinstance(item, str):
            texts.append(item)
        elif isinstance(item, dict) and 'content' in item:
            texts.append(item['content'])
    return texts


def _parse_full_contexts(raw):
    """Turns the raw 'contexts' cell into a list of context texts.

    Accepts an actual list, a string holding a Python list literal or a JSON
    array, or a plain string (treated as one single context). Any other type
    yields an empty list.
    """
    if isinstance(raw, list):
        return _normalize_contexts(raw)
    if not isinstance(raw, str):
        return []

    stripped = raw.strip()
    if not (stripped.startswith('[') and stripped.endswith(']')):
        # Single context string (not a JSON array or list literal)
        return [raw]

    try:
        # Handles Python list literals like "['string1', 'string2']".
        # Most JSON arrays are valid Python literals too, so they usually
        # succeed here and never reach the JSON fallback below.
        parsed = ast.literal_eval(stripped)
    except (SyntaxError, ValueError):
        try:
            # JSON-only tokens (true/false/null) end up here
            parsed = json.loads(stripped)
        except json.JSONDecodeError:
            # Neither a literal nor JSON: treat as a single context
            return [raw]

    # Same normalisation whichever parser succeeded, so dict entries never
    # leak into the result as whole dicts.
    return _normalize_contexts(parsed)


def _parse_highlighted_contexts(raw):
    """Turns the raw 'contexts_highlighted' cell into display dicts.

    Accepts a JSON string or an already-parsed list. Each dict entry becomes
    {'is_primary': bool, 'content': str}; non-dict entries are skipped.
    """
    if isinstance(raw, str):
        try:
            entries = json.loads(raw)
        except json.JSONDecodeError:
            print(f"Error parsing contexts_highlighted JSON: {raw[:100]}...")
            return []
    elif isinstance(raw, list):
        entries = raw
    else:
        return []

    # 'abbreviatedContent' is passed through untouched (the original author
    # notes it already carries HTML span tags for highlights).
    return [
        {
            'is_primary': entry.get('type', 'secondary') == 'primary',
            'content': entry.get('abbreviatedContent', ''),
        }
        for entry in entries
        if isinstance(entry, dict)
    ]


def get_random_example():
    """
    Selects a random example from the loaded arena data.

    Returns:
        A dict with the keys 'question', 'Answerable', 'insufficient',
        'insufficient_reason', 'sample_id', 'processed_context_desc',
        'contexts' (list of {'is_primary', 'content'} dicts) and
        'full_contexts' (list of context texts). When no data is available
        the dict from create_dummy_example() is returned instead.
    """
    # Get the globally stored data - won't reload from disk once cached
    df = load_arena_data()

    if df.empty:
        return create_dummy_example()

    # Select a random row
    example = df.sample(1).iloc[0]

    # A missing (NaN) flag means "not marked insufficient"; without this,
    # `not nan` would evaluate to False and flag the question as unanswerable.
    insufficient = example.get('insufficient', False)
    if _is_missing(insufficient):
        insufficient = False

    insufficient_reason = example.get('insufficient_reason', '')
    if _is_missing(insufficient_reason):
        insufficient_reason = ''

    processed_example = {
        "question": example['question'],
        "Answerable": not insufficient,
        "insufficient": insufficient,
        "insufficient_reason": insufficient_reason,
        "sample_id": example.get('sample_id', 0)
    }

    # Context description - empty CSV cells arrive as NaN, map them to ""
    context_desc = example.get('processed_context_desc', '')
    if pd.isna(context_desc):
        context_desc = ""
    processed_example["processed_context_desc"] = context_desc

    # Full contexts - from the 'contexts' column
    full_contexts = []
    try:
        if 'contexts' in example and example['contexts']:
            full_contexts = _parse_full_contexts(example['contexts'])
    except Exception as e:
        # Best-effort: a malformed cell must not break example selection
        print(f"Error processing full contexts: {e}")

    # Highlighted contexts - from the 'contexts_highlighted' column
    contexts_highlighted = []
    try:
        if 'contexts_highlighted' in example and example['contexts_highlighted']:
            contexts_highlighted = _parse_highlighted_contexts(
                example['contexts_highlighted']
            )
    except Exception as e:
        print(f"Error processing highlighted contexts: {e}")

    # Fall back to the plain contexts when no highlighted version is available
    if not contexts_highlighted and full_contexts:
        contexts_highlighted = [
            {'is_primary': False, 'content': content}
            for content in full_contexts
        ]

    processed_example["contexts"] = contexts_highlighted
    processed_example["full_contexts"] = full_contexts

    return processed_example


def get_random_example_and_models(model_names):
    """
    Selects a random example from the arena data and assigns two distinct
    random models to positions A and B.

    Args:
        model_names: Sequence of model names to draw from.

    Returns:
        A tuple (example, model_a_name, model_b_name).

    Raises:
        ValueError: If model_names has fewer than two entries
            (raised by random.sample).
    """
    example = get_random_example()

    # random.sample draws without replacement, so A and B come from
    # different positions of model_names
    model_a_name, model_b_name = random.sample(model_names, 2)

    return example, model_a_name, model_b_name