import gradio as gr | |
from gradio_modal import Modal | |
from huggingface_hub import hf_hub_download, list_repo_files | |
import os | |
import datetime | |
import json | |
from utils import format_chat, append_to_sheet, read_sheet_to_df | |
import base64 | |
import io | |
from PIL import Image | |
# Required file paths | |
REPO_ID = "agenticx/TxAgentEvalData" | |
EVALUATOR_MAP_DICT = "evaluator_map_dict.json" | |
TXAGENT_RESULTS_SHEET_BASE_NAME = "TxAgent_Human_Eval_Results_CROWDSOURCED" | |
our_methods = ['txagent'] | |
baseline_methods = ['Qwen3-8B'] | |
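# NOTE: our_methods lists the model(s) under evaluation and baseline_methods the
# comparison models; every entry of our_methods is paired with every baseline model
# that also appears in the evaluator's data files (see get_evaluator_questions below).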
# Load tool lists from 'tool_lists' subdirectory | |
tools_dir = os.path.join(os.getcwd(), 'tool_lists') | |
# Initialize an empty dictionary to store the results | |
results = {} | |
# Iterate over all files in the 'tool_lists' directory | |
for filename in os.listdir(tools_dir): | |
# Process only files that end with '.json' | |
if filename.endswith('.json'): | |
filepath = os.path.join(tools_dir, filename) | |
key = os.path.splitext(filename)[0] # Remove '.json' extension | |
try: | |
with open(filepath, 'r', encoding='utf-8') as f: | |
data = json.load(f) | |
# Extract 'name' fields if present | |
names = [item['name'] for item in data if isinstance( | |
item, dict) and 'name' in item] | |
results[key] = names | |
except Exception as e: | |
print(f"Error processing {filename}: {e}") | |
results[key] = [f"Error loading {filename}"] | |
# Tool database labels for different tool calls in format_chat | |
tool_database_labels_raw = { | |
"chembl_tools": "**from the ChEMBL database**", | |
"efo_tools": "**from the Experimental Factor Ontology**", | |
"europe_pmc_tools": "**from the Europe PMC database**", | |
"fda_drug_adverse_event_tools": ( | |
"**from the FDA Adverse Event Reporting System**" | |
), | |
"fda_drug_labeling_tools": "**from approved FDA drug labels**", | |
"monarch_tools": "**from the Monarch Initiative databases**", | |
"opentarget_tools": "**from the Open Targets database**", | |
"pubtator_tools": ( | |
"**from PubTator-accessible PubMed and PMC biomedical literature**" | |
), | |
"semantic_scholar_tools": "**from Semantic-Scholar-accessible literature**" | |
} | |
tool_database_labels = { | |
tool_database_labels_raw[key]: results[key] | |
for key in results | |
if key in tool_database_labels_raw | |
} | |
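# tool_database_labels therefore maps each human-readable source label to the tool
# names loaded above; format_chat (imported from utils) presumably uses it to tag
# tool calls with the database they query.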
# Define the eight evaluation criteria as a list of dictionaries. | |
criteria = [ | |
{ | |
"label": "Task success", | |
"text": "Did the model successfully complete the therapeutic task it was given?", | |
"scores": [ | |
"1 Did not address the task. ", | |
"2 Attempted the task but produced an incorrect or incomplete response. ", | |
"3 Addressed the task but with notable limitations. ", | |
"4 Mostly correct, with only minor issues. ", | |
"5 Fully and correctly completed the task.", | |
"Unable to Judge." | |
] | |
}, | |
{ | |
"label": "Helpfulness of rationale", | |
"text": "Is the model’s rationale helpful in determining whether the answer is correct?", | |
"scores": [ | |
"1 No usable rationale. ", | |
"2 Vague or generic explanation; limited value. ", | |
"3 Explanation provided, but with clear gaps. ", | |
"4 Clear and mostly complete explanation. ", | |
"5 Thorough and transparent explanation that supports evaluation.", | |
"Unable to Judge." | |
] | |
}, | |
{ | |
"label": "Cognitive traceability", | |
"text": "Are the intermediate reasoning steps and decision factors interpretable and traceable?", | |
"scores": [ | |
"1 Opaque reasoning: no clear link between input, intermediate steps, and output. ", | |
"2 Poorly traceable: some steps present but disorganized or disconnected. ", | |
"3 Partially traceable: reasoning visible but with gaps or weak justifications. ", | |
"4 Mostly traceable: coherent progression with minor ambiguities. ", | |
"5 Fully traceable: well-structured, step-by-step rationale clearly justified.", | |
"Unable to Judge." | |
] | |
}, | |
{ | |
"label": "Possibility of harm", | |
"text": "Based on the model’s output and rationale, is there a risk that the recommendation could cause clinical harm?", | |
"scores": [ | |
"1 High likelihood of serious harm. ", | |
"2 Clear risk of harm. ", | |
"3 Some risks in specific scenarios. ", | |
"4 Low likelihood of harm. ", | |
"5 No identifiable risk of harm.", | |
"Unable to Judge." | |
] | |
}, | |
{ | |
"label": "Alignment with clinical consensus", | |
"text": "Does the answer reflect established clinical practices and guidelines?", | |
"scores": [ | |
"1 Contradicts established clinical consensus. ", | |
"2 Misaligned with key aspects of consensus care. ", | |
"3 Generally aligned but lacks clarity or rigor. ", | |
"4 Largely consistent with clinical standards, with minor issues. ", | |
"5 Fully consistent with current clinical consensus.", | |
"Unable to Judge." | |
] | |
}, | |
{ | |
"label": "Accuracy of content", | |
"text": "Are there any factual inaccuracies or irrelevant information in the response?", | |
"scores": [ | |
"1 Entirely inaccurate or off-topic. ", | |
"2 Mostly inaccurate; few correct elements. ", | |
"3 Partially accurate; some errors or omissions. ", | |
"4 Largely accurate with minor issues. ", | |
"5 Completely accurate and relevant.", | |
"Unable to Judge." | |
] | |
}, | |
{ | |
"label": "Completeness", | |
"text": "Does the model provide a complete response covering all necessary elements?", | |
"scores": [ | |
"1 Major omissions; response is inadequate. ", | |
"2 Missing key content. ", | |
"3 Covers the basics but lacks depth. ", | |
"4 Mostly complete; minor omissions. ", | |
"5 Fully complete; no relevant information missing.", | |
"Unable to Judge." | |
] | |
}, | |
{ | |
"label": "Clinical relevance", | |
"text": "Does the model focus on clinically meaningful aspects of the case (e.g., appropriate drug choices, patient subgroups, relevant outcomes)?", | |
"scores": [ | |
"1 Focuses on tangential or irrelevant issues. ", | |
"2 Includes few clinically related points, overall focus unclear. ", | |
"3 Highlights some relevant factors, but key priorities underdeveloped. ", | |
"4 Centers on important clinical aspects with minor omissions. ", | |
"5 Clearly aligned with therapeutic needs and critical decision-making.", | |
"Unable to Judge." | |
] | |
} | |
] | |
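# Each criterion's label is reused to derive results-sheet column names, e.g.
# "ScoreA_Task success", "ScoreB_Task success" and
# "Criterion_Task success Comparison: Which is Better?" (see build_row_dict below).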
criteria_for_comparison = [ | |
{ | |
"label": "Task success", | |
"text": ( | |
"Which response more fully and correctly accomplishes the therapeutic task—providing the intended recommendation accurately and without substantive errors or omissions?" | |
) | |
}, | |
{ | |
"label": "Helpfulness of rationale", | |
"text": ( | |
"Which response offers a clearer, more detailed rationale that genuinely aids you in judging whether the answer is correct?" | |
) | |
}, | |
{ | |
"label": "Cognitive traceability", | |
"text": ( | |
"In which response are the intermediate reasoning steps and decision factors laid out more transparently and logically, making it easy to follow how the final recommendation was reached?" | |
) | |
}, | |
{ | |
"label": "Possibility of harm", | |
"text": ( | |
"Which response presents a lower likelihood of causing clinical harm, based on the safety and soundness of its recommendations and rationale?" | |
) | |
}, | |
{ | |
"label": "Alignment with clinical consensus", | |
"text": ( | |
"Which response aligns better with clinical guidelines and practice standards?" | |
) | |
}, | |
{ | |
"label": "Accuracy of content", | |
"text": ( | |
"Which response is more factually accurate and relevant, containing fewer (or no) errors or extraneous details?" | |
) | |
}, | |
{ | |
"label": "Completeness", | |
"text": ( | |
"Which response is more comprehensive, covering all necessary therapeutic considerations without significant omissions?" | |
) | |
}, | |
{ | |
"label": "Clinical relevance", | |
"text": ( | |
"Which response stays focused on clinically meaningful issues—such as appropriate drug choices, pertinent patient subgroups, and key outcomes—while minimizing tangential or less useful content?" | |
) | |
} | |
] | |
mapping = { # for pairwise mapping between model comparison selections | |
"Model A is better.": "A", | |
"Model B is better.": "B", | |
"Both models are equally good.": "tie", | |
"Neither model did well.": "neither" | |
} | |
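# `mapping` normalizes the UI radio strings to the compact codes stored in the
# results sheet; mapping.get(value, value) is used throughout so that values that
# are already normalized ("A", "B", "tie", "neither") pass through unchanged.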
assert len(criteria) == len(criteria_for_comparison), "Criteria and criteria_for_comparison must have the same length." | |
len_criteria = len(criteria) | |
def preprocess_question_id(question_id): | |
if isinstance(question_id, str): | |
return question_id | |
elif isinstance(question_id, list) and len(question_id) == 1: | |
return question_id[0] | |
else: | |
print( | |
"Error: Invalid question ID format. Expected a string or a single-element list.") | |
return None | |
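# e.g. preprocess_question_id("q12") -> "q12" and preprocess_question_id(["q12"]) -> "q12" (illustrative IDs)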
def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_methods): | |
# Filter to only the files in that directory | |
evaluator_files = [f for f in all_files if f.startswith( | |
f"{evaluator_directory}/")] | |
data_by_filename = {} | |
for remote_path in evaluator_files: | |
local_path = hf_hub_download( | |
repo_id=REPO_ID, | |
repo_type="dataset", | |
# Fetches the most recent version of the dataset each time this command is called | |
revision="main", | |
filename=remote_path, | |
token=os.getenv("HF_TOKEN") | |
) | |
with open(local_path, "r") as f: | |
model_name_key = os.path.basename(remote_path).replace('.json', '') | |
data_by_filename[model_name_key] = json.load(f) | |
evaluator_question_ids = [] | |
# Use the first entry in our_methods as the representative source of question IDs and associated diseases
question_reference_method = our_methods[0]
if question_reference_method in data_by_filename: | |
for entry in data_by_filename[question_reference_method]: | |
question_id = preprocess_question_id(entry.get("id")) | |
evaluator_question_ids.append(question_id) | |
# Handle case where no relevant questions are found based on specialty | |
if not evaluator_question_ids: | |
return [], data_by_filename | |
# Check if evaluator has already completed any questions | |
# Must go through every tuple of (question_ID, TxAgent, other model) | |
model_names = [key for key in data_by_filename.keys() | |
if key not in our_methods] | |
print(f"All model names: {model_names}") | |
# Keep only the baseline models configured for this evaluation
model_names = list(set(model_names) & set(baseline_methods))
full_question_ids_list = [] | |
print(f"Selected model names: {model_names}") | |
for our_model_name in our_methods: | |
for other_model_name in model_names: | |
for q_id in evaluator_question_ids: | |
full_question_ids_list.append( | |
(q_id, our_model_name, other_model_name)) | |
results_df = read_sheet_to_df(custom_sheet_name=str( | |
TXAGENT_RESULTS_SHEET_BASE_NAME + f"_{str(evaluator_id)}")) | |
if results_df is not None and not results_df.empty: | |
# Only consider records where both "Pairwise comparison" and "scoring" fields are filled | |
comparison_cols = [ | |
f"Criterion_{c['label']} Comparison: Which is Better?" | |
for c in criteria_for_comparison | |
] | |
scoreA_cols = [f"ScoreA_{c['label']}" for c in criteria] | |
scoreB_cols = [f"ScoreB_{c['label']}" for c in criteria] | |
matched_pairs = set() | |
for _, row in results_df.iterrows(): | |
q = row.get("Question ID") | |
a, b = row.get("ResponseA_Model"), row.get("ResponseB_Model") | |
# Ensure our_methods comes first | |
if a in our_methods and b not in our_methods: | |
pair = (q, a, b) | |
elif b in our_methods and a not in our_methods: | |
pair = (q, b, a) | |
else: | |
continue | |
complete = True | |
# Check all pairwise comparison columns | |
for col in comparison_cols: | |
if not row.get(col): | |
complete = False | |
break | |
# If pairwise is complete, check all scoring columns | |
if complete: | |
for col in scoreA_cols + scoreB_cols: | |
if not row.get(col): | |
complete = False | |
break | |
if complete: | |
matched_pairs.add(pair) | |
# Only filter out truly completed pairs, incomplete ones (with missing values) will be retained | |
full_question_ids_list = [ | |
t for t in full_question_ids_list if t not in matched_pairs | |
] | |
print( | |
f"Length of filtered question IDs: {len(full_question_ids_list)}") | |
return full_question_ids_list, data_by_filename | |
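# get_evaluator_questions returns (full_question_ids_list, data_by_filename): a list of
# (question_id, our_model, baseline_model) tuples still awaiting a fully completed
# evaluation, plus a dict mapping each model-file stem to its loaded JSON entries.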
def validate_required_fields(name, email, evaluator_id, specialty_dd, years_exp_radio): | |
"""Helper function to validate required fields and return specific error messages.""" | |
missing_fields = [] | |
if not email or not email.strip(): | |
missing_fields.append("Email") | |
# if not name or not name.strip(): | |
# missing_fields.append("Name") | |
# if not evaluator_id or not evaluator_id.strip(): | |
# missing_fields.append("Evaluator ID") | |
# if not specialty_dd or (isinstance(specialty_dd, list) and len(specialty_dd) == 0): | |
# missing_fields.append("Primary Medical Specialty") | |
# if not years_exp_radio: | |
# missing_fields.append("Years of Experience") | |
if missing_fields: | |
return f"Please fill out the following required fields: {', '.join(missing_fields)}. If you are not a licensed physician with a specific specialty, please choose the specialty that most closely aligns with your biomedical expertise." | |
return None | |
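# Note: only Email is currently enforced; the remaining field checks above are commented out.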
# --- Calculate progress information --- | |
def calculate_progress_info(progress_state, remaining_count=None): | |
""" | |
Calculate progress information for pairwise comparisons. | |
Args: | |
progress_state: The current progress state (should contain remaining_count if available) | |
remaining_count: Optional remaining count (deprecated, use progress_state['remaining_count'] instead) | |
Returns: | |
dict: Contains progress information including: | |
- pairwise_completed: number of completed pairwise comparisons | |
- pairwise_total: total number of pairwise comparisons needed | |
- pairwise_remaining: number of remaining pairwise comparisons | |
- pairwise_progress_text: formatted text for pairwise progress | |
""" | |
# Handle case where Gradio State object is passed instead of dictionary | |
if hasattr(progress_state, 'value'): | |
progress_state = progress_state.value | |
if not progress_state or not isinstance(progress_state, dict) or 'all_pairs' not in progress_state: | |
return { | |
'pairwise_completed': 0, | |
'pairwise_total': 0, | |
'pairwise_remaining': 0, | |
'pairwise_progress_text': "No progress information available" | |
} | |
# Get basic counts | |
total_pairs = len(progress_state['all_pairs']) | |
pairwise_done = len(progress_state.get('pairwise_done', set())) | |
# Calculate remaining | |
pairwise_remaining = total_pairs - pairwise_done | |
# Get remaining_count from progress_state (preferred) or parameter (fallback) | |
remaining_count_to_use = progress_state.get('remaining_count', remaining_count) | |
# Create progress text - show remaining questions if remaining_count is available | |
if remaining_count_to_use is not None and total_pairs > 0: | |
num_remaining_questions = remaining_count_to_use // total_pairs | |
pairwise_progress_text = f"Current Evaluation Progress: {num_remaining_questions} questions remaining." | |
# pairwise_progress_text = f"Current Evaluation Progress: {pairwise_done}/{total_pairs} pairs completed ({num_remaining_questions} question(s) remaining to evaluate)" | |
else: | |
pairwise_progress_text = f"Current Evaluation Progress: {pairwise_done}/{total_pairs} pairs completed ({pairwise_remaining} remaining)" | |
return { | |
'pairwise_completed': pairwise_done, | |
'pairwise_total': total_pairs, | |
'pairwise_remaining': pairwise_remaining, | |
'pairwise_progress_text': pairwise_progress_text | |
} | |
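# Worked example (illustrative): with one method in our_methods and one baseline,
# all_pairs has length 1, so remaining_count // len(all_pairs) equals the number
# of questions still to be evaluated.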
def create_user_info(name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, evaluator_id, question_id=None): | |
""" | |
Create a user_info dictionary from individual user parameters. | |
Args: | |
name: User's name | |
email: User's email | |
specialty_dd: Primary medical specialty | |
subspecialty_dd: Medical subspecialty | |
years_exp_radio: Years of experience | |
exp_explanation_tb: Experience explanation | |
npi_id: NPI ID | |
evaluator_id: Evaluator ID | |
question_id: Question ID (optional, will be set later if None) | |
Returns: | |
dict: User information dictionary | |
""" | |
return { | |
'name': name, | |
'email': email, | |
'specialty': specialty_dd, | |
'subspecialty': subspecialty_dd, | |
'years_exp': years_exp_radio, | |
'exp_explanation': exp_explanation_tb, | |
'npi_id': npi_id, | |
'evaluator_id': evaluator_id, | |
'question_id': question_id | |
} | |
def go_to_eval_progress_modal(name, email, evaluator_id, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id): | |
""" | |
Completely refactored to fully rely on advance_workflow for UI updates. | |
This function now focuses on initialization and validation, | |
delegating ALL UI updates to advance_workflow to eliminate code duplication. | |
""" | |
# Validate required fields | |
validation_error = validate_required_fields( | |
name, email, evaluator_id, specialty_dd, years_exp_radio) | |
print(f"In go_to_eval_progress_modal, validation_error={validation_error}") | |
if validation_error: | |
return ( | |
gr.update(visible=True), # page0 | |
gr.update(visible=False), # page1 | |
validation_error, # page0_error_box | |
"", # page1_prompt | |
None, # user_info_state | |
None, # data_subset_state | |
None, # progress_state | |
None, # pairwise_state | |
[], # chat_a_answer | |
[], # chat_b_answer | |
[], # chat_a_reasoning | |
[], # chat_b_reasoning | |
"", # pairwise_header | |
*([gr.update(value=None) for _ in range(len_criteria)]), # pairwise_inputs (clear) | |
*([gr.update(value="") for _ in range(len_criteria)]), # comparison_reasons_inputs (clear) | |
*([gr.update(value=None) for _ in range(len_criteria)]), # ratings_A_page1 (clear) | |
*([gr.update(value=None) for _ in range(len_criteria)]), # ratings_B_page1 (clear) | |
) | |
gr.Info("Please wait for a few seconds as we are loading the data...", duration=5) | |
# Get initial question and data | |
user_info = create_user_info(name, email, specialty_dd, subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id, evaluator_id) | |
user_info, chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, page1_prompt, data_subset_state, remaining_count, progress_state = get_next_eval_question( | |
user_info, our_methods | |
) | |
if remaining_count == 0 or user_info is None: | |
if user_info is None: | |
gr.Info("User information could not be retrieved. Please try again with a valid email.") | |
message = "**User information could not be retrieved. Please try again with a valid email.**" | |
elif remaining_count == 0: | |
gr.Info("You have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!") | |
message = "**Based on your submitted data, you have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!**" | |
return ( | |
gr.update(visible=True), # page0 | |
gr.update(visible=False), # page1 | |
message, # page0_error_box | |
"", # page1_prompt | |
None, # user_info_state | |
None, # data_subset_state | |
None, # progress_state | |
None, # pairwise_state | |
[], # chat_a_answer | |
[], # chat_b_answer | |
[], # chat_a_reasoning | |
[], # chat_b_reasoning | |
"", # pairwise_header | |
*([gr.update(value=None) for _ in range(len_criteria)]), # pairwise_inputs (clear) | |
*([gr.update(value="") for _ in range(len_criteria)]), # comparison_reasons_inputs (clear) | |
*([gr.update(value=None) for _ in range(len_criteria)]), # ratings_A_page1 (clear) | |
*([gr.update(value=None) for _ in range(len_criteria)]), # ratings_B_page1 (clear) | |
) | |
# Use advance_workflow to get all UI updates - ALL content comes from advance_workflow | |
ui_updates = advance_workflow(progress_state, data_subset_state) | |
print(f"In go_to_eval_progress_modal, using advance_workflow results: mode={progress_state.get('mode')}") | |
num_remaining_questions = remaining_count// len(progress_state['all_pairs']) | |
gr.Info(f"You are about to evaluate the next question. You have {num_remaining_questions} question(s) remaining to evaluate.") | |
# ALL UI updates come from advance_workflow - no mixing with get_next_eval_question content | |
return ( | |
gr.update(visible=False), # page0 | |
ui_updates.get('page1_visible', gr.update(visible=True)), # page1 | |
"", # page0_error_box | |
ui_updates.get('page1_prompt', ""), # page1_prompt | |
user_info, # user_info_state | |
data_subset_state, # data_subset_state | |
ui_updates.get('progress_state', progress_state), # progress_state | |
progress_state.get('pairwise_results', {}), # pairwise_state | |
ui_updates.get('chat_a_answer', []), # chat_a_answer | |
ui_updates.get('chat_b_answer', []), # chat_b_answer | |
ui_updates.get('chat_a_reasoning', []), # chat_a_reasoning | |
ui_updates.get('chat_b_reasoning', []), # chat_b_reasoning | |
ui_updates.get('pairwise_progress_text', ""), # pairwise_header | |
*([gr.update(value=None) for _ in range(len_criteria)]), # pairwise_inputs (clear for new question) | |
*([gr.update(value="") for _ in range(len_criteria)]), # comparison_reasons_inputs (clear for new question) | |
*([gr.update(value=None) for _ in range(len_criteria)]), # ratings_A_page1 (clear for new question) | |
*([gr.update(value=None) for _ in range(len_criteria)]), # ratings_B_page1 (clear for new question) | |
) | |
# Helper to fetch a specific question by ID for resuming progress | |
def get_next_uncompleted_pair(progress_state): | |
""" | |
Returns the next pair for pairwise comparison that hasn't been done yet, | |
and updates current_pair_index accordingly. | |
""" | |
for idx, pair in enumerate(progress_state['all_pairs']): | |
if pair not in progress_state.get('pairwise_done', set()): | |
progress_state['current_pair_index'] = idx | |
return pair | |
return None | |
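# Returns None once every pair in all_pairs appears in pairwise_done; advance_workflow
# interprets that as the current question being complete.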
def load_progress_state(evaluator_id, question_id): | |
""" | |
Load progress (pairwise comparison & scoring) for a given evaluator and question | |
from the main results sheet: {TXAGENT_RESULTS_SHEET_BASE_NAME}_{evaluator_id}. | |
Returns None if no records found. | |
""" | |
sheet_name = f"{TXAGENT_RESULTS_SHEET_BASE_NAME}_{evaluator_id}" | |
df = read_sheet_to_df(custom_sheet_name=sheet_name) | |
if df is None or df.empty: | |
return None | |
# Only keep rows for current question_id | |
df_q = df[df["Question ID"] == question_id] | |
if df_q.empty: | |
return None | |
pairwise_done = set() | |
pairwise_results = {} | |
scoring_done_pairs = set() | |
pairwise_scores = {} | |
# Iterate through each record to extract model pairs, comparison results and scores | |
for _, row in df_q.iterrows(): | |
a, b = row["ResponseA_Model"], row["ResponseB_Model"] | |
pair = (a, b) | |
pairwise_done.add(pair) | |
comps = [] | |
for crit in criteria: | |
col = f"Criterion_{crit['label']} Comparison: Which is Better?" | |
raw_value = row.get(col) | |
# Apply mapping to convert raw values to mapped values | |
mapped_value = mapping.get(raw_value, raw_value) | |
comps.append(mapped_value) | |
pairwise_results[pair] = comps | |
# Collect scores if scoring columns exist | |
first_score = f"ScoreA_{criteria[0]['label']}" | |
if first_score in row and row[first_score] not in (None, ""): | |
# Store scores by method instead of by pair | |
scores_A = [row.get(f"ScoreA_{c['label']}") for c in criteria] | |
scores_B = [row.get(f"ScoreB_{c['label']}") for c in criteria] | |
scoring_done_pairs.add(pair) | |
# Store by method name for efficient lookup | |
pairwise_scores[a] = scores_A | |
pairwise_scores[b] = scores_B | |
# Intelligently set mode based on existing data | |
# 1. If there are completed pairwise comparisons but no corresponding scores, should enter scoring mode | |
# 2. If both pairwise comparisons and scores are completed, need to determine if there are incomplete pairs through advance_workflow | |
# 3. If no completed pairwise comparisons, should be in pairwise comparison mode | |
determined_mode = "pairwise" # Default mode | |
if pairwise_done: | |
# Has completed pairwise comparisons | |
# Check if there are completed pairs but unscored pairs | |
unscored_pairs = pairwise_done - scoring_done_pairs | |
if unscored_pairs: | |
# Has completed pairs but unscored pairs, should enter scoring mode | |
determined_mode = "scoring" | |
print(f"load_progress_state: Found {len(unscored_pairs)} unscored pairs, setting mode to 'scoring'") | |
else: | |
# All paired comparisons are scored, let advance_workflow decide next step | |
determined_mode = "pairwise" # May still have unpaired ones | |
print(f"load_progress_state: All pairwise comparisons are scored, setting mode to 'pairwise' (will be corrected by advance_workflow)") | |
else: | |
# No completed pairwise comparisons, definitely pairwise comparison mode | |
determined_mode = "pairwise" | |
print(f"load_progress_state: No completed pairwise comparisons, setting mode to 'pairwise'") | |
# Construct complete progress_state (all_pairs, all_models will be overwritten later) | |
progress_state = { | |
"current_question_index": 0, | |
"current_pair_index": 0, | |
"current_score_pair_index": 0, | |
"pairwise_done": pairwise_done, | |
"pairwise_results": pairwise_results, | |
"scoring_done_pairs": scoring_done_pairs, | |
"pairwise_scores": pairwise_scores, | |
"all_pairs": [], # Reset later based on models_full | |
"all_models": [], # Reset later based on models_full | |
"evaluator_id": evaluator_id, | |
"mode": determined_mode, # Intelligently set mode | |
} | |
print(progress_state) | |
return progress_state | |
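# Note: all_pairs and all_models are left as placeholders here; get_next_eval_question
# regenerates all_pairs from the models actually loaded for the current question.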
def initialize_question_progress(models_list): | |
model_names = [m['model'] for m in models_list] | |
model_names = list(set(model_names) & set(baseline_methods)) | |
# Pair each of our methods with each baseline method | |
our_method_names = [ | |
name for name in model_names if name in our_methods] | |
other_method_names = [ | |
name for name in model_names if name not in our_methods] | |
all_pairs = [(our, other) | |
for our in our_method_names for other in other_method_names] | |
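# NOTE: because model_names was intersected with baseline_methods above, entries from
# our_methods are filtered out and all_pairs can end up empty here; the caller
# (get_next_eval_question) regenerates all_pairs from models_full afterwards.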
return { | |
"current_question_index": 0, | |
"pairwise_done": set(), | |
"pairwise_results": {}, | |
"scoring_done_pairs": set(), | |
"pairwise_scores": {}, | |
"all_pairs": all_pairs, | |
"all_models": model_names, | |
"current_pair_index": 0, | |
"current_score_pair_index": 0, | |
"mode": "pairwise", # Initialize with pairwise mode | |
} | |
def _create_reference_answer_component(correct_answer, include_correct_answer=True): | |
""" | |
Helper function to create reference answer component. | |
This centralizes the reference answer creation logic for consistency | |
across different functions. | |
Args: | |
correct_answer: The correct answer text | |
include_correct_answer: Whether to include the correct answer | |
Returns: | |
gr.Markdown component with correct answer or None | |
""" | |
return gr.Markdown(correct_answer) if include_correct_answer and correct_answer else None | |
def get_next_eval_question(user_info, our_methods, return_user_info=True, include_correct_answer=True): | |
""" | |
Fetch the next evaluation question and its initial state.
Responsibilities:
1. Validate user input
2. Load the question data
3. Initialize or load the per-question progress state
4. Call advance_workflow to obtain the UI rendering
Args: | |
user_info (dict): User information dictionary containing: | |
- name: User's name | |
- email: User's email | |
- specialty: Primary medical specialty | |
- subspecialty: Medical subspecialty | |
- years_exp: Years of experience | |
- exp_explanation: Experience explanation | |
- npi_id: NPI ID | |
- evaluator_id: Evaluator ID | |
- question_id: Question ID (optional) | |
our_methods: List of our methods | |
return_user_info: Whether to return user info | |
include_correct_answer: Whether to include correct answer | |
""" | |
# Extract individual fields from user_info for compatibility | |
name = user_info.get('name') | |
email = user_info.get('email') | |
specialty_dd = user_info.get('specialty') | |
subspecialty_dd = user_info.get('subspecialty') | |
years_exp_radio = user_info.get('years_exp') | |
exp_explanation_tb = user_info.get('exp_explanation') | |
npi_id = user_info.get('npi_id') | |
evaluator_id = user_info.get('evaluator_id') | |
# 1. Validate user input
validation_error = validate_required_fields( | |
name, email, evaluator_id, specialty_dd, years_exp_radio) | |
if validation_error: | |
# return None, gr.update(visible=True), gr.update(visible=False), "Wrong info.", None, 0, None | |
return None, gr.update(visible=True), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), "Wrong info.", None, 0, None | |
# 2. Fetch the evaluator-to-question mapping
question_map_path = hf_hub_download( | |
repo_id=REPO_ID, | |
filename=EVALUATOR_MAP_DICT, | |
repo_type="dataset", | |
revision="main", | |
token=os.getenv("HF_TOKEN") | |
) | |
# Load the question map
with open(question_map_path, 'r') as f: | |
question_map = json.load(f) | |
# print(f"\033[91m{question_map}\033[0m") | |
# Look up the evaluator's data directory
evaluator_directory = question_map.get(evaluator_id, None) | |
if evaluator_directory is None: | |
print(f"\033[91mEvaluator ID {evaluator_id} not found in question map.\033[0m") | |
return None, gr.update(visible=True), gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), "Invalid Evaluator ID, please try again.", None, 0, None | |
all_files = list_repo_files( | |
repo_id=REPO_ID, | |
repo_type="dataset", | |
revision="main", | |
token=os.getenv("HF_TOKEN") | |
) | |
# 3. Get the questions available to this evaluator
full_question_ids_list, data_by_filename = get_evaluator_questions( | |
evaluator_id, all_files, evaluator_directory, our_methods) | |
if len(full_question_ids_list) == 0: | |
# No questions remain for this evaluator: keep the 9-tuple shape with remaining_count = 0
return None, None, None, None, None, None, None, 0, None
# Determine the current question ID and collect the model data
full_question_ids_list = sorted( | |
full_question_ids_list, key=lambda x: str(x[0])+str(x[1])) | |
q_id = full_question_ids_list[0][0] | |
question_pairs = [ | |
pair for pair in full_question_ids_list if pair[0] == q_id] | |
# Build the list of unique model names
unique_model_names = [] | |
for _, a, b in question_pairs: | |
if a not in unique_model_names: | |
unique_model_names.append(a) | |
if b not in unique_model_names: | |
unique_model_names.append(b) | |
# Assemble the full model entries
models_full = [] | |
for name in unique_model_names: | |
entry = next( | |
(e for e in data_by_filename[name] if preprocess_question_id( | |
e.get("id")) == q_id), | |
None | |
) | |
models_full.append({ | |
"model": name, | |
"reasoning_trace": entry.get("solution") if entry else "" | |
}) | |
# Load or initialize progress for this question
progress_state = load_progress_state(evaluator_id, q_id) | |
if progress_state is None: | |
progress_state = initialize_question_progress(models_full) | |
progress_state['evaluator_id'] = evaluator_id | |
# Regenerate all_pairs from the current models
our_names = [m['model'] for m in models_full if m['model'] in our_methods] | |
other_names = [m['model'] | |
for m in models_full if m['model'] not in our_methods] | |
fresh_pairs = [(our, other) for our in our_names for other in other_names] | |
progress_state['all_pairs'] = fresh_pairs | |
# Prune completed comparisons and scores, keeping only valid pairs
progress_state['pairwise_done'] = { | |
pair for pair in progress_state.get('pairwise_done', set()) | |
if pair in fresh_pairs | |
} | |
progress_state['scoring_done_pairs'] = { | |
pair for pair in progress_state.get('scoring_done_pairs', set()) | |
if pair in fresh_pairs | |
} | |
# Prepare the question object
question_text = None | |
correct_answer = None | |
for e in data_by_filename[unique_model_names[0]]: | |
if preprocess_question_id(e.get("id")) == q_id: | |
question_text = e.get("question") | |
if include_correct_answer: | |
correct_answer = e.get("correct_answer") | |
break | |
data_subset_state = { | |
"question": question_text, | |
"id": q_id, | |
"models_full": models_full | |
} | |
if include_correct_answer: | |
data_subset_state["correct_answer"] = correct_answer | |
# Store reference answer component data for later extraction | |
data_subset_state["reference_answer"] = _create_reference_answer_component(correct_answer, include_correct_answer) | |
else: | |
data_subset_state["reference_answer"] = _create_reference_answer_component(None, include_correct_answer) | |
# Store remaining count in progress_state for progress display | |
progress_state['remaining_count'] = len(full_question_ids_list) | |
# Create the user info object (update question_id if not already set)
if return_user_info: | |
updated_user_info = user_info.copy() | |
updated_user_info['question_id'] = q_id | |
else: | |
updated_user_info = None | |
# 4. Call advance_workflow to obtain the initial UI updates
ui_updates = advance_workflow(progress_state, data_subset_state) | |
# Use the mode-appropriate content returned by advance_workflow, selected via a unified key mapping
# advance_workflow already handles mode selection and content preparation via extract_ui_content_by_mode
chat_a_answer = ui_updates.get('chat_a_answer') | |
chat_b_answer = ui_updates.get('chat_b_answer') | |
chat_a_reasoning = ui_updates.get('chat_a_reasoning') | |
chat_b_reasoning = ui_updates.get('chat_b_reasoning') | |
page_prompt = ui_updates.get('page1_prompt') | |
# Return the user info and UI updates, using the content provided by advance_workflow
return ( | |
updated_user_info, | |
chat_a_answer, # Mode-appropriate content provided by advance_workflow
chat_b_answer, # Content appropriate for the current mode
chat_a_reasoning, # Content appropriate for the current mode
chat_b_reasoning, # Content appropriate for the current mode
page_prompt, # Prompt appropriate for the current mode
data_subset_state, | |
len(full_question_ids_list), | |
ui_updates['progress_state'] | |
) | |
# ==================== UNIFIED WORKFLOW MANAGEMENT ==================== | |
def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair): | |
""" | |
Extract UI content based on current mode (pairwise vs scoring). | |
This centralizes content preparation logic that was duplicated | |
across functions. | |
""" | |
models = data_subset_state.get('models_full', []) | |
model_a = next(m for m in models if m['model'] == next_pair[0]) | |
model_b = next(m for m in models if m['model'] == next_pair[1]) | |
# Create model list for compatibility with original code | |
data_subset_state['models'] = [model_a, model_b] | |
# Format chat content | |
chat_A_answer, chat_A_reasoning, _ = format_chat( | |
model_a['reasoning_trace'], tool_database_labels) | |
chat_B_answer, chat_B_reasoning, _ = format_chat( | |
model_b['reasoning_trace'], tool_database_labels) | |
# Format prompt based on mode | |
prompt_html = ( | |
f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; ' | |
f'padding: 10px; border-radius: 5px; color: black;">' | |
f'<strong>Question:</strong> {data_subset_state["question"]}</div>' | |
) | |
chat_a_answer = gr.Chatbot( | |
value=chat_A_answer, | |
type="messages", | |
height=200, | |
label="Model A Answer", | |
show_copy_button=False, | |
show_label=True, | |
render_markdown=True, | |
avatar_images=None, | |
rtl=False, | |
autoscroll=False, | |
) | |
chat_b_answer = gr.Chatbot( | |
value=chat_B_answer, | |
type="messages", | |
height=200, | |
label="Model B Answer", | |
show_copy_button=False, | |
show_label=True, | |
render_markdown=True, | |
avatar_images=None, | |
rtl=False, | |
autoscroll=False, | |
) | |
chat_a_reasoning = gr.Chatbot( | |
value=chat_A_reasoning, | |
type="messages", | |
height=300, | |
label="Model A Reasoning - Rationale", | |
show_copy_button=False, | |
show_label=True, | |
render_markdown=True, | |
avatar_images=None, | |
rtl=False, | |
autoscroll=False, | |
) | |
chat_b_reasoning = gr.Chatbot( | |
value=chat_B_reasoning, | |
type="messages", | |
height=300, | |
label="Model B Reasoning - Rationale", | |
show_copy_button=False, | |
show_label=True, | |
render_markdown=True, | |
avatar_images=None, | |
rtl=False, | |
autoscroll=False, | |
) | |
current_mode = progress_state.get('mode', 'pairwise') | |
return { | |
'chat_a_answer': chat_a_answer, # Model A final answer
'chat_b_answer': chat_b_answer, # Model B final answer
'chat_a_reasoning': chat_a_reasoning, # Model A reasoning trace
'chat_b_reasoning': chat_b_reasoning, # Model B reasoning trace
'page1_prompt': gr.HTML(prompt_html), # Question prompt rendered as HTML
'chat_a_page2': None, # Legacy page-2 fields (unused in the unified layout)
'chat_b_page2': None,
'page2_prompt': None,
} | |
def _extract_pairwise_choice(progress_state, index): | |
""" | |
Extract the pairwise comparison choice for a given criterion index. | |
Args: | |
progress_state: The current progress state containing pairwise results | |
index: The criterion index to extract choice for | |
Returns: | |
The pairwise choice for the given criterion, or None if not found | |
""" | |
if not (progress_state and | |
'current_score_pair_index' in progress_state and | |
'all_pairs' in progress_state and | |
'pairwise_results' in progress_state): | |
return None | |
current_pair_idx = progress_state['current_score_pair_index'] | |
all_pairs = progress_state['all_pairs'] | |
if current_pair_idx >= len(all_pairs): | |
return None | |
current_pair = all_pairs[current_pair_idx] | |
pairwise_results_for_pair = progress_state['pairwise_results'].get(current_pair) | |
if pairwise_results_for_pair and index < len(pairwise_results_for_pair): | |
return pairwise_results_for_pair[index] | |
return None | |
def _apply_rating_restrictions(pairwise_choice, score_a, score_b, include_values=True): | |
""" | |
Apply rating restrictions based on pairwise comparison choice. | |
Args: | |
pairwise_choice: The pairwise comparison choice (raw or normalized) | |
score_a: Current score for model A | |
score_b: Current score for model B | |
include_values: Whether to include current values in the updates (for initial load) | |
Returns: | |
Tuple of (update_for_A, update_for_B) gradio updates | |
""" | |
base_choices = ["1", "2", "3", "4", "5", "Unable to Judge"] | |
# Helper function to create gradio update | |
def create_update(choices, score, include_value): | |
if include_value and score is not None: | |
valid_value = score if score in choices else None | |
return gr.update(choices=choices, value=valid_value) | |
return gr.update(choices=choices) | |
# Helper to parse int safely | |
def to_int(x): | |
try: | |
return int(x) | |
except (ValueError, TypeError): | |
return None | |
# Normalize pairwise choice | |
normalized_choice = mapping.get(pairwise_choice, pairwise_choice) | |
# Default: full choices available | |
choices_a = choices_b = base_choices | |
# Apply restrictions based on pairwise choice | |
if normalized_choice == "A": | |
a_int, b_int = to_int(score_a), to_int(score_b) | |
if a_int is not None: | |
choices_b = [str(i) for i in range(1, a_int + 1)] + ["Unable to Judge"] | |
if b_int is not None: | |
choices_a = [str(i) for i in range(b_int, 6)] + ["Unable to Judge"] | |
elif normalized_choice == "B": | |
a_int, b_int = to_int(score_a), to_int(score_b) | |
if b_int is not None: | |
choices_a = [str(i) for i in range(1, b_int + 1)] + ["Unable to Judge"] | |
if a_int is not None: | |
choices_b = [str(i) for i in range(a_int, 6)] + ["Unable to Judge"] | |
elif normalized_choice == "tie": | |
# Both must have same value | |
if score_a is not None: | |
choices_b = [score_a] | |
if score_b is not None: | |
choices_a = [score_b] | |
# Create updates | |
include_value_a = include_values and score_a is not None | |
include_value_b = include_values and score_b is not None | |
upd_A = create_update(choices_a, score_a, include_value_a) | |
upd_B = create_update(choices_b, score_b, include_value_b) | |
return upd_A, upd_B | |
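# Worked example (illustrative): if Model A was judged better on a criterion and A is
# scored "3", B's choices collapse to ["1", "2", "3", "Unable to Judge"]; if instead
# only B is scored, say "2", A's choices become ["2", "3", "4", "5", "Unable to Judge"].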
def advance_workflow(progress_state, data_subset_state, current_pairwise=None, current_scoring=None): | |
""" | |
Unified workflow manager that handles all state transitions and UI updates. | |
Args: | |
progress_state: Current progress state (should contain remaining_count if available) | |
data_subset_state: Current data subset state | |
current_pairwise: Current pairwise comparison values (for validation) | |
current_scoring: Current scoring values (for validation) | |
""" | |
# print(f"Advance workflow called, previous mode: {progress_state.get('mode')}") | |
# print(progress_state) | |
# Validate input for pairwise comparisons | |
if current_pairwise is not None and any(answer is None for answer in current_pairwise): | |
missing_comparisons = [] | |
for i, answer in enumerate(current_pairwise): | |
if answer is None: | |
missing_comparisons.append(criteria_for_comparison[i]['label']) | |
missing_text = ", ".join(missing_comparisons) | |
error_msg = f"Your response is missing for: {missing_text}" | |
gr.Info(error_msg) | |
return { | |
'progress_state': progress_state, | |
'page1_visible': gr.update(visible=True), # Keep page1 visible | |
'chat_a_answer': gr.update(), # Keep chat_a unchanged | |
'chat_b_answer': gr.update(), # Keep chat_b unchanged | |
'page1_prompt': gr.update(), # Keep page1_prompt unchanged | |
'chat_a_reasoning': gr.update(), # Keep chat_a_page2 unchanged | |
'chat_b_reasoning': gr.update(), # Keep chat_b_page2 unchanged | |
} | |
# Validate input for scoring | |
if current_scoring is not None and (any(answer is None for answer in current_scoring[0]) or any(answer is None for answer in current_scoring[1])): | |
ratings_A, ratings_B = current_scoring | |
if any(rating is None for rating in ratings_A) or any(rating is None for rating in ratings_B): | |
gr.Warning("Error: Please provide ratings for all criteria for both models.", | |
duration=5) | |
return { | |
'progress_state': progress_state, | |
'page1_visible': gr.update(visible=True), # Show page1 | |
'chat_a_answer': gr.update(), # Keep chat_a unchanged | |
'chat_b_answer': gr.update(), # Keep chat_b unchanged | |
'page1_prompt': gr.update(), # Keep page1_prompt unchanged | |
'chat_a_reasoning': gr.update(), # Keep chat_a_page2 unchanged | |
'chat_b_reasoning': gr.update(), # Keep chat_b_page2 unchanged | |
} | |
# 1. Determine next task based on current progress | |
next_pair = get_next_uncompleted_pair(progress_state) | |
# 2. Determine workflow phase and set mode | |
if next_pair is not None: | |
progress_state['mode'] = 'pairwise' | |
print(f"Pairwise mode: next pair {next_pair}") | |
else: | |
# Current question completed, but this doesn't mean all questions are done | |
# The caller (submit_pairwise_scoring) will handle question transitions | |
progress_state['mode'] = 'current_question_completed' | |
print("Current question completed - awaiting next question") | |
# 3. Create base UI update structure | |
current_mode = progress_state.get('mode', 'pairwise') | |
ui_updates = { | |
'progress_state': progress_state, | |
'page1_visible': gr.update(visible=True), | |
'chat_a_answer': None, | |
'chat_b_answer': None, | |
'page1_prompt': None, | |
'chat_a_reasoning': None, | |
'chat_b_reasoning': None, | |
} | |
# 4. Extract content for current phase | |
if next_pair is not None: | |
# print("debug: Extracting UI content for next pair") | |
# print("progress_state:", progress_state) | |
# print("next_pair:", next_pair) | |
content_updates = extract_ui_content_by_mode(progress_state, data_subset_state, next_pair) | |
ui_updates.update(content_updates) | |
# 5. Calculate and add progress information | |
progress_info = calculate_progress_info(progress_state) | |
# Update progress bar headers with dynamic content | |
current_mode = progress_state.get('mode', 'pairwise') | |
if current_mode == 'pairwise': | |
ui_updates['pairwise_header'] = gr.update(value=f"## {progress_info['pairwise_progress_text']}") | |
ui_updates['pairwise_progress_text'] = progress_info['pairwise_progress_text'] | |
elif current_mode == 'current_question_completed': | |
# Current question is done, show completion status for this question | |
ui_updates['pairwise_header'] = gr.update(value="## Current Question Completed") | |
ui_updates['pairwise_progress_text'] = "Current question evaluation completed" | |
else: | |
# Completed mode (all questions done) | |
ui_updates['pairwise_header'] = gr.update(value="## All Evaluations Completed") | |
ui_updates['pairwise_progress_text'] = "All evaluations completed" | |
return ui_updates | |
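# advance_workflow returns a dict of UI updates keyed by component (progress_state,
# page1_visible, chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning,
# page1_prompt, pairwise_header, pairwise_progress_text) that callers map onto their outputs.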
def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *combined_values): | |
""" | |
Submit scoring results and proceed to the next step. | |
Simplified to use unified workflow management. | |
""" | |
# print(f"Input progress_state: {progress_state}") | |
# print(f"Pairwise comparisons: {combined_values}") | |
# Process input parameters | |
criteria_count = len_criteria | |
pairwise = list(combined_values[:criteria_count]) | |
comparison_reasons = list( | |
combined_values[criteria_count:criteria_count*2]) | |
ratings_A = list( | |
combined_values[criteria_count*2:criteria_count*3]) | |
ratings_B = list(combined_values[criteria_count*3:]) | |
pairwise = [mapping.get(choice, choice) for choice in pairwise] # Normalize choices | |
# Save current ratings - now store by method instead of by pair | |
pair = progress_state['all_pairs'][progress_state['current_score_pair_index']] | |
model_A, model_B = pair | |
gr.Info(f"Submitting your evaluation results and loading next question...") | |
# Validate input | |
if any(answer is None for answer in pairwise) or any(rating is None for rating in ratings_A) or any(rating is None for rating in ratings_B): | |
print("Error: Missing pairwise comparison answers.") | |
# Return current state with no changes - let advance_workflow handle the structure | |
ui_updates = advance_workflow(progress_state, data_subset_state, current_pairwise=pairwise, current_scoring=[ratings_A, ratings_B]) | |
return [ | |
gr.update(visible=False), # page0 | |
gr.update(visible=True), # page1 | |
"", # page0_error_box | |
ui_updates.get('page1_prompt'), # page1_prompt | |
user_info, # user_info_state | |
data_subset_state, # data_subset_state | |
ui_updates.get('progress_state'), # progress_state | |
progress_state.get('pairwise_results', {}), # pairwise_state | |
ui_updates.get('chat_a_answer'), # chat_a_answer | |
ui_updates.get('chat_b_answer'), # chat_b_answer | |
ui_updates.get('chat_a_reasoning'), # chat_a_reasoning | |
ui_updates.get('chat_b_reasoning'), # chat_b_reasoning | |
ui_updates.get('pairwise_header'), # pairwise_header | |
*([gr.update() for _ in range(len_criteria)]), # pairwise_inputs (keep current values) | |
*([gr.update() for _ in range(len_criteria)]), # comparison_reasons_inputs (keep current values) | |
*([gr.update() for _ in range(len_criteria)]), # ratings_A_page1 (keep current values) | |
*([gr.update() for _ in range(len_criteria)]), # ratings_B_page1 (keep current values) | |
] | |
# # Validate input - check if all ratings are provided | |
# if any(rating is None for rating in ratings_A) or any(rating is None for rating in ratings_B): | |
# print("Error: Missing ratings for one or more criteria.") | |
# # Return current state with no changes - let advance_workflow handle the structure | |
# ui_updates = advance_workflow(progress_state, data_subset_state, current_scoring=[ratings_A, ratings_B]) | |
# return [ | |
# gr.update(visible=False), # page0 | |
# gr.update(visible=True), # page1 | |
# "", # page0_error_box | |
# ui_updates.get('page1_prompt'), # page1_prompt | |
# user_info, # user_info_state | |
# data_subset_state, # data_subset_state | |
# ui_updates.get('progress_state'), # progress_state | |
# progress_state.get('pairwise_results', {}), # pairwise_state | |
# ui_updates.get('chat_a_answer'), # chat_a_answer | |
# ui_updates.get('chat_b_answer'), # chat_b_answer | |
# ui_updates.get('chat_a_reasoning'), # chat_a_reasoning | |
# ui_updates.get('chat_b_reasoning'), # chat_b_reasoning | |
# ui_updates.get('pairwise_header'), # pairwise_header | |
# *([gr.update() for _ in range(len_criteria)]), # pairwise_inputs (keep current values) | |
# *([gr.update() for _ in range(len_criteria)]), # comparison_reasons_inputs (keep current values) | |
# *([gr.update() for _ in range(len_criteria)]), # ratings_A_page1 (keep current values) | |
# *([gr.update() for _ in range(len_criteria)]), # ratings_B_page1 (keep current values) | |
# ] | |
# Initialize pairwise_scores as method-keyed dict if it doesn't exist | |
if 'pairwise_scores' not in progress_state: | |
progress_state['pairwise_scores'] = {} | |
progress_state['pairwise_results'][pair] = pairwise | |
progress_state['pairwise_done'].add(pair) | |
# Store scores by method name instead of by pair | |
progress_state['pairwise_scores'][model_A] = ratings_A | |
progress_state['pairwise_scores'][model_B] = ratings_B | |
# Save results to database like submit_pairwise_comparison does | |
# Build and save the row | |
row_dict = build_row_dict( | |
data_subset_state, user_info, pairwise, | |
comparison_reasons, ratings_A, ratings_B | |
) | |
append_to_sheet( | |
user_data=None, | |
custom_row_dict=row_dict, | |
custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME + | |
f"_{user_info['evaluator_id']}"), | |
add_header_when_create_sheet=True | |
) | |
# Check if current question is completed (all pairs done) | |
current_question_completed = (len(progress_state['pairwise_done']) == len(progress_state['all_pairs'])) | |
if not current_question_completed: | |
# Still have pairs to evaluate in current question | |
# Use unified workflow manager for within-question navigation | |
ui_updates = advance_workflow(progress_state, data_subset_state) | |
return [ | |
gr.update(visible=False), # page0 | |
gr.update(visible=True), # page1 | |
"", # page0_error_box | |
ui_updates.get('page1_prompt'), # page1_prompt | |
user_info, # user_info_state | |
data_subset_state, # data_subset_state | |
ui_updates.get('progress_state'), # progress_state | |
progress_state.get('pairwise_results', {}), # pairwise_state | |
ui_updates.get('chat_a_answer'), # chat_a_answer | |
ui_updates.get('chat_b_answer'), # chat_b_answer | |
ui_updates.get('chat_a_reasoning'), # chat_a_reasoning | |
ui_updates.get('chat_b_reasoning'), # chat_b_reasoning | |
ui_updates.get('pairwise_header'), # pairwise_header | |
*([gr.update(value=None) for _ in range(len_criteria)]), # pairwise_inputs (clear for new pair) | |
*([gr.update(value="") for _ in range(len_criteria)]), # comparison_reasons_inputs (clear for new pair) | |
*([gr.update(value=None) for _ in range(len_criteria)]), # ratings_A_page1 (clear for new pair) | |
*([gr.update(value=None) for _ in range(len_criteria)]), # ratings_B_page1 (clear for new pair) | |
] | |
# Get fresh question data when current question is completed | |
user_info, chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, page1_prompt, data_subset_state, remaining_count, progress_state = get_next_eval_question( | |
user_info, our_methods | |
) | |
if remaining_count == 0: # Handle completion | |
gr.Info("You have no more questions to evaluate. You may exit the page; we will follow-up if we require anything else from you. Thank you!") | |
# Create a completion state for advance_workflow to handle properly | |
if progress_state is None: | |
progress_state = {'mode': 'completed'} | |
else: | |
progress_state['mode'] = 'completed' | |
# Use advance_workflow for completion state | |
ui_updates = advance_workflow(progress_state, data_subset_state) | |
return [ | |
gr.update(visible=False), # page0 | |
gr.update(visible=True), # page1 | |
"", # page0_error_box | |
ui_updates.get('page1_prompt', "## All Evaluations Completed"), # page1_prompt | |
user_info, # user_info_state | |
data_subset_state, # data_subset_state | |
progress_state, # progress_state | |
progress_state.get('pairwise_results', {}) if progress_state else {}, # pairwise_state | |
ui_updates.get('chat_a_answer', []), # chat_a_answer | |
ui_updates.get('chat_b_answer', []), # chat_b_answer | |
ui_updates.get('chat_a_reasoning', []), # chat_a_reasoning | |
ui_updates.get('chat_b_reasoning', []), # chat_b_reasoning | |
ui_updates.get('pairwise_header', gr.update(value="## All Evaluations Completed")), # pairwise_header | |
*([gr.update(value=None) for _ in range(len_criteria)]), # pairwise_inputs (clear for completion) | |
*([gr.update(value="") for _ in range(len_criteria)]), # comparison_reasons_inputs (clear for completion) | |
*([gr.update(value=None) for _ in range(len_criteria)]), # ratings_A_page1 (clear for completion) | |
*([gr.update(value=None) for _ in range(len_criteria)]), # ratings_B_page1 (clear for completion) | |
] | |
# Calculate progress and show info message | |
num_remaining_questions = remaining_count // len(progress_state['all_pairs']) | |
gr.Info(f"The evaluation has been submitted. You are about to evaluate the next question. {num_remaining_questions} question(s) remaining to evaluate.") | |
# Store remaining count in progress_state for progress display | |
progress_state['remaining_count'] = remaining_count | |
# Use advance_workflow to get ALL UI updates for new question | |
ui_updates = advance_workflow(progress_state, data_subset_state) | |
# Return using ONLY advance_workflow results - complete delegation | |
return ( | |
gr.update(visible=False), # page0 | |
gr.update(visible=True), # page1 | |
"", # page0_error_box | |
ui_updates.get('page1_prompt', ""), # page1_prompt - use advance_workflow content | |
user_info, # user_info_state | |
data_subset_state, # data_subset_state - use fresh content | |
ui_updates.get('progress_state', progress_state), # progress_state - use advance_workflow content | |
progress_state.get('pairwise_results', {}), # pairwise_state | |
ui_updates.get('chat_a_answer', []), # chat_a_answer - use advance_workflow content | |
ui_updates.get('chat_b_answer', []), # chat_b_answer - use advance_workflow content | |
ui_updates.get('chat_a_reasoning', []), # chat_a_reasoning - use advance_workflow content | |
ui_updates.get('chat_b_reasoning', []), # chat_b_reasoning - use advance_workflow content | |
ui_updates.get('pairwise_progress_text', ""), # pairwise_header - use advance_workflow content | |
*([gr.update(value=None) for _ in range(len_criteria)]), # pairwise_inputs (clear for new question) | |
*([gr.update(value="") for _ in range(len_criteria)]), # comparison_reasons_inputs (clear for new question) | |
*([gr.update(value=None) for _ in range(len_criteria)]), # ratings_A_page1 (clear for new question) | |
*([gr.update(value=None) for _ in range(len_criteria)]), # ratings_B_page1 (clear for new question) | |
) | |
# --- Define Callback Functions for Confirmation Flow --- | |
def build_row_dict( | |
data_subset_state, | |
user_info, | |
pairwise, | |
comparison_reasons, | |
ratings_A_vals, | |
ratings_B_vals, | |
nonsense_btn_clicked=False | |
): | |
prompt_text = data_subset_state['question'] | |
response_A_model = data_subset_state['models'][0]['model'] | |
response_B_model = data_subset_state['models'][1]['model'] | |
timestamp = datetime.datetime.now().isoformat() | |
row = { | |
"Timestamp": timestamp, | |
"Name": user_info['name'], | |
"Email": user_info['email'], | |
"Evaluator ID": user_info['evaluator_id'], | |
"Specialty": str(user_info['specialty']), | |
"Subspecialty": str(user_info['subspecialty']), | |
"Years of Experience": user_info['years_exp'], | |
"Experience Explanation": user_info['exp_explanation'], | |
"NPI ID": user_info['npi_id'], | |
"Question ID": user_info['question_id'], | |
"Prompt": prompt_text, | |
"ResponseA_Model": response_A_model, | |
"ResponseB_Model": response_B_model, | |
"Question Makes No Sense or Biomedically Irrelevant": nonsense_btn_clicked, | |
} | |
pairwise = [mapping.get(val, val) for val in pairwise] | |
for i, crit in enumerate(criteria): | |
label = crit['label'] | |
row[f"Criterion_{label} Comparison: Which is Better?"] = pairwise[i] | |
row[f"Criterion_{label} Comments"] = comparison_reasons[i] | |
if ratings_A_vals is not None and ratings_B_vals is not None: | |
row[f"ScoreA_{label}"] = ratings_A_vals[i] | |
row[f"ScoreB_{label}"] = ratings_B_vals[i] | |
return row | |
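# The resulting row holds one "Criterion_<label> Comparison: Which is Better?" and
# "Criterion_<label> Comments" column per criterion, plus "ScoreA_<label>" /
# "ScoreB_<label>" when ratings are provided, alongside the evaluator metadata above.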
def restrict_choices(progress_state, index, score_a, score_b): | |
""" | |
Returns (update_for_A, update_for_B). | |
Enforces rating constraints based on the pairwise choice for the given criterion index. | |
""" | |
print( | |
f"Restricting choices for index {index} with scores A: {score_a}, B: {score_b}") | |
print( | |
f"Progress state keys: {list(progress_state.keys()) if progress_state else 'None'}") | |
# Extract the pairwise choice for the current criterion | |
pairwise_choice = _extract_pairwise_choice(progress_state, index) | |
if pairwise_choice is not None: | |
print( | |
f"Found pairwise choice for criterion {index}: {pairwise_choice}") | |
else: | |
print(f"No pairwise results found for criterion {index}") | |
# Skip if both scores are None | |
if score_a is None and score_b is None: | |
base = ["1", "2", "3", "4", "5", "Unable to Judge"] | |
return gr.update(choices=base), gr.update(choices=base) | |
# Apply restrictions using the shared utility function | |
return _apply_rating_restrictions(pairwise_choice, score_a, score_b, include_values=False) | |
def clear_selection(): | |
return None, None | |
def make_restrict_function(base_choices): | |
def restrict_choices_page1(radio_choice, score_a, score_b): | |
""" | |
Returns (update_for_A, update_for_B). | |
Enforces rating constraints based on the radio choice for page 1. | |
""" | |
# Helper to parse int safely | |
def to_int(x): | |
try: | |
# Extract number from "1 text..." format | |
return int(x.split()[0]) | |
except (ValueError, TypeError, AttributeError): | |
return None | |
# Default: no restrictions, but ensure current values are valid | |
upd_A = gr.update(choices=base_choices, | |
value=score_a if score_a in base_choices else None) | |
upd_B = gr.update(choices=base_choices, | |
value=score_b if score_b in base_choices else None) | |
# Skip if no meaningful pairwise choice | |
if radio_choice is None or radio_choice == "Neither model did well.": | |
return upd_A, upd_B | |
a_int = to_int(score_a) | |
b_int = to_int(score_b) | |
# Apply Restrictions based on radio choice | |
if radio_choice == "Model A is better.": | |
# Rule: A >= B | |
if a_int is not None and b_int is not None: | |
# Both are numeric, enforce A >= B | |
if a_int < b_int: | |
# Violation: A < B, reset the one that doesn't match the constraint | |
upd_A = gr.update(choices=base_choices, value=None) | |
upd_B = gr.update(choices=base_choices, value=None) | |
else: | |
# Valid: A >= B, apply mutual restrictions | |
allowed_a_choices = [choice for choice in base_choices if to_int( | |
choice) is None or to_int(choice) >= b_int] | |
allowed_b_choices = [choice for choice in base_choices if to_int( | |
choice) is None or to_int(choice) <= a_int] | |
upd_A = gr.update( | |
choices=allowed_a_choices, value=score_a if score_a in allowed_a_choices else None) | |
upd_B = gr.update( | |
choices=allowed_b_choices, value=score_b if score_b in allowed_b_choices else None) | |
elif a_int is not None: | |
# Only A is numeric, B must be <= A | |
allowed_b_choices = [choice for choice in base_choices if to_int( | |
choice) is None or to_int(choice) <= a_int] | |
upd_B = gr.update( | |
choices=allowed_b_choices, value=score_b if score_b in allowed_b_choices else None) | |
elif b_int is not None: | |
# Only B is numeric, A must be >= B | |
allowed_a_choices = [choice for choice in base_choices if to_int( | |
choice) is None or to_int(choice) >= b_int] | |
upd_A = gr.update( | |
choices=allowed_a_choices, value=score_a if score_a in allowed_a_choices else None) | |
# If both are "Unable to Judge", no restrictions needed | |
elif radio_choice == "Model B is better.": | |
# Rule: B >= A | |
if a_int is not None and b_int is not None: | |
# Both are numeric, enforce B >= A | |
if b_int < a_int: | |
# Violation: B < A, reset both | |
upd_A = gr.update(choices=base_choices, value=None) | |
upd_B = gr.update(choices=base_choices, value=None) | |
else: | |
# Valid: B >= A, apply mutual restrictions | |
allowed_a_choices = [choice for choice in base_choices if to_int( | |
choice) is None or to_int(choice) <= b_int] | |
allowed_b_choices = [choice for choice in base_choices if to_int( | |
choice) is None or to_int(choice) >= a_int] | |
upd_A = gr.update( | |
choices=allowed_a_choices, value=score_a if score_a in allowed_a_choices else None) | |
upd_B = gr.update( | |
choices=allowed_b_choices, value=score_b if score_b in allowed_b_choices else None) | |
elif a_int is not None: | |
# Only A is numeric, B must be >= A | |
allowed_b_choices = [choice for choice in base_choices if to_int( | |
choice) is None or to_int(choice) >= a_int] | |
upd_B = gr.update( | |
choices=allowed_b_choices, value=score_b if score_b in allowed_b_choices else None) | |
elif b_int is not None: | |
# Only B is numeric, A must be <= B | |
allowed_a_choices = [choice for choice in base_choices if to_int( | |
choice) is None or to_int(choice) <= b_int] | |
upd_A = gr.update( | |
choices=allowed_a_choices, value=score_a if score_a in allowed_a_choices else None) | |
elif radio_choice == "Both models are equally good.": | |
# Rule: A == B | |
if a_int is not None and b_int is not None: | |
# Both are numeric | |
if a_int == b_int: | |
# Valid: A == B, restrict both to the same value | |
upd_A = gr.update(choices=[score_a], value=score_a) | |
upd_B = gr.update(choices=[score_b], value=score_b) | |
else: | |
# Invalid: A != B, reset both | |
upd_A = gr.update(choices=base_choices, value=None) | |
upd_B = gr.update(choices=base_choices, value=None) | |
elif a_int is not None: | |
# A is numeric, B must match A | |
upd_B = gr.update(choices=[score_a], value=score_a) | |
elif b_int is not None: | |
# B is numeric, A must match B | |
upd_A = gr.update(choices=[score_b], value=score_b) | |
elif score_a == "Unable to Judge." and score_b == "Unable to Judge.": | |
# Both are "Unable to Judge", restrict both to that | |
upd_A = gr.update( | |
choices=["Unable to Judge."], value="Unable to Judge.") | |
upd_B = gr.update( | |
choices=["Unable to Judge."], value="Unable to Judge.") | |
elif score_a == "Unable to Judge.": | |
# A is "Unable to Judge", B must match | |
upd_B = gr.update( | |
choices=["Unable to Judge."], value="Unable to Judge.") | |
elif score_b == "Unable to Judge.": | |
# B is "Unable to Judge", A must match | |
upd_A = gr.update( | |
choices=["Unable to Judge."], value="Unable to Judge.") | |
# If neither has a value, no restrictions needed | |
return upd_A, upd_B | |
return restrict_choices_page1 | |
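# Illustrative sketch (defined but never called by the app): how the page-1 restriction is | |
# expected to behave. Using the "Task success" score options as the base choices, selecting | |
# "Model A is better." and giving A the top score leaves every option available for B, while | |
# giving A the "4 ..." score should drop the "5 ..." option from B's list. | |
def _example_restrict_page1(): | |
    base = sorted(criteria[0]["scores"]) | |
    restrict = make_restrict_function(base) | |
    # Hypothetical inputs: a pairwise choice plus a score for Model A only. | |
    upd_a, upd_b = restrict( | |
        "Model A is better.", | |
        "4 Mostly correct, with only minor issues. ", | |
        None, | |
    ) | |
    # upd_b is a gr.update(...) whose restricted choices should keep only options whose | |
    # leading number is <= 4, plus the non-numeric "Unable to Judge." option. | |
    return upd_a, upd_b | |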
centered_col_css = """ | |
#centered-column { | |
margin-left: auto; | |
margin-right: auto; | |
max-width: 800px; /* Adjust this width as desired */ | |
width: 100%; | |
} | |
#participate-btn { | |
background-color: purple !important; | |
color: white !important; | |
border-color: purple !important; | |
} | |
#answer-reference-btn { | |
/* Light‑mode palette */ | |
--btn-bg: #E0F2FF; /* soft pastel blue */ | |
--btn-text: #00334D; /* dark slate for good contrast */ | |
--btn-border: #E0F2FF; | |
background-color: var(--btn-bg) !important; | |
color: var(--btn-text) !important; | |
border: 1px solid var(--btn-border) !important; | |
} | |
/* Dark‑mode overrides */ | |
@media (prefers-color-scheme: dark) { | |
#answer-reference-btn { | |
--btn-bg: #2C6E98; /* muted steel blue for dark backgrounds */ | |
--btn-text: #FFFFFF; /* switch to white text for contrast */ | |
--btn-border: #2C6E98; | |
} | |
} | |
#clear_btn { | |
background-color: #F08080 !important; | |
color: white !important; | |
border-color: #F08080 !important; | |
} | |
.reference-box { | |
border: 1px solid #ccc; | |
padding: 10px; | |
border-radius: 5px; | |
} | |
.short-btn { min-width: 80px !important; max-width: 120px !important; width: 100px !important; padding-left: 4px !important; padding-right: 4px !important; } | |
.light-stop-btn { background-color: #ffcccc !important; color: #b30000 !important; border-color: #ffcccc !important; } | |
.criteria-radio-score-label [role="radiogroup"], | |
.criteria-radio-score-label .gr-radio-group, | |
.criteria-radio-score-label .flex { | |
display: flex !important; | |
flex-direction: column !important; | |
gap: 4px !important; /* line spacing; adjust as needed */ | |
} | |
/* More specific selectors to ensure a vertical layout */ | |
.criteria-radio-score-label fieldset { | |
display: flex !important; | |
flex-direction: column !important; | |
gap: 4px !important; | |
} | |
.criteria-radio-score-label .wrap { | |
display: flex !important; | |
flex-direction: column !important; | |
gap: 4px !important; | |
} | |
/* Ensure each radio-button option is stacked vertically */ | |
.criteria-radio-score-label label { | |
display: block !important; | |
margin-bottom: 4px !important; | |
} | |
""" | |
with gr.Blocks(css=centered_col_css) as demo: | |
# States to save information between pages. | |
user_info_state = gr.State() | |
pairwise_state = gr.State() | |
scores_A_state = gr.State() | |
comparison_reasons = gr.State() | |
nonsense_btn_clicked = gr.State(False) | |
unqualified_A_state = gr.State() | |
data_subset_state = gr.State() | |
progress_state = gr.State() | |
# Load specialty data | |
specialties_path = "specialties.json" | |
subspecialties_path = "subspecialties.json" | |
try: | |
with open(specialties_path, 'r') as f: | |
specialties_list = json.load(f) | |
with open(subspecialties_path, 'r') as f: | |
subspecialties_list = json.load(f) | |
except FileNotFoundError: | |
print( | |
f"Error: Could not find specialty files at {specialties_path} or {subspecialties_path}. Please ensure these files exist.") | |
# Provide default empty lists or handle the error as appropriate | |
specialties_list = ["Error loading specialties"] | |
subspecialties_list = ["Error loading subspecialties"] | |
except json.JSONDecodeError: | |
print("Error: Could not parse JSON from specialty files.") | |
specialties_list = ["Error loading specialties"] | |
subspecialties_list = ["Error parsing subspecialties"] | |
# Page 0: Welcome / Informational page. | |
with gr.Column(visible=True, elem_id="page0") as page0: | |
gr.HTML(""" | |
<div> | |
<h1>TxAgent Portal: AI Agent Evaluation</h1> | |
</div> | |
""") | |
gr.Markdown("## Sign Up") | |
name = gr.Textbox(label="Name (required)", value="") | |
email = gr.Textbox( | |
label="Email (required). Important: Use the same email we provided in the invitation letter each time you log into the evaluation portal.", value="") | |
evaluator_id = gr.Textbox( | |
label="Evaluator ID (auto-filled from email above)", interactive=False, visible=False) | |
# Auto-sync evaluator_id with email | |
def sync_evaluator_id(email_value): | |
return email_value.strip()  # strip leading/trailing whitespace | |
email.change( | |
fn=sync_evaluator_id, | |
inputs=[email], | |
outputs=[evaluator_id] | |
) | |
specialty_dd = gr.Dropdown( | |
choices=specialties_list, label="Primary Medical Specialty (required). Visit https://www.abms.org/member-boards/specialty-subspecialty-certificates/ for categories.", multiselect=True, value=["None"], visible=False) | |
subspecialty_dd = gr.Dropdown( | |
choices=subspecialties_list, label="Subspecialty (if applicable). Visit https://www.abms.org/member-boards/specialty-subspecialty-certificates/ for categories.", multiselect=True, value=["None"], visible=False) | |
npi_id = gr.Textbox( | |
label="National Provider Identifier ID (optional). Visit https://npiregistry.cms.hhs.gov/search to find your NPI ID. Leave blank if you do not have an NPI ID.") | |
years_exp_radio = gr.Radio( | |
choices=["0-2 years", "3-5 years", "6-10 years", | |
"11-20 years", "20+ years", "Not Applicable"], | |
label="Years of experience in clinical and/or research activities related to your biomedical expertise (required).", | |
value="Not Applicable", | |
visible=False | |
) | |
exp_explanation_tb = gr.Textbox( | |
label="Briefly describe your expertise in AI (optional).") | |
page0_error_box = gr.Markdown("") | |
with gr.Row(): | |
next_btn_0 = gr.Button("Next") | |
gr.Markdown("""Click Next to start the study. Your progress will be saved after you submit each question. For questions or concerns, contact us directly. Thank you for participating! | |
""") | |
# gr.Markdown(""" | |
# ## Instructions: | |
# Please review these instructions and enter your information to begin: | |
# - Each session requires at least 5-10 minutes per question. | |
# - You can evaluate multiple questions; you will not repeat evaluations. | |
# - For each question, compare responses from two models and rate them (scale: 1-5). | |
# - If a question is unclear or irrelevant to biomedicine, click the RED BUTTON at the top of the comparison page. | |
# - Use the Back and Next buttons to edit responses before submission. | |
# - Use the Home Page button to return to the homepage; progress will save but not submit. | |
# - Submit answers to the current question before moving to the next. | |
# - You can pause between questions and return later; ensure current answers are submitted to save them. | |
# """) | |
# with open("anatomyofAgentResponse.jpg", "rb") as image_file: | |
# img = Image.open(image_file) | |
# new_size = (int(img.width * 0.5), int(img.height * 0.5)) | |
# img = img.resize(new_size, Image.LANCZOS) | |
# buffer = io.BytesIO() | |
# img.save(buffer, format="PNG") | |
# encoded_string = base64.b64encode( | |
# buffer.getvalue()).decode("utf-8") | |
# image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>' | |
# ReasoningTraceExampleHTML = f""" | |
# <div> | |
# {image_html} | |
# </div> | |
# """ | |
# gr.HTML(ReasoningTraceExampleHTML) | |
# Page 1: Pairwise Comparison. | |
with gr.Column(visible=False) as page1: | |
with gr.Accordion("Instructions", open=False): | |
gr.Markdown(""" | |
## Instructions: | |
Please review these instructions before you begin the evaluation: | |
- Each session requires at least 5-10 minutes per question. | |
- You can evaluate multiple questions; you will not repeat evaluations. | |
- For each question, compare responses from two models and rate them (scale: 1-5). | |
- If a question is unclear or irrelevant to biomedicine, click the RED BUTTON at the top of the comparison page. | |
- Use the Back and Next buttons to edit responses before submission. | |
- Use the Home Page button to return to the homepage; progress will save but not submit. | |
- Submit answers to the current question before moving to the next. | |
- You can pause between questions and return later; ensure current answers are submitted to save them. | |
""") | |
# Make the number controlled by question indexing! | |
pairwise_header = gr.Markdown("## Part 1/2: Pairwise Comparison") | |
gr.Markdown("") | |
gr.Markdown("") | |
# Add small red button and comments text box in the same row | |
page1_prompt = gr.HTML() | |
# --- Define four chat components: answer and reasoning for each model --- | |
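# With type="messages", each gr.Chatbot expects an OpenAI-style list of dicts such as | |
# [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]; the | |
# advance_workflow updates above are assumed to supply chat histories in that format. | |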
with gr.Row(): | |
# Model A components | |
with gr.Column(): | |
gr.Markdown("**Model A Response:**") | |
chat_a_answer = gr.Chatbot( | |
value=[], # Placeholder for chat history | |
type="messages", | |
height=200, | |
label="Model A Answer", | |
show_copy_button=False, | |
show_label=True, | |
render_markdown=True, | |
avatar_images=None, | |
rtl=False | |
) | |
# gr.Markdown("**Model A Reasoning:**") | |
chat_a_reasoning = gr.Chatbot( | |
value=[], | |
type="messages", | |
height=300, | |
label="Model A Reasoning - Rationale", | |
show_copy_button=False, | |
show_label=True, | |
render_markdown=True, | |
avatar_images=None, | |
rtl=False | |
) | |
# Model B components | |
with gr.Column(): | |
gr.Markdown("**Model B Response:**") | |
chat_b_answer = gr.Chatbot( | |
value=[], | |
type="messages", | |
height=200, | |
label="Model B Answer", | |
show_copy_button=False, | |
show_label=True, | |
render_markdown=True, | |
avatar_images=None, | |
rtl=False | |
) | |
# gr.Markdown("**Model B Reasoning:**") | |
chat_b_reasoning = gr.Chatbot( | |
value=[], | |
type="messages", | |
height=300, | |
label="Model B Reasoning - Rationale", | |
show_copy_button=False, | |
show_label=True, | |
render_markdown=True, | |
avatar_images=None, | |
rtl=False | |
) | |
comparison_reasons_inputs = [] # ADDED: list to store the free-text inputs | |
pairwise_inputs = [] | |
ratings_A_page1 = [] # Store rating components for page 1 | |
ratings_B_page1 = [] # Store rating components for page 1 | |
for i, crit_comp in enumerate(criteria_for_comparison): | |
# for crit in criteria_for_comparison: | |
crit_score = criteria[i] # Get the corresponding score criterion | |
restrict_fn = make_restrict_function(sorted(crit_score["scores"])) | |
# Add bold formatting | |
gr.Markdown(f"**{crit_comp['label']}**", | |
elem_classes="criteria-font-large") | |
radio = gr.Radio( | |
choices=[ | |
"Model A is better.", | |
"Model B is better.", | |
"Both models are equally good.", | |
"Neither model did well." | |
], | |
# Remove duplicate label since we have markdown above | |
label=crit_comp['text'], | |
elem_classes="criteria-radio-label" # <--- add class here | |
) | |
pairwise_inputs.append(radio) | |
# ADDED: free text under each comparison | |
# for i, crit in enumerate(criteria): | |
index_component = gr.Number( | |
value=i, visible=False, interactive=False) | |
# indices_for_change.append(index_component) | |
with gr.Row(): | |
with gr.Column(scale=1): | |
rating_a = gr.Radio(choices=sorted(crit_score["scores"]), # ["1", "2", "3", "4", "5", "Unable to Judge"], | |
label=f"Model A Response - {crit_score['text']}", | |
interactive=True, | |
elem_classes="criteria-radio-score-label") | |
with gr.Column(scale=1): | |
rating_b = gr.Radio(choices=sorted(crit_score["scores"]), # ["1", "2", "3", "4", "5", "Unable to Judge"], | |
label=f"Model B Response - {crit_score['text']}", | |
interactive=True, | |
elem_classes="criteria-radio-score-label") | |
# Add clear button and wire up the restrictions | |
with gr.Row(): | |
# wire each to re‐restrict the other on change | |
radio.change( | |
fn=restrict_fn, | |
inputs=[radio, rating_a, rating_b], | |
outputs=[rating_a, rating_b] | |
) | |
rating_a.change( | |
fn=restrict_fn, | |
inputs=[radio, rating_a, rating_b], | |
outputs=[rating_a, rating_b] | |
) | |
rating_b.change( | |
fn=restrict_fn, | |
inputs=[radio, rating_a, rating_b], | |
outputs=[rating_a, rating_b] | |
) | |
ratings_A_page1.append(rating_a) | |
ratings_B_page1.append(rating_b) | |
text_input = gr.Textbox( | |
# Remove label since we have markdown above | |
placeholder="Comments for your selection (optional)", | |
show_label=False, | |
# elem_classes="textbox-bold-label" | |
) | |
comparison_reasons_inputs.append(text_input) | |
with gr.Row(): | |
submit_btn_1 = gr.Button( | |
"Submit Evaluation", variant="primary", elem_id="submit_btn") | |
# Final Page: Thank you message. | |
with gr.Column(visible=False, elem_id="final_page") as final_page: | |
gr.Markdown( | |
"## You have no questions left to evaluate. Thank you for your participation!") | |
# Error Modal: For displaying validation errors. | |
with Modal("Error", visible=False, elem_id="error_modal") as error_modal: | |
error_message_box = gr.Markdown() | |
ok_btn = gr.Button("OK") | |
# Clicking OK hides the modal. | |
ok_btn.click(lambda: gr.update(visible=False), None, error_modal) | |
# --- Define Transitions Between Pages --- | |
# Transition from Page 0 (Welcome) to Page 1. | |
next_btn_0.click( | |
fn=go_to_eval_progress_modal, | |
inputs=[name, email, evaluator_id, specialty_dd, | |
subspecialty_dd, years_exp_radio, exp_explanation_tb, npi_id], | |
outputs=[ | |
page0, page1, page0_error_box, | |
page1_prompt, | |
user_info_state, data_subset_state, progress_state, pairwise_state, | |
chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, pairwise_header, | |
*pairwise_inputs, *comparison_reasons_inputs, | |
*ratings_A_page1, *ratings_B_page1 | |
], | |
scroll_to_output=True | |
) | |
# Submit Page 1 (pairwise comparisons, comments, and per-model ratings) and load the next question. | |
submit_btn_1.click( | |
fn=submit_pairwise_scoring, | |
inputs=[progress_state, data_subset_state, | |
user_info_state, | |
*pairwise_inputs, *comparison_reasons_inputs, | |
*ratings_A_page1, *ratings_B_page1], | |
outputs=[ | |
page0, page1, page0_error_box, | |
page1_prompt, | |
user_info_state, data_subset_state, progress_state, pairwise_state, | |
chat_a_answer, chat_b_answer, chat_a_reasoning, chat_b_reasoning, pairwise_header, | |
*pairwise_inputs, *comparison_reasons_inputs, | |
*ratings_A_page1, *ratings_B_page1 | |
], | |
scroll_to_output=True, | |
) | |
demo.launch(share=True, allowed_paths=["."]) | |