Spaces:

agenticx
/

TxAgentRAOEval

Sleeping

App Files Community

shgao committed on Jun 19

Commit

3622597

1 Parent(s): 495ce2c

update

Browse files

Files changed (3) hide show

README.md +2 -2
app.py +107 -43
utils.py +8 -8

README.md CHANGED Viewed

@@ -1,5 +1,5 @@
 ---
-title: CZI Evaluation
 emoji: 🌍
 colorFrom: yellow
 colorTo: indigo
@@ -9,7 +9,7 @@ app_file: app.py
 pinned: false
 ---
-The CZI Evaluation Portal is a Gradio-based web application designed to facilitate the human evaluation of TxAgent's responses compared to other models. Users log in with their credentials, receive questions relevant to their expertise, and then perform pairwise comparisons and detailed ratings of model responses.
 ---

 ---
+title: TxAgent RAO Evaluation
 emoji: 🌍
 colorFrom: yellow
 colorTo: indigo
 pinned: false
 ---
+The TxAgent Rare As One (RAO) Evaluation Portal is a Gradio-based web application designed to facilitate the human evaluation of TxAgent's responses compared to other models. Users log in with their credentials, receive questions relevant to their expertise, and then perform pairwise comparisons and detailed ratings of model responses.
 ---

app.py CHANGED Viewed

@@ -60,7 +60,6 @@ tool_database_labels = {
     if key in tool_database_labels_raw
 }
-# Define the six evaluation criteria as a list of dictionaries.
 # Define the six evaluation criteria as a list of dictionaries.
 criteria = [
     {
@@ -76,7 +75,7 @@ criteria = [
         ]
     },
     {
-        "label": "Justification helpfulness",
         "text": "Is the model’s rationale helpful in determining whether the answer is correct?",
         "scores": [
             "1 No usable rationale. ",
@@ -170,7 +169,7 @@ criteria_for_comparison = [
         )
     },
     {
-        "label": "Justification helpfulness",
         "text": (
             "Which response offers a clearer, more detailed rationale that genuinely aids you in judging whether the answer is correct?"
         )
@@ -355,10 +354,14 @@ def validate_required_fields(name, email, evaluator_id, specialty_dd, years_exp_
 # --- Calculate progress information ---
-def calculate_progress_info(progress_state):
     """
     Calculate progress information for pairwise comparisons.
     Returns:
         dict: Contains progress information including:
             - pairwise_completed: number of completed pairwise comparisons
@@ -385,8 +388,16 @@ def calculate_progress_info(progress_state):
     # Calculate remaining
     pairwise_remaining = total_pairs - pairwise_done
-    # Create progress text
-    pairwise_progress_text = f"Currrent Question Evaluation Progress: {pairwise_done}/{total_pairs} pairs completed ({pairwise_remaining} remaining)"
     return {
         'pairwise_completed': pairwise_done,
@@ -816,6 +827,9 @@ def get_next_eval_question(user_info, our_methods, return_user_info=True, includ
     else:
         data_subset_state["reference_answer"] = _create_reference_answer_component(None, include_correct_answer)
     # Create the user info object (update question_id if not already set)
     if return_user_info:
         updated_user_info = user_info.copy()
@@ -875,7 +889,7 @@ def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair):
     prompt_html = (
         f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; '
         f'padding: 10px; border-radius: 5px; color: black;">'
-        f'<strong>Prompt:</strong> {data_subset_state["question"]}</div>'
     )
     chat_a_answer = gr.Chatbot(
         value=chat_A_answer,
@@ -905,7 +919,7 @@ def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair):
         value=chat_A_reasoning,
         type="messages",
         height=300,
-        label="Model A Reasoning",
         show_copy_button=False,
         show_label=True,
         render_markdown=True,
@@ -917,7 +931,7 @@ def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair):
         value=chat_B_reasoning,
         type="messages",
         height=300,
-        label="Model B Reasoning",
         show_copy_button=False,
         show_label=True,
         render_markdown=True,
@@ -1043,6 +1057,12 @@ def _apply_rating_restrictions(pairwise_choice, score_a, score_b, include_values
 def advance_workflow(progress_state, data_subset_state, current_pairwise=None, current_scoring=None):
     """
     Unified workflow manager that handles all state transitions and UI updates.
     """
     # print(f"Advance workflow called, previous mode: {progress_state.get('mode')}")
     # print(progress_state)
@@ -1302,7 +1322,10 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *combi
     # Calculate progress and show info message
     num_remaining_questions = remaining_count // len(progress_state['all_pairs'])
-    gr.Info(f"You are about to evaluate the next question. You have {num_remaining_questions} question(s) remaining to evaluate.")
     # Use advance_workflow to get ALL UI updates for new question
     ui_updates = advance_workflow(progress_state, data_subset_state)
@@ -1576,6 +1599,33 @@ centered_col_css = """
 .short-btn { min-width: 80px !important; max-width: 120px !important; width: 100px !important; padding-left: 4px !important; padding-right: 4px !important; }
 .light-stop-btn { background-color: #ffcccc !important; color: #b30000 !important; border-color: #ffcccc !important; }
 """
 with gr.Blocks(css=centered_col_css) as demo:
@@ -1656,38 +1706,52 @@ with gr.Blocks(css=centered_col_css) as demo:
         gr.Markdown("""By clicking 'Next' below, you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
         """)
-        gr.Markdown("""
-                ## Instructions:
-                Please review these instructions and enter your information to begin:
-                - Each session requires at least 5-10 minutes per question.
-                - You can evaluate multiple questions; you will not repeat evaluations.
-                - For each question, compare responses from two models and rate them (scale: 1-5).
-                - If a question is unclear or irrelevant to biomedicine, click the RED BUTTON at the top of the comparison page.
-                - Use the Back and Next buttons to edit responses before submission.
-                - Use the Home Page button to return to the homepage; progress will save but not submit.
-                - Submit answers to the current question before moving to the next.
-                - You can pause between questions and return later; ensure current answers are submitted to save them.
-            """)
-        with open("anatomyofAgentResponse.jpg", "rb") as image_file:
-            img = Image.open(image_file)
-            new_size = (int(img.width * 0.5), int(img.height * 0.5))
-            img = img.resize(new_size, Image.LANCZOS)
-            buffer = io.BytesIO()
-            img.save(buffer, format="PNG")
-            encoded_string = base64.b64encode(
-                buffer.getvalue()).decode("utf-8")
-        image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
-        ReasoningTraceExampleHTML = f"""
-            <div>
-                {image_html}
-            </div>
-            """
-        gr.HTML(ReasoningTraceExampleHTML)
     # Page 1: Pairwise Comparison.
     with gr.Column(visible=False) as page1:
         # Make the number controlled by question indexing!
         pairwise_header = gr.Markdown("## Part 1/2: Pairwise Comparison")
         gr.Markdown("")
@@ -1716,7 +1780,7 @@ with gr.Blocks(css=centered_col_css) as demo:
                     value=[],
                     type="messages",
                     height=300,
-                    label="Model A Reasoning",
                     show_copy_button=False,
                     show_label=True,
                     render_markdown=True,
@@ -1742,7 +1806,7 @@ with gr.Blocks(css=centered_col_css) as demo:
                     value=[],
                     type="messages",
                     height=300,
-                    label="Model B Reasoning",
                     show_copy_button=False,
                     show_label=True,
                     render_markdown=True,
@@ -1787,12 +1851,12 @@ with gr.Blocks(css=centered_col_css) as demo:
                     rating_a = gr.Radio(choices=sorted(crit_score["scores"]),  # ["1", "2", "3", "4", "5", "Unable to Judge"],
                                         label=f"Response A - {crit_score['text']}",
                                         interactive=True,
-                                        elem_classes="criteria-radio-label")
                 with gr.Column(scale=1):
                     rating_b = gr.Radio(choices=sorted(crit_score["scores"]),  # ["1", "2", "3", "4", "5", "Unable to Judge"],
                                         label=f"Response B - {crit_score['text']}",
                                         interactive=True,
-                                        elem_classes="criteria-radio-label")
             # Add clear button and wire up the restrictions
             with gr.Row():

     if key in tool_database_labels_raw
 }
 # Define the six evaluation criteria as a list of dictionaries.
 criteria = [
     {
         ]
     },
     {
+        "label": "Helpfulness of rationale",
         "text": "Is the model’s rationale helpful in determining whether the answer is correct?",
         "scores": [
             "1 No usable rationale. ",
         )
     },
     {
+        "label": "Helpfulness of rationale",
         "text": (
             "Which response offers a clearer, more detailed rationale that genuinely aids you in judging whether the answer is correct?"
         )
 # --- Calculate progress information ---
+def calculate_progress_info(progress_state, remaining_count=None):
     """
     Calculate progress information for pairwise comparisons.
+    Args:
+        progress_state: The current progress state (should contain remaining_count if available)
+        remaining_count: Optional remaining count (deprecated, use progress_state['remaining_count'] instead)
     Returns:
         dict: Contains progress information including:
             - pairwise_completed: number of completed pairwise comparisons
     # Calculate remaining
     pairwise_remaining = total_pairs - pairwise_done
+    # Get remaining_count from progress_state (preferred) or parameter (fallback)
+    remaining_count_to_use = progress_state.get('remaining_count', remaining_count)
+    # Create progress text - show remaining questions if remaining_count is available
+    if remaining_count_to_use is not None and total_pairs > 0:
+        num_remaining_questions = remaining_count_to_use // total_pairs
+        pairwise_progress_text = f"Current Question Evaluation Progress: {num_remaining_questions} question(s) remaining to evaluate"
+        # pairwise_progress_text = f"Current Question Evaluation Progress: {pairwise_done}/{total_pairs} pairs completed ({num_remaining_questions} question(s) remaining to evaluate)"
+    else:
+        pairwise_progress_text = f"Current Question Evaluation Progress: {pairwise_done}/{total_pairs} pairs completed ({pairwise_remaining} remaining)"
     return {
         'pairwise_completed': pairwise_done,
     else:
         data_subset_state["reference_answer"] = _create_reference_answer_component(None, include_correct_answer)
+    # Store remaining count in progress_state for progress display
+    progress_state['remaining_count'] = len(full_question_ids_list)
     # Create the user info object (update question_id if not already set)
     if return_user_info:
         updated_user_info = user_info.copy()
     prompt_html = (
         f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; '
         f'padding: 10px; border-radius: 5px; color: black;">'
+        f'<strong>Question:</strong> {data_subset_state["question"]}</div>'
     )
     chat_a_answer = gr.Chatbot(
         value=chat_A_answer,
         value=chat_A_reasoning,
         type="messages",
         height=300,
+        label="Model A Reasoning - Rationale",
         show_copy_button=False,
         show_label=True,
         render_markdown=True,
         value=chat_B_reasoning,
         type="messages",
         height=300,
+        label="Model B Reasoning - Rationale",
         show_copy_button=False,
         show_label=True,
         render_markdown=True,
 def advance_workflow(progress_state, data_subset_state, current_pairwise=None, current_scoring=None):
     """
     Unified workflow manager that handles all state transitions and UI updates.
+    Args:
+        progress_state: Current progress state (should contain remaining_count if available)
+        data_subset_state: Current data subset state
+        current_pairwise: Current pairwise comparison values (for validation)
+        current_scoring: Current scoring values (for validation)
     """
     # print(f"Advance workflow called, previous mode: {progress_state.get('mode')}")
     # print(progress_state)
     # Calculate progress and show info message
     num_remaining_questions = remaining_count // len(progress_state['all_pairs'])
+    gr.Info(f"The evaluation has been submitted. You are about to evaluate the next question. {num_remaining_questions} question(s) remaining to evaluate.")
+    # Store remaining count in progress_state for progress display
+    progress_state['remaining_count'] = remaining_count
     # Use advance_workflow to get ALL UI updates for new question
     ui_updates = advance_workflow(progress_state, data_subset_state)
 .short-btn { min-width: 80px !important; max-width: 120px !important; width: 100px !important; padding-left: 4px !important; padding-right: 4px !important; }
 .light-stop-btn { background-color: #ffcccc !important; color: #b30000 !important; border-color: #ffcccc !important; }
+.criteria-radio-score-label [role="radiogroup"],
+.criteria-radio-score-label .gr-radio-group,
+.criteria-radio-score-label .flex {
+    display: flex !important;
+    flex-direction: column !important;
+    gap: 4px !important;                 /* 行间距，可按需调整 */
+}
+/* 更具体的选择器来确保垂直布局 */
+.criteria-radio-score-label fieldset {
+    display: flex !important;
+    flex-direction: column !important;
+    gap: 4px !important;
+}
+.criteria-radio-score-label .wrap {
+    display: flex !important;
+    flex-direction: column !important;
+    gap: 4px !important;
+}
+/* 确保每个单选按钮选项垂直排列 */
+.criteria-radio-score-label label {
+    display: block !important;
+    margin-bottom: 4px !important;
+}
 """
 with gr.Blocks(css=centered_col_css) as demo:
         gr.Markdown("""By clicking 'Next' below, you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
         """)
+        # gr.Markdown("""
+        #         ## Instructions:
+        #         Please review these instructions and enter your information to begin:
+        #         - Each session requires at least 5-10 minutes per question.
+        #         - You can evaluate multiple questions; you will not repeat evaluations.
+        #         - For each question, compare responses from two models and rate them (scale: 1-5).
+        #         - If a question is unclear or irrelevant to biomedicine, click the RED BUTTON at the top of the comparison page.
+        #         - Use the Back and Next buttons to edit responses before submission.
+        #         - Use the Home Page button to return to the homepage; progress will save but not submit.
+        #         - Submit answers to the current question before moving to the next.
+        #         - You can pause between questions and return later; ensure current answers are submitted to save them.
+        #     """)
+        # with open("anatomyofAgentResponse.jpg", "rb") as image_file:
+        #     img = Image.open(image_file)
+        #     new_size = (int(img.width * 0.5), int(img.height * 0.5))
+        #     img = img.resize(new_size, Image.LANCZOS)
+        #     buffer = io.BytesIO()
+        #     img.save(buffer, format="PNG")
+        #     encoded_string = base64.b64encode(
+        #         buffer.getvalue()).decode("utf-8")
+        # image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
+        # ReasoningTraceExampleHTML = f"""
+        #     <div>
+        #         {image_html}
+        #     </div>
+        #     """
+        # gr.HTML(ReasoningTraceExampleHTML)
     # Page 1: Pairwise Comparison.
     with gr.Column(visible=False) as page1:
+        with gr.Accordion("Instructions", open=False):
+            gr.Markdown("""
+                    ## Instructions:
+                    Please review these instructions and enter your information to begin:
+                    - Each session requires at least 5-10 minutes per question.
+                    - You can evaluate multiple questions; you will not repeat evaluations.
+                    - For each question, compare responses from two models and rate them (scale: 1-5).
+                    - If a question is unclear or irrelevant to biomedicine, click the RED BUTTON at the top of the comparison page.
+                    - Use the Back and Next buttons to edit responses before submission.
+                    - Use the Home Page button to return to the homepage; progress will save but not submit.
+                    - Submit answers to the current question before moving to the next.
+                    - You can pause between questions and return later; ensure current answers are submitted to save them.
+                """)
         # Make the number controlled by question indexing!
         pairwise_header = gr.Markdown("## Part 1/2: Pairwise Comparison")
         gr.Markdown("")
                     value=[],
                     type="messages",
                     height=300,
+                    label="Model A Reasoning - Rationale",
                     show_copy_button=False,
                     show_label=True,
                     render_markdown=True,
                     value=[],
                     type="messages",
                     height=300,
+                    label="Model B Reasoning - Rationale",
                     show_copy_button=False,
                     show_label=True,
                     render_markdown=True,
                     rating_a = gr.Radio(choices=sorted(crit_score["scores"]),  # ["1", "2", "3", "4", "5", "Unable to Judge"],
                                         label=f"Response A - {crit_score['text']}",
                                         interactive=True,
+                                        elem_classes="criteria-radio-score-label")
                 with gr.Column(scale=1):
                     rating_b = gr.Radio(choices=sorted(crit_score["scores"]),  # ["1", "2", "3", "4", "5", "Unable to Judge"],
                                         label=f"Response B - {crit_score['text']}",
                                         interactive=True,
+                                        elem_classes="criteria-radio-score-label")
             # Add clear button and wire up the restrictions
             with gr.Row():

utils.py CHANGED Viewed

@@ -248,14 +248,14 @@ def format_chat(response, tool_database_labels):
             # Clear after rendering
             last_tool_calls = []
-    if chat_history:
-        last_msg = chat_history[-1]
-        if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content:
-            # Find the first assistant message
-            for msg in chat_history:
-                if msg.role == "assistant" and isinstance(msg.content, str):
-                    msg.content = "**Reasoning:**\n" + msg.content
-                    break
     if chat_history:
         last_msg = chat_history[-1]
         if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content:

             # Clear after rendering
             last_tool_calls = []
+    # if chat_history:
+    #     last_msg = chat_history[-1]
+    #     if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content:
+    #         # Find the first assistant message
+    #         for msg in chat_history:
+    #             if msg.role == "assistant" and isinstance(msg.content, str):
+    #                 msg.content = "**Reasoning:**\n" + msg.content
+    #                 break
     if chat_history:
         last_msg = chat_history[-1]
         if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content: