shgao committed on
Commit
3622597
·
1 Parent(s): 495ce2c
Files changed (3) hide show
  1. README.md +2 -2
  2. app.py +107 -43
  3. utils.py +8 -8
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: CZI Evaluation
3
  emoji: 🌍
4
  colorFrom: yellow
5
  colorTo: indigo
@@ -9,7 +9,7 @@ app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- The CZI Evaluation Portal is a Gradio-based web application designed to facilitate the human evaluation of TxAgent's responses compared to other models. Users log in with their credentials, receive questions relevant to their expertise, and then perform pairwise comparisons and detailed ratings of model responses.
13
 
14
  ---
15
 
 
1
  ---
2
+ title: TxAgent RAO Evaluation
3
  emoji: 🌍
4
  colorFrom: yellow
5
  colorTo: indigo
 
9
  pinned: false
10
  ---
11
 
12
+ The TxAgent Rare-as-one Evaluation Portal is a Gradio-based web application designed to facilitate the human evaluation of TxAgent's responses compared to other models. Users log in with their credentials, receive questions relevant to their expertise, and then perform pairwise comparisons and detailed ratings of model responses.
13
 
14
  ---
15
 
app.py CHANGED
@@ -60,7 +60,6 @@ tool_database_labels = {
60
  if key in tool_database_labels_raw
61
  }
62
 
63
- # Define the six evaluation criteria as a list of dictionaries.
64
  # Define the six evaluation criteria as a list of dictionaries.
65
  criteria = [
66
  {
@@ -76,7 +75,7 @@ criteria = [
76
  ]
77
  },
78
  {
79
- "label": "Justification helpfulness",
80
  "text": "Is the model’s rationale helpful in determining whether the answer is correct?",
81
  "scores": [
82
  "1 No usable rationale. ",
@@ -170,7 +169,7 @@ criteria_for_comparison = [
170
  )
171
  },
172
  {
173
- "label": "Justification helpfulness",
174
  "text": (
175
  "Which response offers a clearer, more detailed rationale that genuinely aids you in judging whether the answer is correct?"
176
  )
@@ -355,10 +354,14 @@ def validate_required_fields(name, email, evaluator_id, specialty_dd, years_exp_
355
 
356
 
357
  # --- Calculate progress information ---
358
- def calculate_progress_info(progress_state):
359
  """
360
  Calculate progress information for pairwise comparisons.
361
 
 
 
 
 
362
  Returns:
363
  dict: Contains progress information including:
364
  - pairwise_completed: number of completed pairwise comparisons
@@ -385,8 +388,16 @@ def calculate_progress_info(progress_state):
385
  # Calculate remaining
386
  pairwise_remaining = total_pairs - pairwise_done
387
 
388
- # Create progress text
389
- pairwise_progress_text = f"Currrent Question Evaluation Progress: {pairwise_done}/{total_pairs} pairs completed ({pairwise_remaining} remaining)"
 
 
 
 
 
 
 
 
390
 
391
  return {
392
  'pairwise_completed': pairwise_done,
@@ -816,6 +827,9 @@ def get_next_eval_question(user_info, our_methods, return_user_info=True, includ
816
  else:
817
  data_subset_state["reference_answer"] = _create_reference_answer_component(None, include_correct_answer)
818
 
 
 
 
819
  # Create the user info object (update question_id if not already set)
820
  if return_user_info:
821
  updated_user_info = user_info.copy()
@@ -875,7 +889,7 @@ def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair):
875
  prompt_html = (
876
  f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; '
877
  f'padding: 10px; border-radius: 5px; color: black;">'
878
- f'<strong>Prompt:</strong> {data_subset_state["question"]}</div>'
879
  )
880
  chat_a_answer = gr.Chatbot(
881
  value=chat_A_answer,
@@ -905,7 +919,7 @@ def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair):
905
  value=chat_A_reasoning,
906
  type="messages",
907
  height=300,
908
- label="Model A Reasoning",
909
  show_copy_button=False,
910
  show_label=True,
911
  render_markdown=True,
@@ -917,7 +931,7 @@ def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair):
917
  value=chat_B_reasoning,
918
  type="messages",
919
  height=300,
920
- label="Model B Reasoning",
921
  show_copy_button=False,
922
  show_label=True,
923
  render_markdown=True,
@@ -1043,6 +1057,12 @@ def _apply_rating_restrictions(pairwise_choice, score_a, score_b, include_values
1043
  def advance_workflow(progress_state, data_subset_state, current_pairwise=None, current_scoring=None):
1044
  """
1045
  Unified workflow manager that handles all state transitions and UI updates.
 
 
 
 
 
 
1046
  """
1047
  # print(f"Advance workflow called, previous mode: {progress_state.get('mode')}")
1048
  # print(progress_state)
@@ -1302,7 +1322,10 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *combi
1302
 
1303
  # Calculate progress and show info message
1304
  num_remaining_questions = remaining_count // len(progress_state['all_pairs'])
1305
- gr.Info(f"You are about to evaluate the next question. You have {num_remaining_questions} question(s) remaining to evaluate.")
 
 
 
1306
 
1307
  # Use advance_workflow to get ALL UI updates for new question
1308
  ui_updates = advance_workflow(progress_state, data_subset_state)
@@ -1576,6 +1599,33 @@ centered_col_css = """
1576
  .short-btn { min-width: 80px !important; max-width: 120px !important; width: 100px !important; padding-left: 4px !important; padding-right: 4px !important; }
1577
  .light-stop-btn { background-color: #ffcccc !important; color: #b30000 !important; border-color: #ffcccc !important; }
1578
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1579
  """
1580
 
1581
  with gr.Blocks(css=centered_col_css) as demo:
@@ -1656,38 +1706,52 @@ with gr.Blocks(css=centered_col_css) as demo:
1656
  gr.Markdown("""By clicking 'Next' below, you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
1657
  """)
1658
 
1659
- gr.Markdown("""
1660
- ## Instructions:
1661
- Please review these instructions and enter your information to begin:
1662
-
1663
- - Each session requires at least 5-10 minutes per question.
1664
- - You can evaluate multiple questions; you will not repeat evaluations.
1665
- - For each question, compare responses from two models and rate them (scale: 1-5).
1666
- - If a question is unclear or irrelevant to biomedicine, click the RED BUTTON at the top of the comparison page.
1667
- - Use the Back and Next buttons to edit responses before submission.
1668
- - Use the Home Page button to return to the homepage; progress will save but not submit.
1669
- - Submit answers to the current question before moving to the next.
1670
- - You can pause between questions and return later; ensure current answers are submitted to save them.
1671
- """)
1672
- with open("anatomyofAgentResponse.jpg", "rb") as image_file:
1673
- img = Image.open(image_file)
1674
- new_size = (int(img.width * 0.5), int(img.height * 0.5))
1675
- img = img.resize(new_size, Image.LANCZOS)
1676
- buffer = io.BytesIO()
1677
- img.save(buffer, format="PNG")
1678
- encoded_string = base64.b64encode(
1679
- buffer.getvalue()).decode("utf-8")
1680
-
1681
- image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
1682
- ReasoningTraceExampleHTML = f"""
1683
- <div>
1684
- {image_html}
1685
- </div>
1686
- """
1687
- gr.HTML(ReasoningTraceExampleHTML)
1688
 
1689
  # Page 1: Pairwise Comparison.
1690
  with gr.Column(visible=False) as page1:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1691
  # Make the number controlled by question indexing!
1692
  pairwise_header = gr.Markdown("## Part 1/2: Pairwise Comparison")
1693
  gr.Markdown("")
@@ -1716,7 +1780,7 @@ with gr.Blocks(css=centered_col_css) as demo:
1716
  value=[],
1717
  type="messages",
1718
  height=300,
1719
- label="Model A Reasoning",
1720
  show_copy_button=False,
1721
  show_label=True,
1722
  render_markdown=True,
@@ -1742,7 +1806,7 @@ with gr.Blocks(css=centered_col_css) as demo:
1742
  value=[],
1743
  type="messages",
1744
  height=300,
1745
- label="Model B Reasoning",
1746
  show_copy_button=False,
1747
  show_label=True,
1748
  render_markdown=True,
@@ -1787,12 +1851,12 @@ with gr.Blocks(css=centered_col_css) as demo:
1787
  rating_a = gr.Radio(choices=sorted(crit_score["scores"]), # ["1", "2", "3", "4", "5", "Unable to Judge"],
1788
  label=f"Response A - {crit_score['text']}",
1789
  interactive=True,
1790
- elem_classes="criteria-radio-label")
1791
  with gr.Column(scale=1):
1792
  rating_b = gr.Radio(choices=sorted(crit_score["scores"]), # ["1", "2", "3", "4", "5", "Unable to Judge"],
1793
  label=f"Response B - {crit_score['text']}",
1794
  interactive=True,
1795
- elem_classes="criteria-radio-label")
1796
 
1797
  # Add clear button and wire up the restrictions
1798
  with gr.Row():
 
60
  if key in tool_database_labels_raw
61
  }
62
 
 
63
  # Define the six evaluation criteria as a list of dictionaries.
64
  criteria = [
65
  {
 
75
  ]
76
  },
77
  {
78
+ "label": "Helpfulness of rationale",
79
  "text": "Is the model’s rationale helpful in determining whether the answer is correct?",
80
  "scores": [
81
  "1 No usable rationale. ",
 
169
  )
170
  },
171
  {
172
+ "label": "Helpfulness of rationale",
173
  "text": (
174
  "Which response offers a clearer, more detailed rationale that genuinely aids you in judging whether the answer is correct?"
175
  )
 
354
 
355
 
356
  # --- Calculate progress information ---
357
+ def calculate_progress_info(progress_state, remaining_count=None):
358
  """
359
  Calculate progress information for pairwise comparisons.
360
 
361
+ Args:
362
+ progress_state: The current progress state (should contain remaining_count if available)
363
+ remaining_count: Optional remaining count (deprecated, use progress_state['remaining_count'] instead)
364
+
365
  Returns:
366
  dict: Contains progress information including:
367
  - pairwise_completed: number of completed pairwise comparisons
 
388
  # Calculate remaining
389
  pairwise_remaining = total_pairs - pairwise_done
390
 
391
+ # Get remaining_count from progress_state (preferred) or parameter (fallback)
392
+ remaining_count_to_use = progress_state.get('remaining_count', remaining_count)
393
+
394
+ # Create progress text - show remaining questions if remaining_count is available
395
+ if remaining_count_to_use is not None and total_pairs > 0:
396
+ num_remaining_questions = remaining_count_to_use // total_pairs
397
+ pairwise_progress_text = f"Current Question Evaluation Progress: {num_remaining_questions} question(s) remaining to evaluate"
398
+ # pairwise_progress_text = f"Current Question Evaluation Progress: {pairwise_done}/{total_pairs} pairs completed ({num_remaining_questions} question(s) remaining to evaluate)"
399
+ else:
400
+ pairwise_progress_text = f"Current Question Evaluation Progress: {pairwise_done}/{total_pairs} pairs completed ({pairwise_remaining} remaining)"
401
 
402
  return {
403
  'pairwise_completed': pairwise_done,
 
827
  else:
828
  data_subset_state["reference_answer"] = _create_reference_answer_component(None, include_correct_answer)
829
 
830
+ # Store remaining count in progress_state for progress display
831
+ progress_state['remaining_count'] = len(full_question_ids_list)
832
+
833
  # Create the user info object (update question_id if not already set)
834
  if return_user_info:
835
  updated_user_info = user_info.copy()
 
889
  prompt_html = (
890
  f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; '
891
  f'padding: 10px; border-radius: 5px; color: black;">'
892
+ f'<strong>Question:</strong> {data_subset_state["question"]}</div>'
893
  )
894
  chat_a_answer = gr.Chatbot(
895
  value=chat_A_answer,
 
919
  value=chat_A_reasoning,
920
  type="messages",
921
  height=300,
922
+ label="Model A Reasoning - Rationale",
923
  show_copy_button=False,
924
  show_label=True,
925
  render_markdown=True,
 
931
  value=chat_B_reasoning,
932
  type="messages",
933
  height=300,
934
+ label="Model B Reasoning - Rationale",
935
  show_copy_button=False,
936
  show_label=True,
937
  render_markdown=True,
 
1057
  def advance_workflow(progress_state, data_subset_state, current_pairwise=None, current_scoring=None):
1058
  """
1059
  Unified workflow manager that handles all state transitions and UI updates.
1060
+
1061
+ Args:
1062
+ progress_state: Current progress state (should contain remaining_count if available)
1063
+ data_subset_state: Current data subset state
1064
+ current_pairwise: Current pairwise comparison values (for validation)
1065
+ current_scoring: Current scoring values (for validation)
1066
  """
1067
  # print(f"Advance workflow called, previous mode: {progress_state.get('mode')}")
1068
  # print(progress_state)
 
1322
 
1323
  # Calculate progress and show info message
1324
  num_remaining_questions = remaining_count // len(progress_state['all_pairs'])
1325
+ gr.Info(f"The evaluation has been submitted. You are about to evaluate the next question. {num_remaining_questions} question(s) remaining to evaluate.")
1326
+
1327
+ # Store remaining count in progress_state for progress display
1328
+ progress_state['remaining_count'] = remaining_count
1329
 
1330
  # Use advance_workflow to get ALL UI updates for new question
1331
  ui_updates = advance_workflow(progress_state, data_subset_state)
 
1599
  .short-btn { min-width: 80px !important; max-width: 120px !important; width: 100px !important; padding-left: 4px !important; padding-right: 4px !important; }
1600
  .light-stop-btn { background-color: #ffcccc !important; color: #b30000 !important; border-color: #ffcccc !important; }
1601
 
1602
+ .criteria-radio-score-label [role="radiogroup"],
1603
+ .criteria-radio-score-label .gr-radio-group,
1604
+ .criteria-radio-score-label .flex {
1605
+ display: flex !important;
1606
+ flex-direction: column !important;
1607
+ gap: 4px !important; /* 行间距,可按需调整 */
1608
+ }
1609
+
1610
+ /* 更具体的选择器来确保垂直布局 */
1611
+ .criteria-radio-score-label fieldset {
1612
+ display: flex !important;
1613
+ flex-direction: column !important;
1614
+ gap: 4px !important;
1615
+ }
1616
+
1617
+ .criteria-radio-score-label .wrap {
1618
+ display: flex !important;
1619
+ flex-direction: column !important;
1620
+ gap: 4px !important;
1621
+ }
1622
+
1623
+ /* 确保每个单选按钮选项垂直排列 */
1624
+ .criteria-radio-score-label label {
1625
+ display: block !important;
1626
+ margin-bottom: 4px !important;
1627
+ }
1628
+
1629
  """
1630
 
1631
  with gr.Blocks(css=centered_col_css) as demo:
 
1706
  gr.Markdown("""By clicking 'Next' below, you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
1707
  """)
1708
 
1709
+ # gr.Markdown("""
1710
+ # ## Instructions:
1711
+ # Please review these instructions and enter your information to begin:
1712
+
1713
+ # - Each session requires at least 5-10 minutes per question.
1714
+ # - You can evaluate multiple questions; you will not repeat evaluations.
1715
+ # - For each question, compare responses from two models and rate them (scale: 1-5).
1716
+ # - If a question is unclear or irrelevant to biomedicine, click the RED BUTTON at the top of the comparison page.
1717
+ # - Use the Back and Next buttons to edit responses before submission.
1718
+ # - Use the Home Page button to return to the homepage; progress will save but not submit.
1719
+ # - Submit answers to the current question before moving to the next.
1720
+ # - You can pause between questions and return later; ensure current answers are submitted to save them.
1721
+ # """)
1722
+ # with open("anatomyofAgentResponse.jpg", "rb") as image_file:
1723
+ # img = Image.open(image_file)
1724
+ # new_size = (int(img.width * 0.5), int(img.height * 0.5))
1725
+ # img = img.resize(new_size, Image.LANCZOS)
1726
+ # buffer = io.BytesIO()
1727
+ # img.save(buffer, format="PNG")
1728
+ # encoded_string = base64.b64encode(
1729
+ # buffer.getvalue()).decode("utf-8")
1730
+
1731
+ # image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
1732
+ # ReasoningTraceExampleHTML = f"""
1733
+ # <div>
1734
+ # {image_html}
1735
+ # </div>
1736
+ # """
1737
+ # gr.HTML(ReasoningTraceExampleHTML)
1738
 
1739
  # Page 1: Pairwise Comparison.
1740
  with gr.Column(visible=False) as page1:
1741
+ with gr.Accordion("Instructions", open=False):
1742
+ gr.Markdown("""
1743
+ ## Instructions:
1744
+ Please review these instructions and enter your information to begin:
1745
+
1746
+ - Each session requires at least 5-10 minutes per question.
1747
+ - You can evaluate multiple questions; you will not repeat evaluations.
1748
+ - For each question, compare responses from two models and rate them (scale: 1-5).
1749
+ - If a question is unclear or irrelevant to biomedicine, click the RED BUTTON at the top of the comparison page.
1750
+ - Use the Back and Next buttons to edit responses before submission.
1751
+ - Use the Home Page button to return to the homepage; progress will save but not submit.
1752
+ - Submit answers to the current question before moving to the next.
1753
+ - You can pause between questions and return later; ensure current answers are submitted to save them.
1754
+ """)
1755
  # Make the number controlled by question indexing!
1756
  pairwise_header = gr.Markdown("## Part 1/2: Pairwise Comparison")
1757
  gr.Markdown("")
 
1780
  value=[],
1781
  type="messages",
1782
  height=300,
1783
+ label="Model A Reasoning - Rationale",
1784
  show_copy_button=False,
1785
  show_label=True,
1786
  render_markdown=True,
 
1806
  value=[],
1807
  type="messages",
1808
  height=300,
1809
+ label="Model B Reasoning - Rationale",
1810
  show_copy_button=False,
1811
  show_label=True,
1812
  render_markdown=True,
 
1851
  rating_a = gr.Radio(choices=sorted(crit_score["scores"]), # ["1", "2", "3", "4", "5", "Unable to Judge"],
1852
  label=f"Response A - {crit_score['text']}",
1853
  interactive=True,
1854
+ elem_classes="criteria-radio-score-label")
1855
  with gr.Column(scale=1):
1856
  rating_b = gr.Radio(choices=sorted(crit_score["scores"]), # ["1", "2", "3", "4", "5", "Unable to Judge"],
1857
  label=f"Response B - {crit_score['text']}",
1858
  interactive=True,
1859
+ elem_classes="criteria-radio-score-label")
1860
 
1861
  # Add clear button and wire up the restrictions
1862
  with gr.Row():
utils.py CHANGED
@@ -248,14 +248,14 @@ def format_chat(response, tool_database_labels):
248
  # Clear after rendering
249
  last_tool_calls = []
250
 
251
- if chat_history:
252
- last_msg = chat_history[-1]
253
- if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content:
254
- # Find the first assistant message
255
- for msg in chat_history:
256
- if msg.role == "assistant" and isinstance(msg.content, str):
257
- msg.content = "**Reasoning:**\n" + msg.content
258
- break
259
  if chat_history:
260
  last_msg = chat_history[-1]
261
  if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content:
 
248
  # Clear after rendering
249
  last_tool_calls = []
250
 
251
+ # if chat_history:
252
+ # last_msg = chat_history[-1]
253
+ # if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content:
254
+ # # Find the first assistant message
255
+ # for msg in chat_history:
256
+ # if msg.role == "assistant" and isinstance(msg.content, str):
257
+ # msg.content = "**Reasoning:**\n" + msg.content
258
+ # break
259
  if chat_history:
260
  last_msg = chat_history[-1]
261
  if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content: