Spaces:
Sleeping
Sleeping
update
Browse files
README.md
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
emoji: 🌍
|
4 |
colorFrom: yellow
|
5 |
colorTo: indigo
|
@@ -9,7 +9,7 @@ app_file: app.py
|
|
9 |
pinned: false
|
10 |
---
|
11 |
|
12 |
-
The
|
13 |
|
14 |
---
|
15 |
|
|
|
1 |
---
|
2 |
+
title: TxAgent RAO Evaluation
|
3 |
emoji: 🌍
|
4 |
colorFrom: yellow
|
5 |
colorTo: indigo
|
|
|
9 |
pinned: false
|
10 |
---
|
11 |
|
12 |
+
The TxAgent Rare-as-one Evaluation Portal is a Gradio-based web application designed to facilitate the human evaluation of TxAgent's responses compared to other models. Users log in with their credentials, receive questions relevant to their expertise, and then perform pairwise comparisons and detailed ratings of model responses.
|
13 |
|
14 |
---
|
15 |
|
app.py
CHANGED
@@ -60,7 +60,6 @@ tool_database_labels = {
|
|
60 |
if key in tool_database_labels_raw
|
61 |
}
|
62 |
|
63 |
-
# Define the six evaluation criteria as a list of dictionaries.
|
64 |
# Define the six evaluation criteria as a list of dictionaries.
|
65 |
criteria = [
|
66 |
{
|
@@ -76,7 +75,7 @@ criteria = [
|
|
76 |
]
|
77 |
},
|
78 |
{
|
79 |
-
"label": "
|
80 |
"text": "Is the model’s rationale helpful in determining whether the answer is correct?",
|
81 |
"scores": [
|
82 |
"1 No usable rationale. ",
|
@@ -170,7 +169,7 @@ criteria_for_comparison = [
|
|
170 |
)
|
171 |
},
|
172 |
{
|
173 |
-
"label": "
|
174 |
"text": (
|
175 |
"Which response offers a clearer, more detailed rationale that genuinely aids you in judging whether the answer is correct?"
|
176 |
)
|
@@ -355,10 +354,14 @@ def validate_required_fields(name, email, evaluator_id, specialty_dd, years_exp_
|
|
355 |
|
356 |
|
357 |
# --- Calculate progress information ---
|
358 |
-
def calculate_progress_info(progress_state):
|
359 |
"""
|
360 |
Calculate progress information for pairwise comparisons.
|
361 |
|
|
|
|
|
|
|
|
|
362 |
Returns:
|
363 |
dict: Contains progress information including:
|
364 |
- pairwise_completed: number of completed pairwise comparisons
|
@@ -385,8 +388,16 @@ def calculate_progress_info(progress_state):
|
|
385 |
# Calculate remaining
|
386 |
pairwise_remaining = total_pairs - pairwise_done
|
387 |
|
388 |
-
#
|
389 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
390 |
|
391 |
return {
|
392 |
'pairwise_completed': pairwise_done,
|
@@ -816,6 +827,9 @@ def get_next_eval_question(user_info, our_methods, return_user_info=True, includ
|
|
816 |
else:
|
817 |
data_subset_state["reference_answer"] = _create_reference_answer_component(None, include_correct_answer)
|
818 |
|
|
|
|
|
|
|
819 |
# Create the user info object (update question_id if not already set)
|
820 |
if return_user_info:
|
821 |
updated_user_info = user_info.copy()
|
@@ -875,7 +889,7 @@ def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair):
|
|
875 |
prompt_html = (
|
876 |
f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; '
|
877 |
f'padding: 10px; border-radius: 5px; color: black;">'
|
878 |
-
f'<strong>
|
879 |
)
|
880 |
chat_a_answer = gr.Chatbot(
|
881 |
value=chat_A_answer,
|
@@ -905,7 +919,7 @@ def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair):
|
|
905 |
value=chat_A_reasoning,
|
906 |
type="messages",
|
907 |
height=300,
|
908 |
-
label="Model A Reasoning",
|
909 |
show_copy_button=False,
|
910 |
show_label=True,
|
911 |
render_markdown=True,
|
@@ -917,7 +931,7 @@ def extract_ui_content_by_mode(progress_state, data_subset_state, next_pair):
|
|
917 |
value=chat_B_reasoning,
|
918 |
type="messages",
|
919 |
height=300,
|
920 |
-
label="Model B Reasoning",
|
921 |
show_copy_button=False,
|
922 |
show_label=True,
|
923 |
render_markdown=True,
|
@@ -1043,6 +1057,12 @@ def _apply_rating_restrictions(pairwise_choice, score_a, score_b, include_values
|
|
1043 |
def advance_workflow(progress_state, data_subset_state, current_pairwise=None, current_scoring=None):
|
1044 |
"""
|
1045 |
Unified workflow manager that handles all state transitions and UI updates.
|
|
|
|
|
|
|
|
|
|
|
|
|
1046 |
"""
|
1047 |
# print(f"Advance workflow called, previous mode: {progress_state.get('mode')}")
|
1048 |
# print(progress_state)
|
@@ -1302,7 +1322,10 @@ def submit_pairwise_scoring(progress_state, data_subset_state, user_info, *combi
|
|
1302 |
|
1303 |
# Calculate progress and show info message
|
1304 |
num_remaining_questions = remaining_count // len(progress_state['all_pairs'])
|
1305 |
-
gr.Info(f"You are about to evaluate the next question.
|
|
|
|
|
|
|
1306 |
|
1307 |
# Use advance_workflow to get ALL UI updates for new question
|
1308 |
ui_updates = advance_workflow(progress_state, data_subset_state)
|
@@ -1576,6 +1599,33 @@ centered_col_css = """
|
|
1576 |
.short-btn { min-width: 80px !important; max-width: 120px !important; width: 100px !important; padding-left: 4px !important; padding-right: 4px !important; }
|
1577 |
.light-stop-btn { background-color: #ffcccc !important; color: #b30000 !important; border-color: #ffcccc !important; }
|
1578 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1579 |
"""
|
1580 |
|
1581 |
with gr.Blocks(css=centered_col_css) as demo:
|
@@ -1656,38 +1706,52 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
1656 |
gr.Markdown("""By clicking 'Next' below, you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
|
1657 |
""")
|
1658 |
|
1659 |
-
gr.Markdown("""
|
1660 |
-
|
1661 |
-
|
1662 |
-
|
1663 |
-
|
1664 |
-
|
1665 |
-
|
1666 |
-
|
1667 |
-
|
1668 |
-
|
1669 |
-
|
1670 |
-
|
1671 |
-
|
1672 |
-
with open("anatomyofAgentResponse.jpg", "rb") as image_file:
|
1673 |
-
|
1674 |
-
|
1675 |
-
|
1676 |
-
|
1677 |
-
|
1678 |
-
|
1679 |
-
|
1680 |
-
|
1681 |
-
image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
|
1682 |
-
ReasoningTraceExampleHTML = f"""
|
1683 |
-
|
1684 |
-
|
1685 |
-
|
1686 |
-
|
1687 |
-
gr.HTML(ReasoningTraceExampleHTML)
|
1688 |
|
1689 |
# Page 1: Pairwise Comparison.
|
1690 |
with gr.Column(visible=False) as page1:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1691 |
# Make the number controlled by question indexing!
|
1692 |
pairwise_header = gr.Markdown("## Part 1/2: Pairwise Comparison")
|
1693 |
gr.Markdown("")
|
@@ -1716,7 +1780,7 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
1716 |
value=[],
|
1717 |
type="messages",
|
1718 |
height=300,
|
1719 |
-
label="Model A Reasoning",
|
1720 |
show_copy_button=False,
|
1721 |
show_label=True,
|
1722 |
render_markdown=True,
|
@@ -1742,7 +1806,7 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
1742 |
value=[],
|
1743 |
type="messages",
|
1744 |
height=300,
|
1745 |
-
label="Model B Reasoning",
|
1746 |
show_copy_button=False,
|
1747 |
show_label=True,
|
1748 |
render_markdown=True,
|
@@ -1787,12 +1851,12 @@ with gr.Blocks(css=centered_col_css) as demo:
|
|
1787 |
rating_a = gr.Radio(choices=sorted(crit_score["scores"]), # ["1", "2", "3", "4", "5", "Unable to Judge"],
|
1788 |
label=f"Response A - {crit_score['text']}",
|
1789 |
interactive=True,
|
1790 |
-
elem_classes="criteria-radio-label")
|
1791 |
with gr.Column(scale=1):
|
1792 |
rating_b = gr.Radio(choices=sorted(crit_score["scores"]), # ["1", "2", "3", "4", "5", "Unable to Judge"],
|
1793 |
label=f"Response B - {crit_score['text']}",
|
1794 |
interactive=True,
|
1795 |
-
elem_classes="criteria-radio-label")
|
1796 |
|
1797 |
# Add clear button and wire up the restrictions
|
1798 |
with gr.Row():
|
|
|
60 |
if key in tool_database_labels_raw
|
61 |
}
|
62 |
|
|
|
63 |
# Define the six evaluation criteria as a list of dictionaries.
|
64 |
criteria = [
|
65 |
{
|
|
|
75 |
]
|
76 |
},
|
77 |
{
|
78 |
+
"label": "Helpfulness of rationale",
|
79 |
"text": "Is the model’s rationale helpful in determining whether the answer is correct?",
|
80 |
"scores": [
|
81 |
"1 No usable rationale. ",
|
|
|
169 |
)
|
170 |
},
|
171 |
{
|
172 |
+
"label": "Helpfulness of rationale",
|
173 |
"text": (
|
174 |
"Which response offers a clearer, more detailed rationale that genuinely aids you in judging whether the answer is correct?"
|
175 |
)
|
|
|
354 |
|
355 |
|
356 |
# --- Calculate progress information ---
|
357 |
+
def calculate_progress_info(progress_state, remaining_count=None):
|
358 |
"""
|
359 |
Calculate progress information for pairwise comparisons.
|
360 |
|
361 |
+
Args:
|
362 |
+
progress_state: The current progress state (should contain remaining_count if available)
|
363 |
+
remaining_count: Optional remaining count (deprecated, use progress_state['remaining_count'] instead)
|
364 |
+
|
365 |
Returns:
|
366 |
dict: Contains progress information including:
|
367 |
- pairwise_completed: number of completed pairwise comparisons
|
|
|
388 |
# Calculate remaining
|
389 |
pairwise_remaining = total_pairs - pairwise_done
|
390 |
|
391 |
+
# Get remaining_count from progress_state (preferred) or parameter (fallback)
|
392 |
+
remaining_count_to_use = progress_state.get('remaining_count', remaining_count)
|
393 |
+
|
394 |
+
# Create progress text - show remaining questions if remaining_count is available
|
395 |
+
if remaining_count_to_use is not None and total_pairs > 0:
|
396 |
+
num_remaining_questions = remaining_count_to_use // total_pairs
|
397 |
+
pairwise_progress_text = f"Current Question Evaluation Progress: {num_remaining_questions} question(s) remaining to evaluate"
|
398 |
+
# pairwise_progress_text = f"Current Question Evaluation Progress: {pairwise_done}/{total_pairs} pairs completed ({num_remaining_questions} question(s) remaining to evaluate)"
|
399 |
+
else:
|
400 |
+
pairwise_progress_text = f"Current Question Evaluation Progress: {pairwise_done}/{total_pairs} pairs completed ({pairwise_remaining} remaining)"
|
401 |
|
402 |
return {
|
403 |
'pairwise_completed': pairwise_done,
|
|
|
827 |
else:
|
828 |
data_subset_state["reference_answer"] = _create_reference_answer_component(None, include_correct_answer)
|
829 |
|
830 |
+
# Store remaining count in progress_state for progress display
|
831 |
+
progress_state['remaining_count'] = len(full_question_ids_list)
|
832 |
+
|
833 |
# Create the user info object (update question_id if not already set)
|
834 |
if return_user_info:
|
835 |
updated_user_info = user_info.copy()
|
|
|
889 |
prompt_html = (
|
890 |
f'<div style="background-color: #FFEFD5; border: 2px solid #FF8C00; '
|
891 |
f'padding: 10px; border-radius: 5px; color: black;">'
|
892 |
+
f'<strong>Question:</strong> {data_subset_state["question"]}</div>'
|
893 |
)
|
894 |
chat_a_answer = gr.Chatbot(
|
895 |
value=chat_A_answer,
|
|
|
919 |
value=chat_A_reasoning,
|
920 |
type="messages",
|
921 |
height=300,
|
922 |
+
label="Model A Reasoning - Rationale",
|
923 |
show_copy_button=False,
|
924 |
show_label=True,
|
925 |
render_markdown=True,
|
|
|
931 |
value=chat_B_reasoning,
|
932 |
type="messages",
|
933 |
height=300,
|
934 |
+
label="Model B Reasoning - Rationale",
|
935 |
show_copy_button=False,
|
936 |
show_label=True,
|
937 |
render_markdown=True,
|
|
|
1057 |
def advance_workflow(progress_state, data_subset_state, current_pairwise=None, current_scoring=None):
|
1058 |
"""
|
1059 |
Unified workflow manager that handles all state transitions and UI updates.
|
1060 |
+
|
1061 |
+
Args:
|
1062 |
+
progress_state: Current progress state (should contain remaining_count if available)
|
1063 |
+
data_subset_state: Current data subset state
|
1064 |
+
current_pairwise: Current pairwise comparison values (for validation)
|
1065 |
+
current_scoring: Current scoring values (for validation)
|
1066 |
"""
|
1067 |
# print(f"Advance workflow called, previous mode: {progress_state.get('mode')}")
|
1068 |
# print(progress_state)
|
|
|
1322 |
|
1323 |
# Calculate progress and show info message
|
1324 |
num_remaining_questions = remaining_count // len(progress_state['all_pairs'])
|
1325 |
+
gr.Info(f"The evaluation has been submitted. You are about to evaluate the next question. {num_remaining_questions} question(s) remaining to evaluate.")
|
1326 |
+
|
1327 |
+
# Store remaining count in progress_state for progress display
|
1328 |
+
progress_state['remaining_count'] = remaining_count
|
1329 |
|
1330 |
# Use advance_workflow to get ALL UI updates for new question
|
1331 |
ui_updates = advance_workflow(progress_state, data_subset_state)
|
|
|
1599 |
.short-btn { min-width: 80px !important; max-width: 120px !important; width: 100px !important; padding-left: 4px !important; padding-right: 4px !important; }
|
1600 |
.light-stop-btn { background-color: #ffcccc !important; color: #b30000 !important; border-color: #ffcccc !important; }
|
1601 |
|
1602 |
+
.criteria-radio-score-label [role="radiogroup"],
|
1603 |
+
.criteria-radio-score-label .gr-radio-group,
|
1604 |
+
.criteria-radio-score-label .flex {
|
1605 |
+
display: flex !important;
|
1606 |
+
flex-direction: column !important;
|
1607 |
+
gap: 4px !important; /* 行间距,可按需调整 */
|
1608 |
+
}
|
1609 |
+
|
1610 |
+
/* 更具体的选择器来确保垂直布局 */
|
1611 |
+
.criteria-radio-score-label fieldset {
|
1612 |
+
display: flex !important;
|
1613 |
+
flex-direction: column !important;
|
1614 |
+
gap: 4px !important;
|
1615 |
+
}
|
1616 |
+
|
1617 |
+
.criteria-radio-score-label .wrap {
|
1618 |
+
display: flex !important;
|
1619 |
+
flex-direction: column !important;
|
1620 |
+
gap: 4px !important;
|
1621 |
+
}
|
1622 |
+
|
1623 |
+
/* 确保每个单选按钮选项垂直排列 */
|
1624 |
+
.criteria-radio-score-label label {
|
1625 |
+
display: block !important;
|
1626 |
+
margin-bottom: 4px !important;
|
1627 |
+
}
|
1628 |
+
|
1629 |
"""
|
1630 |
|
1631 |
with gr.Blocks(css=centered_col_css) as demo:
|
|
|
1706 |
gr.Markdown("""By clicking 'Next' below, you will start the study, with your progress saved after submitting each question. If you have any other questions or concerns, please contact us directly. Thank you for your participation!
|
1707 |
""")
|
1708 |
|
1709 |
+
# gr.Markdown("""
|
1710 |
+
# ## Instructions:
|
1711 |
+
# Please review these instructions and enter your information to begin:
|
1712 |
+
|
1713 |
+
# - Each session requires at least 5-10 minutes per question.
|
1714 |
+
# - You can evaluate multiple questions; you will not repeat evaluations.
|
1715 |
+
# - For each question, compare responses from two models and rate them (scale: 1-5).
|
1716 |
+
# - If a question is unclear or irrelevant to biomedicine, click the RED BUTTON at the top of the comparison page.
|
1717 |
+
# - Use the Back and Next buttons to edit responses before submission.
|
1718 |
+
# - Use the Home Page button to return to the homepage; progress will save but not submit.
|
1719 |
+
# - Submit answers to the current question before moving to the next.
|
1720 |
+
# - You can pause between questions and return later; ensure current answers are submitted to save them.
|
1721 |
+
# """)
|
1722 |
+
# with open("anatomyofAgentResponse.jpg", "rb") as image_file:
|
1723 |
+
# img = Image.open(image_file)
|
1724 |
+
# new_size = (int(img.width * 0.5), int(img.height * 0.5))
|
1725 |
+
# img = img.resize(new_size, Image.LANCZOS)
|
1726 |
+
# buffer = io.BytesIO()
|
1727 |
+
# img.save(buffer, format="PNG")
|
1728 |
+
# encoded_string = base64.b64encode(
|
1729 |
+
# buffer.getvalue()).decode("utf-8")
|
1730 |
+
|
1731 |
+
# image_html = f'<div style="text-align:center;"><img src="data:image/png;base64,{encoded_string}" alt="Your Image"></div>'
|
1732 |
+
# ReasoningTraceExampleHTML = f"""
|
1733 |
+
# <div>
|
1734 |
+
# {image_html}
|
1735 |
+
# </div>
|
1736 |
+
# """
|
1737 |
+
# gr.HTML(ReasoningTraceExampleHTML)
|
1738 |
|
1739 |
# Page 1: Pairwise Comparison.
|
1740 |
with gr.Column(visible=False) as page1:
|
1741 |
+
with gr.Accordion("Instructions", open=False):
|
1742 |
+
gr.Markdown("""
|
1743 |
+
## Instructions:
|
1744 |
+
Please review these instructions and enter your information to begin:
|
1745 |
+
|
1746 |
+
- Each session requires at least 5-10 minutes per question.
|
1747 |
+
- You can evaluate multiple questions; you will not repeat evaluations.
|
1748 |
+
- For each question, compare responses from two models and rate them (scale: 1-5).
|
1749 |
+
- If a question is unclear or irrelevant to biomedicine, click the RED BUTTON at the top of the comparison page.
|
1750 |
+
- Use the Back and Next buttons to edit responses before submission.
|
1751 |
+
- Use the Home Page button to return to the homepage; progress will save but not submit.
|
1752 |
+
- Submit answers to the current question before moving to the next.
|
1753 |
+
- You can pause between questions and return later; ensure current answers are submitted to save them.
|
1754 |
+
""")
|
1755 |
# Make the number controlled by question indexing!
|
1756 |
pairwise_header = gr.Markdown("## Part 1/2: Pairwise Comparison")
|
1757 |
gr.Markdown("")
|
|
|
1780 |
value=[],
|
1781 |
type="messages",
|
1782 |
height=300,
|
1783 |
+
label="Model A Reasoning - Rationale",
|
1784 |
show_copy_button=False,
|
1785 |
show_label=True,
|
1786 |
render_markdown=True,
|
|
|
1806 |
value=[],
|
1807 |
type="messages",
|
1808 |
height=300,
|
1809 |
+
label="Model B Reasoning - Rationale",
|
1810 |
show_copy_button=False,
|
1811 |
show_label=True,
|
1812 |
render_markdown=True,
|
|
|
1851 |
rating_a = gr.Radio(choices=sorted(crit_score["scores"]), # ["1", "2", "3", "4", "5", "Unable to Judge"],
|
1852 |
label=f"Response A - {crit_score['text']}",
|
1853 |
interactive=True,
|
1854 |
+
elem_classes="criteria-radio-score-label")
|
1855 |
with gr.Column(scale=1):
|
1856 |
rating_b = gr.Radio(choices=sorted(crit_score["scores"]), # ["1", "2", "3", "4", "5", "Unable to Judge"],
|
1857 |
label=f"Response B - {crit_score['text']}",
|
1858 |
interactive=True,
|
1859 |
+
elem_classes="criteria-radio-score-label")
|
1860 |
|
1861 |
# Add clear button and wire up the restrictions
|
1862 |
with gr.Row():
|
utils.py
CHANGED
@@ -248,14 +248,14 @@ def format_chat(response, tool_database_labels):
|
|
248 |
# Clear after rendering
|
249 |
last_tool_calls = []
|
250 |
|
251 |
-
if chat_history:
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
if chat_history:
|
260 |
last_msg = chat_history[-1]
|
261 |
if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content:
|
|
|
248 |
# Clear after rendering
|
249 |
last_tool_calls = []
|
250 |
|
251 |
+
# if chat_history:
|
252 |
+
# last_msg = chat_history[-1]
|
253 |
+
# if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content:
|
254 |
+
# # Find the first assistant message
|
255 |
+
# for msg in chat_history:
|
256 |
+
# if msg.role == "assistant" and isinstance(msg.content, str):
|
257 |
+
# msg.content = "**Reasoning:**\n" + msg.content
|
258 |
+
# break
|
259 |
if chat_history:
|
260 |
last_msg = chat_history[-1]
|
261 |
if isinstance(last_msg.content, str) and "[FinalAnswer]" in last_msg.content:
|