update
app.py
CHANGED
@@ -1,6 +1,5 @@
 import gradio as gr
 from gradio_modal import Modal
-from gradio_pdf import PDF
 from huggingface_hub import hf_hub_download, list_repo_files
 import os, csv, datetime, sys
 import json
@@ -15,7 +14,7 @@ import re
 REPO_ID = "agenticx/TxAgentEvalData"
 EVALUATOR_MAP_DICT = "evaluator_map_dict.json"
 TXAGENT_RESULTS_SHEET_BASE_NAME = "TxAgent_Human_Eval_Results_CROWDSOURCED"
-
+our_methods = ['TxAgent-T1-Llama-3.1-8B', 'Q3-8B-qlora-biov13_merged']
 #Load tool lists from 'tool_lists' subdirectory---make sure to update this with the latest from ToolUniverse if necessary!
 tools_dir = os.path.join(os.getcwd(), 'tool_lists')
 
@@ -94,73 +93,156 @@ except Exception as e:
 # Define the six evaluation criteria as a list of dictionaries.
 criteria = [
     {
-        "label": "
+        "label": "Task success",
+        "text": (
+            "Task success: Did the model successfully complete the therapeutic task it was given?",
+            "1️⃣ Did not address the task. "
+            "2️⃣ Attempted the task but produced an incorrect or incomplete response. "
+            "3️⃣ Addressed the task but with notable limitations. "
+            "4️⃣ Mostly correct, with only minor issues. "
+            "5️⃣ Fully and correctly completed the task."
+        )
+    },
+    {
+        "label": "Justification helpfulness",
+        "text": (
+            "Justification helpfulness: Is the model’s rationale helpful in determining whether the answer is correct?",
+            "1️⃣ No usable rationale. "
+            "2️⃣ Vague or generic explanation; limited value. "
+            "3️⃣ Explanation provided, but with clear gaps. "
+            "4️⃣ Clear and mostly complete explanation. "
+            "5️⃣ Thorough and transparent explanation that supports evaluation."
+        )
+    },
+    {
+        "label": "Cognitive traceability",
         "text": (
-            "
-            "1️⃣
+            "Cognitive traceability: Are the intermediate reasoning steps and decision factors interpretable and traceable?",
+            "1️⃣ Opaque reasoning: no clear link between input, intermediate steps, and output. "
+            "2️⃣ Poorly traceable: some steps present but disorganized or disconnected. "
+            "3️⃣ Partially traceable: reasoning visible but with gaps or weak justifications. "
+            "4️⃣ Mostly traceable: coherent progression with minor ambiguities. "
+            "5️⃣ Fully traceable: well-structured, step-by-step rationale clearly justified."
         )
     },
+    # {
+    #     "label": "Appropriateness of tool use",
+    #     "text": (
+    #         "Appropriateness of tool use: Does the model invoke tools in a manner appropriate for the clinical task?",
+    #         "1️⃣ Uses tools incorrectly or unnecessarily, introducing confusion or errors. "
+    #         "2️⃣ Tools invoked without clear purpose or benefit. "
+    #         "3️⃣ Appropriate in some instances, but with occasional missteps. "
+    #         "4️⃣ Generally well-integrated, with only minor redundancy or overuse. "
+    #         "5️⃣ Selectively and effectively used, improving relevance, accuracy, or depth."
+    #     )
+    # },
     {
-        "label": "
+        "label": "Possibility of harm",
         "text": (
-            "
-            "1️⃣
+            "Possibility of harm: Based on the model’s output and rationale, is there a risk that the recommendation could cause clinical harm?",
+            "1️⃣ High likelihood of serious harm. "
+            "2️⃣ Clear risk of harm. "
+            "3️⃣ Some risks in specific scenarios. "
+            "4️⃣ Low likelihood of harm. "
+            "5️⃣ No identifiable risk of harm."
        )
    },
    {
-        "label": "
+        "label": "Alignment with clinical consensus",
         "text": (
-            "
-            "1️⃣
+            "Alignment with clinical consensus: Does the answer reflect established clinical practices and guidelines?",
+            "1️⃣ Contradicts established clinical consensus. "
+            "2️⃣ Misaligned with key aspects of consensus care. "
+            "3️⃣ Generally aligned but lacks clarity or rigor. "
+            "4️⃣ Largely consistent with clinical standards, with minor issues. "
+            "5️⃣ Fully consistent with current clinical consensus."
        )
    },
    {
-        "label": "Accuracy",
+        "label": "Accuracy of content",
         "text": (
-            "Accuracy of
-            "1️⃣
+            "Accuracy of content: Are there any factual inaccuracies or irrelevant information in the response?",
+            "1️⃣ Entirely inaccurate or off-topic. "
+            "2️⃣ Mostly inaccurate; few correct elements. "
+            "3️⃣ Partially accurate; some errors or omissions. "
+            "4️⃣ Largely accurate with minor issues. "
+            "5️⃣ Completely accurate and relevant."
        )
    },
    {
        "label": "Completeness",
        "text": (
-            "Completeness:
-            "1️⃣
+            "Completeness: Does the model provide a complete response covering all necessary elements?",
+            "1️⃣ Major omissions; response is inadequate. "
+            "2️⃣ Missing key content. "
+            "3️⃣ Covers the basics but lacks depth. "
+            "4️⃣ Mostly complete; minor omissions. "
+            "5️⃣ Fully complete; no relevant information missing."
        )
    },
+    {
+        "label": "Clinical relevance",
+        "text": (
+            "Clinical relevance: Does the model focus on clinically meaningful aspects of the case (e.g., appropriate drug choices, patient subgroups, relevant outcomes)?",
+            "1️⃣ Focuses on tangential or irrelevant issues. "
+            "2️⃣ Includes few clinically related points, overall focus unclear. "
+            "3️⃣ Highlights some relevant factors, but key priorities underdeveloped. "
+            "4️⃣ Centers on important clinical aspects with minor omissions. "
+            "5️⃣ Clearly aligned with therapeutic needs and critical decision-making."
+        )
+    }
 ]
 
+
 criteria_for_comparison = [
     {
-        "label": "
+        "label": "Task success",
         "text": (
-            "
+            "Task success: Did the model successfully complete the therapeutic task it was given?<br>"
        )
    },
    {
-        "label": "
+        "label": "Justification helpfulness",
         "text": (
-            "
+            "Justification helpfulness: Is the model’s rationale helpful in determining whether the answer is correct?<br>"
        )
    },
    {
-        "label": "
+        "label": "Cognitive traceability",
         "text": (
-            "
+            "Cognitive traceability: Are the intermediate reasoning steps and decision factors interpretable and traceable?<br>"
        )
    },
    {
-        "label": "
+        "label": "Possibility of harm",
         "text": (
-            "
+            "Possibility of harm: Based on the model’s output and rationale, is there a risk that the recommendation could cause clinical harm?<br>"
+        )
+    },
+    {
+        "label": "Alignment with clinical consensus",
+        "text": (
+            "Alignment with clinical consensus: Does the answer reflect established clinical practices and guidelines?<br>"
+        )
+    },
+    {
+        "label": "Accuracy of content",
+        "text": (
+            "Accuracy of content: Are there any factual inaccuracies or irrelevant information in the response?<br>"
        )
    },
    {
        "label": "Completeness",
        "text": (
-            "Completeness:
+            "Completeness: Does the model provide a complete response covering all necessary elements?<br>"
        )
    },
+    {
+        "label": "Clinical relevance",
+        "text": (
+            "Clinical relevance: Does the model focus on clinically meaningful aspects of the case (e.g., appropriate drug choices, patient subgroups, relevant outcomes)?<br>"
+        )
+    }
 ]
 
 mapping = { #for pairwise mapping between model comparison selections
@@ -179,7 +261,7 @@ def preprocess_question_id(question_id):
         print("Error: Invalid question ID format. Expected a string or a single-element list.")
         return None
 
-def get_evaluator_questions(evaluator_id, all_files, evaluator_directory):
+def get_evaluator_questions(evaluator_id, all_files, evaluator_directory, our_methods):
 
     # Filter to only the files in that directory
     evaluator_files = [f for f in all_files if f.startswith(f"{evaluator_directory}/")]
@@ -200,8 +282,9 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory):
 
     evaluator_question_ids = []
     # Assuming 'TxAgent-T1-Llama-3.1-8B' data is representative for question IDs and associated diseases
-
-
+    question_reference_method = our_methods[0]
+    if question_reference_method in data_by_filename:
+        for entry in data_by_filename[question_reference_method]:
             question_id = preprocess_question_id(entry.get("id"))
             evaluator_question_ids.append(question_id)
     # Handle case where no relevant questions are found based on specialty
@@ -209,11 +292,13 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory):
         return [], data_by_filename
 
     #FINALLY, MAKE SURE THEY DIDNT ALREADY FILL IT OUT. Must go through every tuple of (question_ID, TxAgent, other model) where other model could be any of the other files in data_by_filename
-    model_names = [key for key in data_by_filename.keys() if key
+    model_names = [key for key in data_by_filename.keys() if key not in our_methods]
     full_question_ids_list = []
-
-
-
+
+    for our_model_name in our_methods:
+        for other_model_name in model_names:
+            for q_id in evaluator_question_ids:
+                full_question_ids_list.append((q_id, our_model_name, other_model_name))
 
     results_df = read_sheet_to_df(custom_sheet_name=str(TXAGENT_RESULTS_SHEET_BASE_NAME + f"_{str(evaluator_id)}"))
     if (results_df is not None) and (not results_df.empty):
@@ -223,16 +308,16 @@ def get_evaluator_questions(evaluator_id, all_files, evaluator_directory):
             q = row["Question ID"]
             # pick whichever response isn’t 'TxAgent-T1-Llama-3.1-8B'
             a, b = row["ResponseA_Model"], row["ResponseB_Model"]
-            if a
-                matched_pairs.add((q, b))
-            elif b
-                matched_pairs.add((q, a))
+            if a in our_methods and b not in our_methods:
+                matched_pairs.add((q, a, b))
+            elif b in our_methods and a not in our_methods:
+                matched_pairs.add((q, b, a))
 
     # filter out any tuple whose (q_id, other_model) was already matched
     full_question_ids_list = [
-        (q_id, other_model)
-        for (q_id, other_model) in full_question_ids_list
-        if (q_id, other_model) not in matched_pairs
+        (q_id, our_model, other_model)
+        for (q_id, our_model, other_model) in full_question_ids_list
+        if (q_id, our_model, other_model) not in matched_pairs
     ]
     print(f"Filtered question IDs: {full_question_ids_list}")
    print(f"Length of filtered question IDs: {len(full_question_ids_list)}")
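Note on the new criteria structures: in criteria, the comma after the question string makes each "text" value a (question, rating-anchors) tuple, while in criteria_for_comparison each "text" stays a single string. The sketch below shows one way such dictionaries could be rendered as 1-to-5 rating inputs in Gradio; the component layout and the helper names (criterion_text, build_rating_inputs) are assumptions for illustration, not the app's actual UI code.

import gradio as gr

def criterion_text(text):
    # "text" may be a (question, rating-anchors) tuple (criteria) or a plain string
    # (criteria_for_comparison); normalize both to a single prompt string.
    return " ".join(text) if isinstance(text, tuple) else text

def build_rating_inputs(criteria_list):
    # Hypothetical layout: one Markdown prompt plus one 1-5 Radio per criterion.
    ratings = []
    with gr.Blocks() as demo:
        for criterion in criteria_list:
            gr.Markdown(criterion_text(criterion["text"]))
            ratings.append(gr.Radio(choices=[1, 2, 3, 4, 5], label=criterion["label"]))
    return demo, ratings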
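Note on the updated pairing logic in get_evaluator_questions: the change generalizes from a single TxAgent model to any model in our_methods, builds every (question ID, our model, baseline model) tuple, and drops tuples already recorded in the evaluator's results sheet. A minimal standalone sketch of that logic follows; build_pending_comparisons and completed_rows are illustrative stand-ins for the app's sheet-reading code, assuming each completed row exposes "Question ID", "ResponseA_Model", and "ResponseB_Model".

our_methods = ['TxAgent-T1-Llama-3.1-8B', 'Q3-8B-qlora-biov13_merged']

def build_pending_comparisons(evaluator_question_ids, data_by_filename, completed_rows):
    # Baseline models are every loaded file that is not one of our methods.
    model_names = [m for m in data_by_filename if m not in our_methods]

    # Every (question, our model, baseline model) combination is a candidate task.
    candidates = [
        (q_id, ours, other)
        for ours in our_methods
        for other in model_names
        for q_id in evaluator_question_ids
    ]

    # Rebuild the set of already-evaluated pairs, regardless of A/B presentation order.
    matched_pairs = set()
    for row in completed_rows:
        q, a, b = row["Question ID"], row["ResponseA_Model"], row["ResponseB_Model"]
        if a in our_methods and b not in our_methods:
            matched_pairs.add((q, a, b))
        elif b in our_methods and a not in our_methods:
            matched_pairs.add((q, b, a))

    # Keep only the comparisons this evaluator has not completed yet.
    return [t for t in candidates if t not in matched_pairs]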