Update app.py
app.py
CHANGED
@@ -20,7 +20,6 @@ import openai
 import threading
 import time
 from collections import Counter
-from model_suggestions import add_suggestion, get_suggestions_html
 from release_notes import get_release_notes_html
 
 
@@ -85,7 +84,7 @@ def call_ollama_api(model, prompt):
     )
 
     try:
-        logger.info("Starting API call")
+        logger.info(f"Starting API call for model: {model}")
         response = client.chat.completions.create(
             model=model,
             messages=[
@@ -100,10 +99,10 @@ def call_ollama_api(model, prompt):
             ],
             timeout=180
         )
-        logger.info("Received response")
+        logger.info(f"Received response for model: {model}")
 
         if not response or not response.choices:
-            logger.error("Empty response received")
+            logger.error(f"Empty response received for model: {model}")
             return [
                 {"role": "user", "content": prompt},
                 {"role": "assistant", "content": "Error: Empty response from the model"}
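For orientation: call_ollama_api talks to Ollama through the OpenAI-compatible client, and timeout=180 is passed per request. The client construction sits outside this diff; a minimal sketch of the assumed setup (the base URL, API key, and model name are placeholders, not values taken from app.py):

```python
import openai

# Ollama exposes an OpenAI-compatible endpoint; the URL and key here are
# conventional defaults, not values confirmed by this diff.
client = openai.OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

response = client.chat.completions.create(
    model="llama3.1",  # placeholder model name
    messages=[{"role": "user", "content": "Hello"}],
    timeout=180,  # same per-request timeout as in the diff
)
print(response.choices[0].message.content)
```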
@@ -111,7 +110,7 @@ def call_ollama_api(model, prompt):
 
         content = response.choices[0].message.content
         if not content:
-            logger.error("Empty content received")
+            logger.error(f"Empty content received for model: {model}")
             return [
                 {"role": "user", "content": prompt},
                 {"role": "assistant", "content": "Error: Empty content from the model"}
@@ -124,30 +123,37 @@ def call_ollama_api(model, prompt):
             thinking_content = thinking_match.group(1).strip()
             main_content = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL).strip()
 
-            logger.info("Found thinking content")
+            logger.info(f"Found thinking content for model: {model}")
             return [
                 {"role": "user", "content": prompt},
                 {"role": "assistant", "content": f"{main_content}\n\n<details><summary>🤔 View thinking process</summary>\n\n{thinking_content}\n\n</details>"}
             ]
 
         # If no thinking tags, return normal content
-        logger.info("No thinking tags found")
+        logger.info(f"No thinking tags found for model: {model}")
         return [
             {"role": "user", "content": prompt},
             {"role": "assistant", "content": content.strip()}
         ]
 
     except requests.exceptions.Timeout:
-        logger.error("Timeout error after 180 seconds")
+        logger.error(f"Timeout error after 180 seconds for model: {model}")
         return [
             {"role": "user", "content": prompt},
             {"role": "assistant", "content": "Error: Model response timed out after 180 seconds"}
         ]
+    except openai.BadRequestError as e:
+        error_msg = str(e)
+        logger.error(f"Bad request error for model: {model}. Error: {error_msg}")
+        return [
+            {"role": "user", "content": prompt},
+            {"role": "assistant", "content": "Error: Unable to get response from the model"}
+        ]
     except Exception as e:
-        logger.error(f"Error calling Ollama API: {str(e)}", exc_info=True)
+        logger.error(f"Error calling Ollama API for model: {model}. Error: {str(e)}", exc_info=True)
         return [
             {"role": "user", "content": prompt},
-            {"role": "assistant", "content":
+            {"role": "assistant", "content": "Error: Unable to get response from the model"}
         ]
 
 # Generate responses using two randomly selected models
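Two things are worth noting in this hunk. First, the new openai.BadRequestError handler is placed before the catch-all except Exception; Python tries except clauses top to bottom, so the more specific handler must come first to ever fire. Second, the thinking-tag handling reduces to one regex search plus one substitution; a self-contained sketch of the same logic (the split_thinking helper is illustrative, not part of app.py):

```python
import re

def split_thinking(content):
    # Pull the first <think>...</think> block out, then strip all such
    # blocks from the visible answer, mirroring the code above.
    match = re.search(r'<think>(.*?)</think>', content, flags=re.DOTALL)
    thinking = match.group(1).strip() if match else None
    visible = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL).strip()
    return visible, thinking

visible, thinking = split_thinking("<think>2 + 2 = 4</think>The answer is 4.")
assert visible == "The answer is 4."
assert thinking == "2 + 2 = 4"
```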
@@ -278,12 +284,11 @@ def record_vote(prompt, left_response, right_response, left_model, right_model,
     return (
         gr.update(value=result_message, visible=True),  # Show result as Markdown
         get_leaderboard(),  # Update leaderboard
-        get_elo_leaderboard(),
+        get_elo_leaderboard(),  # Update ELO leaderboard
         gr.update(interactive=False),  # Disable left vote button
         gr.update(interactive=False),  # Disable right vote button
         gr.update(interactive=False),  # Disable tie button
-        gr.update(visible=True),
-        get_leaderboard_chart()  # Update leaderboard chart
+        gr.update(visible=True)  # Show model names
     )
 
 def get_leaderboard_chart():
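record_vote's return tuple is consumed positionally: item 0 goes to the first component in the outputs list, item 1 to the second, and so on, which is why keeping get_elo_leaderboard() here requires elo_leaderboard in every caller's outputs (see the click handlers later in this diff). A stripped-down sketch of the pattern, with illustrative component names:

```python
import gradio as gr

def vote():
    # One return value per output component, in order.
    return gr.update(value="Vote recorded", visible=True), gr.update(interactive=False)

with gr.Blocks() as demo:
    result = gr.Markdown(visible=False)
    vote_btn = gr.Button("Vote")
    # Tuple position 0 -> result, position 1 -> vote_btn.
    vote_btn.click(vote, outputs=[result, vote_btn])
```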
@@ -426,7 +431,20 @@ with gr.Blocks(css="""
 
     # Leaderboard Tab (now first)
     with gr.Tab("Leaderboard"):
-
+        gr.Markdown("""
+        ### Main Leaderboard
+        This leaderboard uses a scoring system that balances win rate and total battles. The score is calculated using the formula:
+        **Score = Win Rate * (1 - 1 / (Total Battles + 1))**
+
+        This formula rewards models with higher win rates and more battles. As the number of battles increases, the score approaches the win rate.
+        """)
+        leaderboard = gr.Dataframe(
+            headers=["Model", "Score", "Wins", "Losses", "Total Battles", "Win Rate"],
+            row_count=10,
+            col_count=6,
+            interactive=False,
+            label="Leaderboard"
+        )
 
     # Battle Arena Tab (now second)
     with gr.Tab("Battle Arena"):
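As a quick check on the formula: with a 60% win rate, the score is 0.30 after one battle, about 0.55 after ten, and approaches 0.60 as battles accumulate, so low-volume flukes are damped without capping well-tested models:

```python
def score(win_rate, total_battles):
    # Score = Win Rate * (1 - 1 / (Total Battles + 1))
    return win_rate * (1 - 1 / (total_battles + 1))

print(score(0.6, 1))    # 0.30
print(score(0.6, 10))   # ~0.545
print(score(0.6, 100))  # ~0.594
```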
@@ -473,50 +491,23 @@ with gr.Blocks(css="""
 
         new_battle_btn = gr.Button("New Battle")
 
-    # Performance Chart Tab
-    with gr.Tab("Performance Chart"):
-        leaderboard_chart = gr.Plot(label="Model Performance Chart")
-
     # ELO Leaderboard Tab
     with gr.Tab("ELO Leaderboard"):
-
-
-
-
-        with
-
-
-
-
-
-
-
-        suggestion_status = gr.Markdown("Submit a model to see it listed below!")
-        suggestions_list = gr.HTML(get_suggestions_html())
-        refresh_suggestions_btn = gr.Button("Refresh List")
-
-        # Update button click handlers
-        submit_suggestion_btn.click(
-            add_suggestion,
-            inputs=[model_url_input],
-            outputs=[suggestion_status]
-        ).then(
-            lambda: (
-                get_suggestions_html(),  # Update suggestions list
-                ""  # Clear model URL input
-            ),
-            outputs=[
-                suggestions_list,
-                model_url_input
-            ]
-        )
-
-        refresh_suggestions_btn.click(
-            get_suggestions_html,
-            outputs=[suggestions_list]
+        gr.Markdown("""
+        ### ELO Rating System
+        This leaderboard uses a modified ELO rating system that takes into account both the performance and size of the models.
+        Initial ratings are based on model size, with larger models starting at higher ratings.
+        The ELO rating is calculated based on wins and losses, with adjustments made based on the relative strengths of opponents.
+        """)
+        elo_leaderboard = gr.Dataframe(
+            headers=["Model", "ELO Rating", "Wins", "Losses", "Total Battles", "Win Rate"],
+            row_count=10,
+            col_count=6,
+            interactive=False,
+            label="ELO Leaderboard"
         )
 
-    #
+    # Latest Updates Tab
     with gr.Tab("Latest Updates"):
         release_notes = gr.HTML(get_release_notes_html())
         refresh_notes_btn = gr.Button("Refresh Updates")
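The diff only describes the ELO scheme in prose; the exact K-factor and the size-based initial ratings live elsewhere in the codebase. For reference, a textbook ELO update with an assumed K of 32 looks like this:

```python
def expected_score(rating_a, rating_b):
    # Probability that A beats B under the standard ELO model.
    return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

def update_elo(winner, loser, k=32):
    # The winner gains more for beating a higher-rated opponent.
    gain = k * (1 - expected_score(winner, loser))
    return winner + gain, loser - gain

# An upset: a 1000-rated model beats a 1200-rated one (~+24 / -24).
print(update_elo(1000, 1200))
```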
@@ -541,14 +532,14 @@ with gr.Blocks(css="""
         lambda *args: record_vote(*args, "Left is better"),
         inputs=[prompt_input, left_output, right_output, left_model, right_model],
         outputs=[result, leaderboard, elo_leaderboard, left_vote_btn,
-                 right_vote_btn, tie_btn, model_names_row
+                 right_vote_btn, tie_btn, model_names_row]
     )
 
     right_vote_btn.click(
         lambda *args: record_vote(*args, "Right is better"),
         inputs=[prompt_input, left_output, right_output, left_model, right_model],
         outputs=[result, leaderboard, elo_leaderboard, left_vote_btn,
-                 right_vote_btn, tie_btn, model_names_row
+                 right_vote_btn, tie_btn, model_names_row]
     )
 
     tie_btn.click(
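Both vote buttons reuse a single record_vote by baking the verdict in with a lambda rather than defining two near-identical handlers. A minimal sketch of the same wiring, with illustrative names:

```python
import gradio as gr

def record(prompt, verdict):
    return f"{verdict!r} recorded for {prompt!r}"

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    result = gr.Markdown()
    left_btn = gr.Button("Left is better")
    right_btn = gr.Button("Right is better")
    # Same handler; the lambda appends the verdict to the user-supplied args.
    left_btn.click(lambda *a: record(*a, "Left is better"), inputs=[prompt], outputs=[result])
    right_btn.click(lambda *a: record(*a, "Right is better"), inputs=[prompt], outputs=[result])
```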
@@ -561,13 +552,12 @@ with gr.Blocks(css="""
         new_battle,
         outputs=[prompt_input, left_output, right_output, left_model,
                  right_model, left_vote_btn, right_vote_btn, tie_btn,
-                 result, leaderboard, model_names_row,
+                 result, leaderboard, model_names_row, tie_count]
     )
 
-    # Update leaderboard
+    # Update leaderboard on launch
     demo.load(get_leaderboard, outputs=leaderboard)
     demo.load(get_elo_leaderboard, outputs=elo_leaderboard)
-    demo.load(get_leaderboard_chart, outputs=leaderboard_chart)
 
 if __name__ == "__main__":
     # Initialize ELO ratings before launching the app
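demo.load registers callbacks that run each time the page is opened, which is how both leaderboards get populated before any battle happens; with the Performance Chart tab removed, its loader is dropped too. A minimal sketch of the load pattern (the data here is made up):

```python
import gradio as gr

def get_leaderboard():
    # Stand-in for the app's real leaderboard query.
    return [["model-a", 12, 3], ["model-b", 9, 6]]

with gr.Blocks() as demo:
    leaderboard = gr.Dataframe(headers=["Model", "Wins", "Losses"])
    demo.load(get_leaderboard, outputs=leaderboard)  # runs on page load

demo.launch()
```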