Spaces:
Running
Running
Improved result.json to include API call details; added a new prompt
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitignore +10 -0
- .vscode/launch.json +15 -0
- __pycache__/analysis_utils.cpython-311.pyc +0 -0
- __pycache__/app.cpython-311.pyc +0 -0
- __pycache__/query_comp.cpython-311.pyc +0 -0
- __pycache__/view_utils.cpython-311.pyc +0 -0
- eval_models_list.txt +2 -0
- eval_prompt_list.txt +1 -0
- modules/__pycache__/nav.cpython-311.pyc +0 -0
- pages/__pycache__/see_results.cpython-311.pyc +0 -0
- streamlit_app_local/__pycache__/analysis_utils.cpython-311.pyc +0 -0
- streamlit_app_local/__pycache__/analysis_utils.cpython-38.pyc +0 -0
- streamlit_app_local/__pycache__/app.cpython-311.pyc +0 -0
- streamlit_app_local/__pycache__/app.cpython-38.pyc +0 -0
- streamlit_app_local/__pycache__/query_comp.cpython-311.pyc +0 -0
- streamlit_app_local/__pycache__/view_utils.cpython-311.pyc +0 -0
- streamlit_app_local/__pycache__/view_utils.cpython-38.pyc +0 -0
- streamlit_app_local/eval_models_list.txt +0 -3
- streamlit_app_local/eval_models_list.txt +1 -0
- streamlit_app_local/eval_prompt_list.txt +0 -5
- streamlit_app_local/eval_prompt_list.txt +1 -0
- streamlit_app_local/modules/__pycache__/nav.cpython-311.pyc +0 -0
- streamlit_app_local/modules/__pycache__/nav.cpython-38.pyc +0 -0
- streamlit_app_local/pages/see_results.py +3 -1
- varco_arena/__pycache__/calc_cost.cpython-311.pyc +0 -0
- varco_arena/__pycache__/calc_cost.cpython-38.pyc +0 -0
- varco_arena/main.py +0 -7
- varco_arena/varco_arena_core/__pycache__/__init__.cpython-311.pyc +0 -0
- varco_arena/varco_arena_core/__pycache__/custom_input_utils.cpython-311.pyc +0 -0
- varco_arena/varco_arena_core/__pycache__/data_utils.cpython-311.pyc +0 -0
- varco_arena/varco_arena_core/__pycache__/elo.cpython-311.pyc +0 -0
- varco_arena/varco_arena_core/__pycache__/eval_utils.cpython-311.pyc +0 -0
- varco_arena/varco_arena_core/__pycache__/league.cpython-311.pyc +0 -0
- varco_arena/varco_arena_core/__pycache__/manager.cpython-311.pyc +0 -0
- varco_arena/varco_arena_core/__pycache__/match.cpython-311.pyc +0 -0
- varco_arena/varco_arena_core/__pycache__/tournament.cpython-311.pyc +0 -0
- varco_arena/varco_arena_core/__pycache__/tracking_utils.cpython-311.pyc +0 -0
- varco_arena/varco_arena_core/__pycache__/visualization.cpython-311.pyc +0 -0
- varco_arena/varco_arena_core/eval_utils.py +1 -0
- varco_arena/varco_arena_core/league.py +1 -0
- varco_arena/varco_arena_core/match.py +0 -1
- varco_arena/varco_arena_core/prompts/__init__.py +3 -1
- varco_arena/varco_arena_core/prompts/__pycache__/__init__.cpython-311.pyc +0 -0
- varco_arena/varco_arena_core/prompts/__pycache__/base_prompt.cpython-311.pyc +0 -0
- varco_arena/varco_arena_core/prompts/__pycache__/llmbar.cpython-311.pyc +0 -0
- varco_arena/varco_arena_core/prompts/__pycache__/llmbar_brief.cpython-311.pyc +0 -0
- varco_arena/varco_arena_core/prompts/__pycache__/naive_ab.cpython-311.pyc +0 -0
- varco_arena/varco_arena_core/prompts/__pycache__/prompt_utils.cpython-311.pyc +0 -0
- varco_arena/varco_arena_core/prompts/__pycache__/rag_pair.cpython-311.pyc +0 -0
- varco_arena/varco_arena_core/prompts/__pycache__/rag_pair_kr.cpython-311.pyc +0 -0
.gitignore
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Ignore all subdirectories in user_submit
|
| 2 |
+
**/user_submit/*/
|
| 3 |
+
**/__pycache__/
|
| 4 |
+
**/*.pyc
|
| 5 |
+
|
| 6 |
+
# But re-include these four
|
| 7 |
+
!**/user_submit/llm/
|
| 8 |
+
!**/user_submit/rag/
|
| 9 |
+
!**/user_submit/mt/
|
| 10 |
+
!**/user_submit/12-02-14:29:30/
|
.vscode/launch.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
// Use IntelliSense to learn about possible attributes.
|
| 3 |
+
// Hover to view descriptions of existing attributes.
|
| 4 |
+
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
| 5 |
+
"version": "0.2.0",
|
| 6 |
+
"configurations": [
|
| 7 |
+
{
|
| 8 |
+
"name": "Python Debugger: Current File",
|
| 9 |
+
"type": "debugpy",
|
| 10 |
+
"request": "launch",
|
| 11 |
+
"program": "${file}",
|
| 12 |
+
"console": "integratedTerminal"
|
| 13 |
+
}
|
| 14 |
+
]
|
| 15 |
+
}
|
__pycache__/analysis_utils.cpython-311.pyc
DELETED
|
Binary file (17.7 kB)
|
|
|
__pycache__/app.cpython-311.pyc
DELETED
|
Binary file (22.3 kB)
|
|
|
__pycache__/query_comp.cpython-311.pyc
DELETED
|
Binary file (7.99 kB)
|
|
|
__pycache__/view_utils.cpython-311.pyc
DELETED
|
Binary file (18.3 kB)
|
|
|
eval_models_list.txt
CHANGED
|
@@ -1,3 +1,5 @@
|
|
| 1 |
gpt-4o-mini
|
| 2 |
gpt-4o-2024-05-13
|
| 3 |
gpt-4o-2024-08-06
|
|
|
|
|
|
|
|
|
| 1 |
gpt-4o-mini
|
| 2 |
gpt-4o-2024-05-13
|
| 3 |
gpt-4o-2024-08-06
|
| 4 |
+
gpt-4.1
|
| 5 |
+
gpt-4.1-mini
|
eval_prompt_list.txt
CHANGED
|
@@ -1,3 +1,4 @@
|
|
| 1 |
llmbar
|
| 2 |
translation_pair
|
| 3 |
rag_pair_kr
|
|
|
|
|
|
| 1 |
llmbar
|
| 2 |
translation_pair
|
| 3 |
rag_pair_kr
|
| 4 |
+
translation_new
|
modules/__pycache__/nav.cpython-311.pyc
DELETED
|
Binary file (3.8 kB)
|
|
|
pages/__pycache__/see_results.cpython-311.pyc
DELETED
|
Binary file (26.3 kB)
|
|
|
streamlit_app_local/__pycache__/analysis_utils.cpython-311.pyc
DELETED
|
Binary file (17.7 kB)
|
|
|
streamlit_app_local/__pycache__/analysis_utils.cpython-38.pyc
DELETED
|
Binary file (9.12 kB)
|
|
|
streamlit_app_local/__pycache__/app.cpython-311.pyc
DELETED
|
Binary file (15.9 kB)
|
|
|
streamlit_app_local/__pycache__/app.cpython-38.pyc
DELETED
|
Binary file (6.32 kB)
|
|
|
streamlit_app_local/__pycache__/query_comp.cpython-311.pyc
DELETED
|
Binary file (8 kB)
|
|
|
streamlit_app_local/__pycache__/view_utils.cpython-311.pyc
DELETED
|
Binary file (18.3 kB)
|
|
|
streamlit_app_local/__pycache__/view_utils.cpython-38.pyc
DELETED
|
Binary file (9.91 kB)
|
|
|
streamlit_app_local/eval_models_list.txt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
gpt-4o-mini
|
| 2 |
-
gpt-4o-2024-05-13
|
| 3 |
-
gpt-4o-2024-08-06
|
|
|
|
|
|
|
|
|
|
|
|
streamlit_app_local/eval_models_list.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
../eval_models_list.txt
|
streamlit_app_local/eval_prompt_list.txt
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
llmbar
|
| 2 |
-
llmbar_brief
|
| 3 |
-
translation_pair
|
| 4 |
-
rag_pair_kr
|
| 5 |
-
contextual (WIP)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit_app_local/eval_prompt_list.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
../eval_prompt_list.txt
|
streamlit_app_local/modules/__pycache__/nav.cpython-311.pyc
DELETED
|
Binary file (2.85 kB)
|
|
|
streamlit_app_local/modules/__pycache__/nav.cpython-38.pyc
DELETED
|
Binary file (889 Bytes)
|
|
|
streamlit_app_local/pages/see_results.py
CHANGED
|
@@ -2,6 +2,7 @@ import pandas as pd
|
|
| 2 |
import streamlit as st
|
| 3 |
from varco_arena_core.prompts import load_prompt
|
| 4 |
|
|
|
|
| 5 |
import analysis_utils as au
|
| 6 |
from analysis_utils import number_breakdown_from_df
|
| 7 |
from app import VA_ROOT
|
|
@@ -248,11 +249,12 @@ def main():
|
|
| 248 |
out_b="{out_b}",
|
| 249 |
task=task,
|
| 250 |
)
|
| 251 |
-
if eval_prompt_name
|
| 252 |
kwargs["source_lang"] = "{source_lang}"
|
| 253 |
kwargs["target_lang"] = "{target_lang}"
|
| 254 |
prompt_cmpl = prompt.complete_prompt(**kwargs)
|
| 255 |
for msg in prompt_cmpl:
|
|
|
|
| 256 |
st.markdown(f"**{msg['role']}**")
|
| 257 |
st.info(show_linebreak_in_md(escape_markdown(msg["content"])))
|
| 258 |
|
|
|
|
| 2 |
import streamlit as st
|
| 3 |
from varco_arena_core.prompts import load_prompt
|
| 4 |
|
| 5 |
+
from pprint import pprint
|
| 6 |
import analysis_utils as au
|
| 7 |
from analysis_utils import number_breakdown_from_df
|
| 8 |
from app import VA_ROOT
|
|
|
|
| 249 |
out_b="{out_b}",
|
| 250 |
task=task,
|
| 251 |
)
|
| 252 |
+
if eval_prompt_name in ["translation_pair", "translation_new"]:
|
| 253 |
kwargs["source_lang"] = "{source_lang}"
|
| 254 |
kwargs["target_lang"] = "{target_lang}"
|
| 255 |
prompt_cmpl = prompt.complete_prompt(**kwargs)
|
| 256 |
for msg in prompt_cmpl:
|
| 257 |
+
pprint(msg)
|
| 258 |
st.markdown(f"**{msg['role']}**")
|
| 259 |
st.info(show_linebreak_in_md(escape_markdown(msg["content"])))
|
| 260 |
|
varco_arena/__pycache__/calc_cost.cpython-311.pyc
DELETED
|
Binary file (5.11 kB)
|
|
|
varco_arena/__pycache__/calc_cost.cpython-38.pyc
DELETED
|
Binary file (2.88 kB)
|
|
|
varco_arena/main.py
CHANGED
|
@@ -134,13 +134,6 @@ if __name__ == "__main__":
|
|
| 134 |
"-p",
|
| 135 |
"--evalprompt",
|
| 136 |
default="llmbar_brief",
|
| 137 |
-
choices=[
|
| 138 |
-
"llmbar_brief",
|
| 139 |
-
"llmbar",
|
| 140 |
-
"translation_pair",
|
| 141 |
-
"rag_pair_kr",
|
| 142 |
-
# "contextual_pair",
|
| 143 |
-
],
|
| 144 |
)
|
| 145 |
|
| 146 |
parser.add_argument(
|
|
|
|
| 134 |
"-p",
|
| 135 |
"--evalprompt",
|
| 136 |
default="llmbar_brief",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
)
|
| 138 |
|
| 139 |
parser.add_argument(
|
varco_arena/varco_arena_core/__pycache__/__init__.cpython-311.pyc
DELETED
|
Binary file (219 Bytes)
|
|
|
varco_arena/varco_arena_core/__pycache__/custom_input_utils.cpython-311.pyc
DELETED
|
Binary file (331 Bytes)
|
|
|
varco_arena/varco_arena_core/__pycache__/data_utils.cpython-311.pyc
DELETED
|
Binary file (8.07 kB)
|
|
|
varco_arena/varco_arena_core/__pycache__/elo.cpython-311.pyc
DELETED
|
Binary file (4.78 kB)
|
|
|
varco_arena/varco_arena_core/__pycache__/eval_utils.cpython-311.pyc
DELETED
|
Binary file (7.32 kB)
|
|
|
varco_arena/varco_arena_core/__pycache__/league.cpython-311.pyc
DELETED
|
Binary file (4.12 kB)
|
|
|
varco_arena/varco_arena_core/__pycache__/manager.cpython-311.pyc
DELETED
|
Binary file (9.54 kB)
|
|
|
varco_arena/varco_arena_core/__pycache__/match.cpython-311.pyc
DELETED
|
Binary file (9.29 kB)
|
|
|
varco_arena/varco_arena_core/__pycache__/tournament.cpython-311.pyc
DELETED
|
Binary file (7.19 kB)
|
|
|
varco_arena/varco_arena_core/__pycache__/tracking_utils.cpython-311.pyc
DELETED
|
Binary file (9.42 kB)
|
|
|
varco_arena/varco_arena_core/__pycache__/visualization.cpython-311.pyc
DELETED
|
Binary file (8.91 kB)
|
|
|
varco_arena/varco_arena_core/eval_utils.py
CHANGED
|
@@ -138,6 +138,7 @@ async def async_query_openai(
|
|
| 138 |
|
| 139 |
increase_match_count() # you're hacky Jumin...
|
| 140 |
|
|
|
|
| 141 |
return normalized_result, resp
|
| 142 |
|
| 143 |
|
|
|
|
| 138 |
|
| 139 |
increase_match_count() # you're hacky Jumin...
|
| 140 |
|
| 141 |
+
normalized_result["api_call_kwargs"] = kwargs
|
| 142 |
return normalized_result, resp
|
| 143 |
|
| 144 |
|
varco_arena/varco_arena_core/league.py
CHANGED
|
@@ -59,6 +59,7 @@ class League:
|
|
| 59 |
"round": "league",
|
| 60 |
"match_order_in_round": "league",
|
| 61 |
"tstamp": now_time,
|
|
|
|
| 62 |
# "logs": match.match_metainfo_log[0],
|
| 63 |
},
|
| 64 |
]
|
|
|
|
| 59 |
"round": "league",
|
| 60 |
"match_order_in_round": "league",
|
| 61 |
"tstamp": now_time,
|
| 62 |
+
"api_call_kwargs": match_result[0]["api_call_kwargs"],
|
| 63 |
# "logs": match.match_metainfo_log[0],
|
| 64 |
},
|
| 65 |
]
|
varco_arena/varco_arena_core/match.py
CHANGED
|
@@ -6,7 +6,6 @@ from typing import Any, Dict, List, Literal, Optional, Tuple, Union
|
|
| 6 |
|
| 7 |
from .eval_utils import async_eval_w_prompt
|
| 8 |
|
| 9 |
-
|
| 10 |
class Match:
|
| 11 |
def __init__(
|
| 12 |
self,
|
|
|
|
| 6 |
|
| 7 |
from .eval_utils import async_eval_w_prompt
|
| 8 |
|
|
|
|
| 9 |
class Match:
|
| 10 |
def __init__(
|
| 11 |
self,
|
varco_arena/varco_arena_core/prompts/__init__.py
CHANGED
|
@@ -8,12 +8,14 @@ from .llmbar import LLMBarPrompt
|
|
| 8 |
from .llmbar_brief import LLMBarBriefPrompt
|
| 9 |
from .rag_pair_kr import RagPairKRPrompt
|
| 10 |
from .translation_pair import TranslationPairPrompt
|
|
|
|
| 11 |
|
| 12 |
NAME2PROMPT_CLS = dict(
|
| 13 |
llmbar_brief=LLMBarBriefPrompt(),
|
| 14 |
llmbar=LLMBarPrompt(),
|
| 15 |
translation_pair=TranslationPairPrompt(),
|
| 16 |
rag_pair_kr=RagPairKRPrompt(),
|
|
|
|
| 17 |
# contextual_vqa = Contextual_VQA(),
|
| 18 |
# contextual_ocr = Contextual_OCR(),
|
| 19 |
)
|
|
@@ -24,8 +26,8 @@ def load_prompt(
|
|
| 24 |
"llmbar_brief",
|
| 25 |
"llmbar",
|
| 26 |
"translation_pair",
|
|
|
|
| 27 |
"rag_pair_kr",
|
| 28 |
-
# "contextual_pair"
|
| 29 |
],
|
| 30 |
task: str = "", # used for further prompt variation (eval prompt might depend on task.)
|
| 31 |
):
|
|
|
|
| 8 |
from .llmbar_brief import LLMBarBriefPrompt
|
| 9 |
from .rag_pair_kr import RagPairKRPrompt
|
| 10 |
from .translation_pair import TranslationPairPrompt
|
| 11 |
+
from .translation_new import TranslationNewPrompt
|
| 12 |
|
| 13 |
NAME2PROMPT_CLS = dict(
|
| 14 |
llmbar_brief=LLMBarBriefPrompt(),
|
| 15 |
llmbar=LLMBarPrompt(),
|
| 16 |
translation_pair=TranslationPairPrompt(),
|
| 17 |
rag_pair_kr=RagPairKRPrompt(),
|
| 18 |
+
translation_new=TranslationNewPrompt(),
|
| 19 |
# contextual_vqa = Contextual_VQA(),
|
| 20 |
# contextual_ocr = Contextual_OCR(),
|
| 21 |
)
|
|
|
|
| 26 |
"llmbar_brief",
|
| 27 |
"llmbar",
|
| 28 |
"translation_pair",
|
| 29 |
+
"translation_new",
|
| 30 |
"rag_pair_kr",
|
|
|
|
| 31 |
],
|
| 32 |
task: str = "", # used for further prompt variation (eval prompt might depend on task.)
|
| 33 |
):
|
varco_arena/varco_arena_core/prompts/__pycache__/__init__.cpython-311.pyc
DELETED
|
Binary file (1.44 kB)
|
|
|
varco_arena/varco_arena_core/prompts/__pycache__/base_prompt.cpython-311.pyc
DELETED
|
Binary file (6.07 kB)
|
|
|
varco_arena/varco_arena_core/prompts/__pycache__/llmbar.cpython-311.pyc
DELETED
|
Binary file (7.29 kB)
|
|
|
varco_arena/varco_arena_core/prompts/__pycache__/llmbar_brief.cpython-311.pyc
DELETED
|
Binary file (1.48 kB)
|
|
|
varco_arena/varco_arena_core/prompts/__pycache__/naive_ab.cpython-311.pyc
DELETED
|
Binary file (1.47 kB)
|
|
|
varco_arena/varco_arena_core/prompts/__pycache__/prompt_utils.cpython-311.pyc
DELETED
|
Binary file (7.55 kB)
|
|
|
varco_arena/varco_arena_core/prompts/__pycache__/rag_pair.cpython-311.pyc
DELETED
|
Binary file (2.28 kB)
|
|
|
varco_arena/varco_arena_core/prompts/__pycache__/rag_pair_kr.cpython-311.pyc
DELETED
|
Binary file (2.29 kB)
|
|
|