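"""Streamlit page that browses Arena-Lite results: overall ratings, per-scenario
tournament brackets, individual match details, and LLM-judge bias figures."""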
import pandas as pd
import streamlit as st

import analysis_utils as au
from analysis_utils import number_breakdown_from_df
from app import load_and_cache_data

# from app import VA_ROOT
from query_comp import QueryWrapper, get_base_url
from varco_arena.varco_arena_core.prompts import load_prompt
from view_utils import (
    default_page_setting,
    escape_markdown,
    set_nav_bar,
    show_linebreak_in_md,
)

DEFAULT_LAYOUT_DICT = {
    "title": {"font": {"size": 20, "family": "Gothic A1"}},
    "font": {"size": 16, "family": "Gothic A1"},
    "xaxis": {"tickfont": {"size": 12, "family": "Gothic A1"}},
    "yaxis": {"tickfont": {"size": 12, "family": "Gothic A1"}},
    "legend": {"font": {"size": 12, "family": "Gothic A1"}},
}
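

# Move the selection stored in st.session_state[key] by `val` steps (-1/+1) within
# the option list `t`, then rerun so the widgets pick up the new value.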
def navigate(t, source, key, val):
    # print(key, val)
    if source is None:
        return
    target_index = t.index(source) + val
    if 0 <= target_index < len(t):
        st.session_state[key] = t[target_index]
        st.rerun()
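

# Page body: choose a result file, task, tournament (test scenario), and match,
# then render ratings, the bracket, match details, and judge-bias statistics.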
def main():
    sidebar_placeholder = default_page_setting(layout="wide")
    set_nav_bar(
        False,
        sidebar_placeholder=sidebar_placeholder,
        toggle_hashstr="see_results_init",
    )

    # load the data
    # print(f"{st.session_state.get('result_file_path', None)=}")
    most_recent_run = st.session_state.get("result_file_path", None)
    most_recent_run = str(most_recent_run) if most_recent_run is not None else None
    (
        st.session_state["all_result_dict"],
        st.session_state["df_dict"],
    ) = load_and_cache_data(result_file_path=most_recent_run)

    # side bar
    st.sidebar.title("Select Result:")
    result_select = QueryWrapper("expname")(
        st.sidebar.selectbox,
        list(st.session_state["all_result_dict"].keys()),
    )
    if result_select is None:
        if st.session_state.korean:
            st.markdown("결과를 확인하려면 먼저 **🔥Arena-Lite를 구동**하셔야 합니다")
        else:
            st.markdown("You should **🔥Run Arena-Lite** first to see results")
        st.image("streamlit_app_local/page_result_1.png")
        st.image("streamlit_app_local/page_result_2.png")
        st.image("streamlit_app_local/page_result_3.png")
        st.image("streamlit_app_local/page_result_4.png")
        st.stop()

    eval_prompt_name = result_select.split("/")[-1].strip()

    if st.sidebar.button("Clear Cache"):
        st.cache_data.clear()
        st.cache_resource.clear()
        st.rerun()
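
    # Everything below renders only when an experiment result has been selected.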
    if result_select:
        if "alpha2names" in st.session_state:
            del st.session_state["alpha2names"]

        fig_dict_per_task = st.session_state["all_result_dict"][result_select]
        task_list = list(fig_dict_per_task.keys())
        elo_rating_by_task = fig_dict_per_task["Overall"]["elo_rating_by_task"]
        # tabs = st.tabs(task_list)
        df_dict_per_task = st.session_state["df_dict"][result_select]
        default_layout_dict = DEFAULT_LAYOUT_DICT

        task = QueryWrapper("task", "Select Task")(st.selectbox, task_list)
        if task is None:
            st.stop()
        figure_dict = fig_dict_per_task[task]
        judgename = figure_dict["judgename"]
        df = df_dict_per_task[task]
        interpretation, n_models, size_testset = number_breakdown_from_df(df)
        if st.session_state.korean:
            st.markdown(f"## 결과 ({task})")
            st.markdown(f"##### Judge 모델: {judgename} / 평가프롬: {eval_prompt_name}")
            st.markdown(f"##### 테스트셋 사이즈: {int(size_testset)} 행")
        else:
            st.markdown(f"## Results ({task})")
            st.markdown(f"##### Judge Model: {judgename} / prompt: {eval_prompt_name}")
            st.markdown(f"##### Size of Testset: {int(size_testset)} rows")
        col1, col2 = st.columns(2)
        with col1:
            with st.container(border=True):
                st.markdown(f"#### Ratings ({task})")
                st.table(figure_dict["elo_rating"])
                st.write(show_linebreak_in_md(escape_markdown(interpretation)))
        with col2:
            with st.container(border=True):
                st.plotly_chart(
                    elo_rating_by_task.update_layout(**default_layout_dict),
                    use_container_width=True,
                    key=f"{task}_elo_rating_by_task",
                )
        st.divider()
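
        # Tournament (test-scenario) browser: ◀/▶ buttons plus a QueryWrapper-wrapped
        # selectbox for picking which scenario's bracket to inspect.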
        if st.session_state.korean:
            st.markdown("### 토너먼트 (테스트 시나리오) 별로 보기")
        else:
            st.markdown("### Tournament Results by Test Scenario")
        # with st.expander("볼 토너먼트 고르기"):
        d = list(df.idx_inst_src.unique())
        default_idx = st.session_state.get("selected_tournament", None)
        cols = st.columns((1, 18, 1))
        with cols[0]:
            if st.button("◀", key="prev_tournament"):
                navigate(d, default_idx, "selected_tournament", -1)
        with cols[1]:
            tournament_prm_select = QueryWrapper("tournament", "Select Tournament")(
                st.selectbox,
                d,
                default_idx,
                key=f"{task}_tournament_select",
                on_change=lambda: st.session_state.update(
                    selected_tournament=st.session_state.get(f"{task}_tournament_select"),
                    selected_match=None,
                ),
                label_visibility="collapsed",
            )
        with cols[2]:
            if st.button("▶", key="next_tournament"):
                navigate(d, default_idx, "selected_tournament", 1)
        # tournament_prm_select = st.selectbox(
        #     "Select Tournament",
        #     df.idx_inst_src.unique(),
        #     index=d.index(st.session_state.get("selected_tournament")),
        #     key=f"{task}_tournament_{result_select}",
        # )
        # print(tournament_prm_select, type(tournament_prm_select))
        st.session_state["selected_tournament"] = tournament_prm_select
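
        # Build the tournament dataframe for the chosen scenario and draw its bracket.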
        df_now_processed = None
        if tournament_prm_select:
            df_now = df[df.idx_inst_src == tournament_prm_select]
            df_now_processed, _alpha2names = au.init_tournament_dataframe(
                df_now,
                alpha2names=st.session_state["alpha2names"]
                if "alpha2names" in st.session_state.keys()
                else None,
            )
            if "alpha2names" not in st.session_state:
                st.session_state["alpha2names"] = _alpha2names
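
            # Draw the bracket and match details; on failure, dump the processed
            # dataframe below so it can be attached to a bug report.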
            try:
                bracket_drawing = au.draw(
                    df_now_processed,
                    alpha2names=st.session_state["alpha2names"],
                )
                legend = au.make_legend_str(
                    df_now_processed, st.session_state["alpha2names"]
                )
                st.code(bracket_drawing + legend)

                m = list(df_now_processed.human_readable_idx)
                default_idx = st.session_state.get("selected_match", None)
                cols = st.columns((1, 18, 1))
                with cols[0]:
                    if st.button("◀", key="prev_match"):
                        navigate(m, default_idx, "selected_match", -1)
                with cols[1]:
                    match_idx_human = QueryWrapper("match", "Select Match")(
                        st.selectbox,
                        m,
                        default_idx,
                        key=f"{task}_match_select",
                        label_visibility="collapsed",
                    )
                with cols[2]:
                    if st.button("▶", key="next_match"):
                        navigate(m, default_idx, "selected_match", 1)
                # match_idx_human = st.selectbox(
                #     "Select Match",
                #     df_now_processed.human_readable_idx,
                #     key=f"{task}_match_{result_select}",
                # )
                # print(match_idx_human)
                st.session_state["selected_match"] = match_idx_human
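
                # Render the selected match: the evaluation prompt, the test scenario,
                # and both model outputs with the judge's verdict.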
                if match_idx_human:
                    match_idx = int(match_idx_human.split(": ")[0])
                    row = df_now_processed.loc[match_idx]

                    st.markdown("#### Current Test Scenario:")

                    with st.expander(
                        f"### Evaluation Prompt (evalprompt: {eval_prompt_name}--{task})"
                    ):
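                        # Fill the prompt template with literal placeholder strings
                        # ({inst}, {src}, ...) so the raw template itself is shown.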
                        prompt = load_prompt(eval_prompt_name, task=task)
                        kwargs = dict(
                            inst="{inst}",
                            src="{src}",
                            out_a="{out_a}",
                            out_b="{out_b}",
                            task=task,
                        )
                        if eval_prompt_name == "translation_pair":
                            kwargs["source_lang"] = "{source_lang}"
                            kwargs["target_lang"] = "{target_lang}"
                        prompt_cmpl = prompt.complete_prompt(**kwargs)
                        for msg in prompt_cmpl:
                            st.markdown(f"**{msg['role']}**")
                            st.info(show_linebreak_in_md(escape_markdown(msg["content"])))

                    st.info(show_linebreak_in_md(tournament_prm_select))
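
                    # Show both outputs side by side; the winner gets a green success
                    # box, the loser a red error box.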
                    winner = row.winner
                    col1, col2 = st.columns(2)

                    winnerbox = st.success
                    loserbox = st.error
                    with col1:
                        iswinner = winner == "model_a"
                        writemsg = winnerbox if iswinner else loserbox
                        st.markdown(f"#### ({row.model_a}) {row.human_readable_model_a}")
                        writemsg(
                            show_linebreak_in_md(row.generated_a),
                            icon="✅" if iswinner else "❌",
                        )
                    with col2:
                        iswinner = winner == "model_b"
                        writemsg = winnerbox if iswinner else loserbox
                        st.markdown(f"#### ({row.model_b}) {row.human_readable_model_b}")
                        writemsg(
                            show_linebreak_in_md(row.generated_b),
                            icon="✅" if iswinner else "❌",
                        )
            except Exception as e:
                import traceback

                traceback.print_exc()
                st.markdown(
                    "**Bug: 아래 표를 복사해서 이슈로 남겨주시면 개선에 도움이 됩니다. 감사합니다🙏**"
                    if st.session_state.korean
                    else "Bug: Please open an issue and attach the table output below to help me out. Thanks in advance.🙏"
                )
                st.error(e)
                st.info(tournament_prm_select)
                st.table(
                    df_now_processed[
                        [
                            "depth",
                            "round",
                            "winner_nodes",
                            "winner_resolved",
                            "winner",
                            "model_a",
                            "model_b",
                        ]
                    ]
                )

        st.write("Sharable link")
        st.code(f"{get_base_url()}/see_results?{QueryWrapper.get_sharable_link()}")

        st.divider()
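
        # Aggregate match statistics: per-pair win fractions and match counts.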
        if st.session_state.korean:
            st.markdown("### 매치 통계")
        else:
            st.markdown("### Match Stats.")
        col1, col2 = st.columns(2)
        with col1:
            with st.container(border=True):
                st.plotly_chart(
                    figure_dict[
                        "fraction_of_model_a_wins_for_all_a_vs_b_matches"
                    ].update_layout(autosize=True, **default_layout_dict),
                    use_container_width=True,
                    key=f"{task}_fraction_of_model_a_wins_for_all_a_vs_b_matches",
                )
        with col2:
            with st.container(border=True):
                st.plotly_chart(
                    figure_dict["match_count_of_each_combination_of_models"].update_layout(
                        autosize=True, **default_layout_dict
                    ),
                    use_container_width=True,
                    key=f"{task}_match_count_of_each_combination_of_models",
                )
        with col1:
            with st.container(border=True):
                st.plotly_chart(
                    figure_dict["match_count_for_each_model"].update_layout(
                        **default_layout_dict
                    ),
                    use_container_width=True,
                    key=f"{task}_match_count_for_each_model",
                )
        with col2:
            pass
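
        # FYI section: reference figures for known LLM-judge biases (position, length).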
        if st.session_state.korean:
            st.markdown("### 참고용 LLM Judge 편향 정보")
        else:
            st.markdown("### FYI: How biased is your LLM Judge?")

        with st.expander("펼쳐서 보기" if st.session_state.korean else "Expand to show"):
            st.info(
                """
Arena-Lite에서는 position bias의 영향을 최소화하기 위해 모든 모델이 A나 B위치에 번갈아 위치하도록 했습니다. 그러나 LLM Judge 혹은 Prompt의 성능이 부족하다고 느껴진다면, 아래 알려진 LLM Judge bias가 참고가 될 겁니다.
* position bias (왼쪽)
* length bias (오른쪽)
결과의 왜곡이 LLM Judge의 부족함 때문이었다는 점을 규명하려면 사용하신 LLM Judge와 Prompt의 binary classification 정확도를 측정해보시길 바랍니다 (Arena-Lite를 활용하여 이를 수행해볼 수 있습니다!).""".strip()
                if st.session_state.korean
                else """
In Arena-Lite, to minimize the effect of position bias, all models are placed alternately in position A or B. However, if you feel the LLM Judge or prompt performance is insufficient, the following known LLM Judge biases may be helpful to reference:
* position bias (left)
* length bias (right)
To determine whether result distortion was due to LLM Judge limitations, please measure the binary classification accuracy of your LLM Judge and prompt (you could use Arena-Lite for this purpose!).
""".strip()
            )
            st.markdown(f"#### {judgename} + prompt = {eval_prompt_name}")
            col1, col2 = st.columns(2)
            with col1:
                with st.container(border=True):
                    st.plotly_chart(
                        figure_dict["counts_of_match_winners"].update_layout(
                            **default_layout_dict
                        ),
                        use_container_width=True,
                        key=f"{task}_counts_of_match_winners",
                    )
            with col2:
                with st.container(border=True):
                    st.plotly_chart(
                        figure_dict["length_bias"].update_layout(**default_layout_dict),
                        use_container_width=True,
                        key=f"{task}_length_bias",
                    )
                    st.table(figure_dict["length_bias_df"].groupby("category").describe().T)


if __name__ == "__main__":
    main()