Spaces:
Running
Running
import gradio as gr | |
import pandas as pd | |
import os | |
import itertools | |
from constants import metric_dict, tags, columns | |
# Download from github and load the data | |
# TODO: Download every x hours | |
def download_data(url = "https://github.com/Pranjal2041/GEO/GEO-Bench/leaderboard/leaderboard.jsonl", path = "leaderboard.jsonl"): | |
ret_code = os.system(f'wget {url} -O {path}_tmp') | |
if ret_code != 0: | |
return ret_code | |
os.system(f'mv {path}_tmp {path}') | |
return 0 | |
def search_leaderboard(df, queries): | |
# Assuming DATA_OVERALL is the DataFrame containing the leaderboard data | |
# filtered_data = df[df["Method"].str.contains(query, case=False, na=False)] | |
temp_pds = [] | |
for query in queries: | |
temp_pds.append(df[df["Method"].str.contains(query, case=False, na=False)]) | |
return pd.concat(temp_pds).drop_duplicates() | |
def search_tags_leaderboard(df, tag_blocks, queries): | |
return search_leaderboard(filter_tags(df, tag_blocks), queries) | |
def filter_tags(df, tag_blocks): | |
def fuzzy_in(x, y_set): | |
return any(x in z for z in y_set) | |
all_tags_sets = [set(tag.lower() for tag in tag_block) for tag_block in tag_blocks] | |
filtered_rows = [i for i, tags in enumerate(complete_dt['tags']) if all('any' in tag_set or any(fuzzy_in(tag.lower(), tag_set) for tag in tags) for tag_set in all_tags_sets)] | |
return prepare_complete_dt(df.iloc[filtered_rows]) | |
def prepare_complete_dt(complete_dt): | |
data = [] | |
DATA_OVERALL = complete_dt.copy() | |
for Method in set(complete_dt['Method']): | |
data.append([]) | |
data[-1].append(Method) | |
for metric in metric_dict: | |
metric_val = metric_dict[metric] | |
data[-1].append(complete_dt[complete_dt['Method'] == Method][metric_val].mean()) | |
data[-1].append(complete_dt[complete_dt['Method'] == Method]['source'].iloc[0]) | |
DATA_OVERALL = pd.DataFrame(data, columns=columns) | |
try: | |
DATA_OVERALL.sort_values(by=['WordPos Overall'], inplace=True, ascending=False) | |
except: ... | |
return DATA_OVERALL | |
def format_df_for_leaderboard(df): | |
# The source column needs to be embedded directly into the Method column using appropriate markdown. | |
df['Method'] = df[['source', 'Method']].apply(lambda x: f'<a target="_blank" style="text-decoration: underline; color: #3571d7;" href="{x[0]}">{x[1]}</a>', axis=1) | |
# Convert all float metrics to 1 decimal | |
df_copy = df.copy() | |
for metric in metric_dict: | |
df_copy[metric] = df_copy[metric].apply(lambda x: float(f'{(100*x):.1f}')) | |
# drop the source column | |
return df_copy.drop(columns=['source']) | |
ret_code = 0 | |
# ret_code = download_data() | |
if ret_code != 0: | |
print("Leaderboard Download failed") | |
complete_dt = pd.read_json('leaderboard.jsonl', lines=True, orient='records') | |
DATA_OVERALL = prepare_complete_dt(complete_dt) | |
with gr.Blocks() as demo: | |
demo_content = """ | |
<style> | |
.badge-container { | |
text-align: center; | |
display: flex; | |
justify-content: center; | |
} | |
.badge { | |
margin: 1px; | |
} | |
</style> | |
<h1 style="text-align: center;">GEO-Bench Leaderboard</h1> | |
<div class="badge-container"> | |
<a href="https://pranjal2041.github.io/GEO/" class="badge"> | |
<img src="https://img.shields.io/website?down_message=down&style=for-the-badge&up_message=up&url=https%3A%2F%2Fpranjal2041.github.io/GEO/" alt="Website"> | |
</a> | |
<a href="https://arxiv.org/abs/2311.09735" class="badge"> | |
<img src="https://img.shields.io/badge/arXiv-2311.09735-red.svg?style=for-the-badge" alt="Arxiv Paper"> | |
</a> | |
<a href="https://huggingface.co/datasets/GEO-optim/geo-bench" class="badge"> | |
<img src="https://img.shields.io/badge/Dataset-GEO-%2DBENCH-orange?style=for-the-badge" alt="Dataset"> | |
</a> | |
<a href="https://github.com/Pranjal2041/GEO" class="badge"> | |
<img src="https://img.shields.io/badge/Github-Code-green?style=for-the-badge" alt="Code"> | |
</a> | |
</div> | |
<p> | |
- For benchmarking content optimization Methods for Generative Engines.<br> | |
- GEO-Bench evaluates Methods for optimizing website content to improve visibility in generative engine responses. Benchmark contains 10K queries across 9 datasets covering diverse domains and intents.<br> | |
- Refer to GEO paper for more <a href="https://arxiv.org/abs/2311.09735">details</a> | |
</p> | |
""" | |
gr.HTML(demo_content) | |
with gr.Tabs(): | |
with gr.TabItem('Overall π'): | |
with gr.Row(): | |
gr.Markdown('## Overall Leaderboard') | |
with gr.Row(): | |
data_overall = gr.components.Dataframe( | |
format_df_for_leaderboard(DATA_OVERALL), | |
datatype=["markdown"] + ["number"] * (len(DATA_OVERALL.columns) - 2) + ['markdown'], | |
type="pandas", | |
wrap=True, | |
interactive=False, | |
) | |
# data_overall. | |
with gr.Row(): | |
# search_bar = gr.Textbox(type="text", label="Search for a Method:") | |
search_bar = gr.Textbox( | |
placeholder=" π Search for your Method (separate multiple queries with `,`) and press ENTER...", | |
show_label=False, | |
elem_id="search-bar", | |
) | |
def search_button_click(query): | |
filtered_data = search_leaderboard(DATA_OVERALL, [x.strip() for x in query.split(',')]) | |
return format_df_for_leaderboard(filtered_data) | |
with gr.TabItem('Tag-Wise Results π'): | |
with gr.Row(): | |
gr.Markdown(f""" | |
## Tag-Wise Results | |
- The following table shows the results for each tag. | |
- The tags are sorted in the order of their performance. | |
- The table is sorted in the order of the overall score. | |
""") | |
with gr.Row(): | |
search_bar_tag = gr.Textbox( | |
placeholder=" π Search for your Method (separate multiple queries with `,`) and press ENTER...", | |
show_label=False, | |
elem_id="search-bar", | |
) | |
def search_button_click(query): | |
filtered_data = search_leaderboard(DATA_OVERALL, [x.strip() for x in query.split(',')]) | |
return format_df_for_leaderboard(filtered_data) | |
with gr.Row(): | |
boxes = dict() | |
with gr.Column(min_width=320): | |
for tag in list(tags.keys())[:3]: | |
with gr.Box(elem_id="box-filter"): | |
boxes[tag] = gr.CheckboxGroup( | |
label=tag, | |
choices=tags[tag], | |
value=tags[tag], | |
interactive=True, | |
elem_id=f"filter-{tag}", | |
) | |
with gr.Column(min_width=320): | |
for tag in list(tags.keys())[4:]: | |
with gr.Box(elem_id="box-filter"): | |
boxes[tag] = gr.CheckboxGroup( | |
label=tag, | |
choices=tags[tag], | |
value=tags[tag], | |
interactive=True, | |
elem_id=f"filter-{tag}", | |
) | |
with gr.Row(): | |
tag = list(tags.keys())[3] | |
with gr.Box(elem_id="box-filter"): | |
boxes[tag] = gr.CheckboxGroup( | |
label=tag, | |
choices=tags[tag], | |
value=tags[tag], | |
interactive=True, | |
elem_id=f"filter-{tag}", | |
) | |
with gr.Row(): | |
data_tag_wise = gr.components.Dataframe( | |
format_df_for_leaderboard(DATA_OVERALL), | |
datatype=["markdown"] + ["number"] * (len(DATA_OVERALL.columns) - 2) + ['markdown'], | |
type="pandas", | |
wrap=True, | |
interactive=False, | |
) | |
def filter_tag_click(*boxes): | |
return format_df_for_leaderboard(filter_tags(complete_dt, list(boxes))) | |
def search_tag_click(query, *boxes): | |
return format_df_for_leaderboard(search_tags_leaderboard(complete_dt, list(boxes), [x.strip() for x in query.split(',')])) | |
for box in boxes: | |
boxes[box].change(fn=filter_tag_click, inputs=list(boxes.values()), outputs=data_tag_wise) | |
search_bar_tag.submit(fn=search_tag_click, inputs=[search_bar_tag] + list(boxes.values()), outputs=data_tag_wise) | |
with gr.TabItem('About GEO-bench π'): | |
with gr.Row(): | |
gr.Markdown(f""" | |
## About GEO-bench | |
- GEO-bench is a benchmarking platform for content optimization Methods for generative engines. | |
- It is a part of the work released under [GEO](https://arxiv.org/abs/2311.09735) | |
- The benchmark comprises of 9 datasets, 7 of which were publicly available, while 2 have been released by us. | |
- Dataset can be downloaded from [here](huggingface.co/datasets/GEO-optim/geo-bench)""") | |
with gr.Row(): | |
# Goal of benchmarking content optimization for generative engines | |
# Contains 10K carefully curated queries | |
# Queries are diverse and cover many domains/intents | |
# Annotated with tags/dimensions like domain, difficulty, etc. | |
# Above list in HTML format | |
gr.HTML(f""" | |
<h3>Key-Highlights of GEO-bench</h3> | |
<ul> | |
<li>Goal of benchmarking content optimization for generative engines</li> | |
<li>Contains 10K carefully curated queries</li> | |
<li>Queries are diverse and cover many domains/intents</li> | |
<li>Annotated with tags/dimensions like domain, difficulty, etc.</li> | |
</ul> | |
""") | |
# Benchmark Link: | |
# gr.Markdown(f"""### Benchmark Link: [GEO-bench](huggingface.co/datasets/GEO-optim/geo-bench)""") | |
# Info about tags and other statistics | |
with gr.TabItem('Submit π'): | |
with gr.Row(): | |
gr.Markdown(f""" | |
## Submit | |
- To submit your Method, please check [here](github.com/Pranjal2041/GEO/GEO-Bench/leaderboard/Readme.md)""") | |
# Create a form to submit, the response should be sent to a google form | |
search_bar.submit(fn=search_button_click, inputs=search_bar, outputs=data_overall) | |
if __name__ == "__main__": | |
demo.launch(share=True) |