Pranjal2041 committed on
Commit
257090a
·
unverified ·
1 Parent(s): ed7d13f

Update Leaderboard

Browse files
__pycache__/app.cpython-311.pyc ADDED
Binary file (19.4 kB). View file
 
__pycache__/constants.cpython-311.pyc ADDED
Binary file (1.96 kB). View file
 
app.py CHANGED
@@ -1,53 +1,255 @@
1
  import gradio as gr
2
  import pandas as pd
 
 
 
3
 
 
4
 
5
- data = [
6
- ['**Baseline**', 19.7, 19.6, 19.8, 19.8, 19.8, 19.8, 19.8, 19.8, 19.8, 19.8, 19.8, "[[1]](https://arxiv.org/abs/2310.18xxx)"],
7
- ['**Keyword Stuffing**', 19.6, 19.5, 19.8, 20.8, 19.8, 20.4, 20.6, 19.9, 21.1, 21.0, 20.6, "[[1]](https://arxiv.org/abs/2310.18xxx)"],
8
- ['**Unique Words**', 20.6, 20.5, 20.7, 20.8, 20.3, 20.5, 20.9, 20.4, 21.5, 21.2, 20.9, "[[1]](https://arxiv.org/abs/2310.18xxx)"],
9
- ['**Simple Language**', 21.5, 22.0, 21.5, 21.0, 21.1, 21.2, 20.9, 20.6, 21.9, 21.4, 21.3, "[[1]](https://arxiv.org/abs/2310.18xxx)"],
10
- ['**Authoritative Language**', 21.3, 21.2, 21.1, 22.3, 22.9, 22.1, 23.2, 21.9, 23.9, 23.0, 23.1, "[[1]](https://arxiv.org/abs/2310.18xxx)"],
11
- ['**Technical Language**', 22.5, 22.4, 22.5, 21.2, 21.8, 20.5, 21.1, 20.5, 22.1, 21.2, 21.4, "[[1]](https://arxiv.org/abs/2310.18xxx)"],
12
- ['**Fluent Language**', 24.4, 24.4, 24.4, 21.3, 23.2, 21.2, 21.4, 20.8, 23.2, 21.5, 22.1, "[[1]](https://arxiv.org/abs/2310.18xxx)"],
13
- ['**Citation Addition**', 25.5, 25.3, 25.3, 22.8, 24.2, 21.7, 22.3, 21.3, 23.5, 21.7, 22.9, "[[1]](https://arxiv.org/abs/2310.18xxx)"],
14
- ['**Quotes Addition**', 27.5, 27.6, 27.1, 24.4, 26.7, 24.6, 24.9, 23.2, 26.4, 24.1, 25.5, "[[1]](https://arxiv.org/abs/2310.18xxx)"],
15
- ['**Adding Statistics**', 25.8, 26.0, 25.5, 23.1, 26.1, 23.6, 24.5, 22.4, 26.1, 23.8, 24.8, "[[1]](https://arxiv.org/abs/2310.18xxx)"]
16
- ]
17
 
18
- # Column names
19
- columns = ['Method', 'Word', 'Position', 'WordPos Overall', 'Rel.', 'Infl.', 'Unique', 'Div.', 'FollowUp', 'Pos.', 'Count', 'Subjective Average', 'Source']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- # Create a DataFrame
22
- DATA_OVERALL = pd.DataFrame(data, columns=columns)
23
- DATA_OVERALL.sort_values(by=['WordPos Overall'], inplace=True, ascending=False)
24
 
25
  with gr.Blocks() as demo:
26
- gr.Markdown(f"""
27
- # GEO-Bench Leaderboard, for benchmarking conent optimziation methods for Generative Engines.
28
- - To submit check [here](https://github.com/Pranjal2041/GEO/GEO-Bench/leaderboard/Readme.md)
29
- - Refer to GEO paper for more [details](https://arxiv.org/abs/2310.18xxx)
30
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  with gr.Tabs():
33
 
34
- with gr.TabItem('Overall'):
35
 
36
  with gr.Row():
37
  gr.Markdown('## Overall Leaderboard')
 
38
  with gr.Row():
39
  data_overall = gr.components.Dataframe(
40
- DATA_OVERALL,
41
- datatype=["markdown"] + ["number"] * len(DATA_OVERALL.columns) + ['markdown'],
42
  type="pandas",
43
  wrap=True,
44
  interactive=False,
45
  )
 
 
46
  with gr.Row():
47
- data_run_overall = gr.Button("Refresh")
48
- # data_run_overall.click(get_mteb_average, inputs=None, outputs=data_overall)
49
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
 
51
 
52
  if __name__ == "__main__":
53
  demo.launch()
 
1
  import gradio as gr
2
  import pandas as pd
3
+ import os
4
+ import itertools
5
+ from constants import metric_dict, tags, columns
6
 
7
+ # Download from github and load the data
8
 
9
+ # TODO: Download every x hours
10
def download_data(url="https://github.com/Pranjal2041/GEO/GEO-Bench/leaderboard/leaderboard.jsonl", path="leaderboard.jsonl"):
    """Download the leaderboard file with ``wget`` and move it into place.

    The file is first written to ``<path>_tmp`` and only renamed to ``path``
    after ``wget`` succeeds, so a failed download never replaces an existing
    leaderboard file.

    Args:
        url: Location of the leaderboard JSONL file.
        path: Destination path on disk.

    Returns:
        0 on success, a non-zero status code otherwise.
    """
    # Local import so the block does not depend on a new file-level import.
    import subprocess

    tmp_path = f'{path}_tmp'
    try:
        # An argument list (no shell) keeps spaces or shell metacharacters in
        # `url` / `path` from being interpreted as part of a command line.
        ret_code = subprocess.run(['wget', url, '-O', tmp_path]).returncode
    except OSError:
        # wget is missing or cannot be executed; 127 mirrors the shell's
        # "command not found" status.
        return 127
    if ret_code != 0:
        return ret_code
    try:
        os.replace(tmp_path, path)
    except OSError:
        # The download worked but the rename failed: report it as a failure
        # rather than claiming success.
        return 1
    return 0
 
 
 
 
 
16
 
17
def search_leaderboard(df, queries):
    """Return the rows of ``df`` whose ``Method`` contains any of ``queries``.

    Matching is a case-insensitive *literal* substring test, so search text
    such as ``+`` or ``(`` is not interpreted as a regular expression.

    Args:
        df: DataFrame with a string ``Method`` column.
        queries: Iterable of search strings.

    Returns:
        A DataFrame with the matching rows, duplicates removed. When
        ``queries`` is empty, an empty frame with the columns of ``df``.
    """
    matches = [
        df[df["Method"].str.contains(query, case=False, na=False, regex=False)]
        for query in queries
    ]
    if not matches:
        # pd.concat raises on an empty list; no query means no match.
        return df.iloc[0:0]
    return pd.concat(matches).drop_duplicates()
24
+
25
def search_tags_leaderboard(df, tag_blocks, queries):
    """Filter ``df`` by tag selections, then search the result by method name.

    ``tag_blocks`` is handed to ``filter_tags`` and ``queries`` to
    ``search_leaderboard``; the frame produced by the second call is returned.
    """
    tag_filtered = filter_tags(df, tag_blocks)
    return search_leaderboard(tag_filtered, queries)
27
+
28
def filter_tags(df, tag_blocks):
    """Keep the rows of ``df`` matching every tag selection, then aggregate.

    A row matches a selection when the selection contains ``'any'`` or when at
    least one of the row's tags (lower-cased) is a substring of one of the
    selected values (lower-cased). A row is kept only if it matches all
    selections in ``tag_blocks``.

    Args:
        df: DataFrame with a ``tags`` column; each cell is assumed to be an
            iterable of strings (TODO confirm against the leaderboard file).
        tag_blocks: Iterable of selections, each an iterable of strings.

    Returns:
        The result of ``prepare_complete_dt`` on the surviving rows.
    """
    def fuzzy_in(x, y_set):
        # Substring match, so a plain tag can match a label that carries
        # extra text around it.
        return any(x in z for z in y_set)

    all_tags_sets = [{tag.lower() for tag in tag_block} for tag_block in tag_blocks]

    def row_matches(row_tags):
        lowered = [tag.lower() for tag in row_tags]
        return all(
            'any' in tag_set or any(fuzzy_in(tag, tag_set) for tag in lowered)
            for tag_set in all_tags_sets
        )

    # Read the tags from the frame that was passed in, not from a module-level
    # global, so the positions line up with the `df.iloc[...]` below.
    filtered_rows = [i for i, row_tags in enumerate(df['tags']) if row_matches(row_tags)]

    return prepare_complete_dt(df.iloc[filtered_rows])
36
+
37
def prepare_complete_dt(complete_dt):
    """Aggregate per-record results into one leaderboard row per method.

    For every distinct ``Method`` the mean of each raw metric column listed in
    ``metric_dict`` is computed, and the ``source`` of the method's first
    record is kept. The result uses the display column names from ``columns``
    and is sorted by ``'WordPos Overall'`` in descending order.

    Args:
        complete_dt: DataFrame with ``Method``, ``source`` and the raw metric
            columns named by the values of ``metric_dict``.

    Returns:
        A new DataFrame with one row per method.
    """
    rows = []
    # One groupby pass instead of re-filtering the whole frame for every
    # (method, metric) pair. sort=False: the final order comes from the
    # sort_values call below.
    for method, group in complete_dt.groupby('Method', sort=False):
        row = [method]
        row.extend(group[raw_name].mean() for raw_name in metric_dict.values())
        row.append(group['source'].iloc[0])
        rows.append(row)

    overall = pd.DataFrame(rows, columns=columns)
    try:
        overall.sort_values(by=['WordPos Overall'], inplace=True, ascending=False)
    except KeyError:
        # The sort column is not among `columns`; return the rows unsorted
        # rather than failing.
        pass
    return overall
52
+
53
def format_df_for_leaderboard(df):
    """Return a display-ready copy of an aggregated leaderboard frame.

    The ``Method`` cell becomes an HTML link pointing at the row's ``source``,
    every metric column named in ``metric_dict`` is scaled by 100 and rounded
    to one decimal, and the ``source`` column is dropped.

    The input frame is left untouched, so calling this repeatedly on the same
    frame does not nest the link markup.

    Args:
        df: DataFrame with ``Method``, ``source`` and the metric columns keyed
            by ``metric_dict``.

    Returns:
        A new DataFrame without the ``source`` column.
    """
    # Copy first: the caller's frame is reused for later searches and renders.
    df_copy = df.copy()
    # The source column is embedded directly into the Method column as a link.
    df_copy['Method'] = [
        f'<a target="_blank" style="text-decoration: underline; color: #3571d7;" href="{source}">{method}</a>'
        for source, method in zip(df_copy['source'], df_copy['Method'])
    ]
    # Convert all float metrics to a percentage with 1 decimal.
    for metric in metric_dict:
        df_copy[metric] = df_copy[metric].apply(lambda x: float(f'{(100*x):.1f}'))
    # Drop the source column; it now lives inside the Method link.
    return df_copy.drop(columns=['source'])
62
+
63
+
64
# The download step is currently switched off, so the status stays at 0 and
# the local copy of the leaderboard file is used as-is.
ret_code = 0
# ret_code = download_data()
if ret_code:
    print("Leaderboard Download failed")

# Load every leaderboard record (one JSON object per line) and build the
# per-method aggregate shown in the UI.
complete_dt = pd.read_json('leaderboard.jsonl', orient='records', lines=True)
DATA_OVERALL = prepare_complete_dt(complete_dt)
71
 
 
 
 
72
 
73
with gr.Blocks() as demo:

    # Page header: title, badge links and a short description of the benchmark.
    demo_content = """
    <style>
    .badge-container {
        text-align: center;
        display: flex;
        justify-content: center;
    }
    .badge {
        margin: 1px;
    }
    </style>
    <h1 style="text-align: center;">GEO-Bench Leaderboard</h1>
    <div class="badge-container">
        <a href="https://pranjal2041.github.io/geo/" class="badge">
            <img src="https://img.shields.io/website?down_message=down&style=for-the-badge&up_message=up&url=https%3A%2F%2Fpranjal2041.github.io/geo/" alt="Website">
        </a>
        <a href="https://arxiv.org/abs/2310.18xxx" class="badge">
            <img src="https://img.shields.io/badge/arXiv-2310.18xxx-red.svg?style=for-the-badge" alt="Arxiv Paper">
        </a>
        <a href="https://huggingface.co/datasets/Pranjal2041/geo-bench" class="badge">
            <img src="https://img.shields.io/badge/Dataset-GEO-%2DBENCH-orange?style=for-the-badge" alt="Dataset">
        </a>
        <a href="https://github.com/Pranjal2041/GEO" class="badge">
            <img src="https://img.shields.io/badge/Github-Code-green?style=for-the-badge" alt="Code">
        </a>
    </div>
    <p>
    - For benchmarking content optimization Methods for Generative Engines.<br>
    - GEO-Bench evaluates Methods for optimizing website content to improve visibility in generative engine responses. Benchmark contains 10K queries across 9 datasets covering diverse domains and intents.<br>
    - Refer to GEO paper for more <a href="https://arxiv.org/abs/2310.18xxx">details</a>
    </p>
    """

    gr.HTML(demo_content)

    # format_df_for_leaderboard() drops the `source` column, so the rendered
    # table is one markdown column (Method) followed by numeric metric columns.
    leaderboard_datatypes = ["markdown"] + ["number"] * (len(DATA_OVERALL.columns) - 2)

    def split_queries(query):
        """Split the comma-separated search text into stripped terms."""
        return [x.strip() for x in query.split(',')]

    def search_button_click(query):
        """Filter the overall leaderboard by method name."""
        filtered_data = search_leaderboard(DATA_OVERALL, split_queries(query))
        return format_df_for_leaderboard(filtered_data)

    with gr.Tabs():

        with gr.TabItem('Overall 📊'):

            with gr.Row():
                gr.Markdown('## Overall Leaderboard')

            with gr.Row():
                data_overall = gr.components.Dataframe(
                    format_df_for_leaderboard(DATA_OVERALL),
                    datatype=leaderboard_datatypes,
                    type="pandas",
                    wrap=True,
                    interactive=False,
                )

            with gr.Row():
                search_bar = gr.Textbox(
                    placeholder=" 🔍 Search for your Method (separate multiple queries with `,`) and press ENTER...",
                    show_label=False,
                    elem_id="search-bar",
                )

        with gr.TabItem('Tag-Wise Results 📊'):
            with gr.Row():
                gr.Markdown("""
                ## Tag-Wise Results
                - The following table shows the results for each tag.
                - The tags are sorted in the order of their performance.
                - The table is sorted in the order of the overall score.
                """)
            with gr.Row():
                search_bar_tag = gr.Textbox(
                    placeholder=" 🔍 Search for your Method (separate multiple queries with `,`) and press ENTER...",
                    show_label=False,
                    elem_id="search-bar",
                )

            # One checkbox group per tag category, keyed by category name.
            boxes = dict()
            tag_names = list(tags.keys())

            def add_tag_filter(tag):
                """Create the checkbox group for one tag category (all values pre-selected)."""
                with gr.Box(elem_id="box-filter"):
                    boxes[tag] = gr.CheckboxGroup(
                        label=tag,
                        choices=tags[tag],
                        value=tags[tag],
                        interactive=True,
                        elem_id=f"filter-{tag}",
                    )

            # Layout: the first three categories in the left column, categories
            # five onward in the right column, and the fourth category in a
            # row of its own below them.
            with gr.Row():
                with gr.Column(min_width=320):
                    for tag in tag_names[:3]:
                        add_tag_filter(tag)
                with gr.Column(min_width=320):
                    for tag in tag_names[4:]:
                        add_tag_filter(tag)
            with gr.Row():
                add_tag_filter(tag_names[3])

            with gr.Row():
                data_tag_wise = gr.components.Dataframe(
                    format_df_for_leaderboard(DATA_OVERALL),
                    datatype=leaderboard_datatypes,
                    type="pandas",
                    wrap=True,
                    interactive=False,
                )

            def filter_tag_click(*selections):
                """Recompute the tag-wise table from the current checkbox selections."""
                return format_df_for_leaderboard(filter_tags(complete_dt, list(selections)))

            def search_tag_click(query, *selections):
                """Apply the tag filters, then the method-name search."""
                return format_df_for_leaderboard(
                    search_tags_leaderboard(complete_dt, list(selections), split_queries(query))
                )

            # Every checkbox group sends the state of all groups, so the
            # handlers always see the complete selection.
            tag_inputs = list(boxes.values())
            for box in tag_inputs:
                box.change(fn=filter_tag_click, inputs=tag_inputs, outputs=data_tag_wise)
            search_bar_tag.submit(fn=search_tag_click, inputs=[search_bar_tag] + tag_inputs, outputs=data_tag_wise)

        with gr.TabItem('About GEO-bench 📖'):
            with gr.Row():
                # The dataset link needs an explicit scheme; without it the
                # markdown link is resolved relative to the app's own URL.
                gr.Markdown("""
                ## About GEO-bench
                - GEO-bench is a benchmarking platform for content optimization Methods for generative engines.
                - It is a part of the work released under [GEO](https://arxiv.org/abs/2310.18xxx)
                - The benchmark comprises of 9 datasets, 7 of which were publicly available, while 2 have been released by us.
                - Dataset can be downloaded from [here](https://huggingface.co/datasets/pranjal2041/geo-bench)""")

            with gr.Row():
                gr.HTML("""
                <h3>Key-Highlights of GEO-bench</h3>
                <ul>
                    <li>Goal of benchmarking content optimization for generative engines</li>
                    <li>Contains 10K carefully curated queries</li>
                    <li>Queries are diverse and cover many domains/intents</li>
                    <li>Annotated with tags/dimensions like domain, difficulty, etc.</li>
                </ul>
                """)

            # Benchmark Link:
            # gr.Markdown(f"""### Benchmark Link: [GEO-bench](huggingface.co/datasets/pranjal2041/geo-bench)""")

            # TODO: Info about tags and other statistics

        with gr.TabItem('Submit 📝'):
            with gr.Row():
                # Explicit scheme for the same reason as the dataset link above.
                gr.Markdown("""
                ## Submit
                - To submit your Method, please check [here](https://github.com/Pranjal2041/GEO/GEO-Bench/leaderboard/Readme.md)""")

            # TODO: Create a form to submit, the response should be sent to a google form

    search_bar.submit(fn=search_button_click, inputs=search_bar, outputs=data_overall)

if __name__ == "__main__":
    demo.launch()
constants.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Raw metric keys found in each leaderboard record, kept for reference:
# metrics = ['relevance_detailed', 'uniqueness_detailed', 'subjcount_detailed', 'follow_detailed', 'simple_wordpos', 'simple_pos', 'influence_detailed', 'subjective_score', 'diversity_detailed', 'simple_word', 'subjpos_detailed']

# Column order of the aggregated leaderboard frame: the method name, one
# column per entry of `metric_dict` (same order), then the source link.
columns = ['Method', 'Word', 'Position', 'WordPos Overall', 'Rel.', 'Infl.', 'Unique', 'Div.', 'FollowUp', 'Pos.', 'Count', 'Subjective Average', 'source']

# Display name -> raw metric key in the leaderboard records.
metric_dict = {
    'Word': 'simple_word',
    'Position': 'simple_pos',
    'WordPos Overall': 'simple_wordpos',
    'Rel.': 'relevance_detailed',
    'Infl.': 'influence_detailed',
    'Unique': 'uniqueness_detailed',
    'Div.': 'diversity_detailed',
    'FollowUp': 'follow_detailed',
    'Pos.': 'subjpos_detailed',
    'Count': 'subjcount_detailed',
    'Subjective Average': 'subjective_score',
}

# Tag category -> selectable values for the tag filters. The trailing 'any'
# entry is the match-all choice: a selection that contains it does not
# restrict the rows.
tags = {
    "Difficulty Level": ["Simple", "Intermediate", "Complex", "Multi-faceted", "Open-ended", 'any'],
    "Nature of Query": ["Informational", "Navigational", "Transactional", "Debate", "Opinion", "Comparison", "Instructional", "Descriptive", "Predictive", 'any'],
    "Sensitivity": ["Sensitive", "Non-sensitive", 'any'],
    "Genre": [
        "🎭 Arts and Entertainment", "🚗 Autos and Vehicles", "💄 Beauty and Fitness", "📚 Books and Literature", "🏢 Business and Industrial",
        "💻 Computers and Electronics", "💰 Finance", "🍔 Food and Drink", "🎮 Games", "🏥 Health", "🎨 Hobbies and Leisure", "🏡 Home and Garden",
        "🌐 Internet and Telecom", "🎓 Jobs and Education", "🏛️ Law and Government", "📰 News", "💬 Online Communities", "👫 People and Society",
        "🐾 Pets and Animals", "🏡 Real Estate", "📚 Reference", "🔬 Science", "🛒 Shopping", "⚽ Sports", "✈️ Travel", 'any'
    ],
    "Specific Topics": ["Physics", "Chemistry", "Biology", "Mathematics", "Computer Science", "Economics", 'any'],
    "User Intent": ["🔍 Research", "💰 Purchase", "🎉 Entertainment", "📚 Learning", "🔄 Comparison", 'any'],
    "Answer Type": ["Fact", "Opinion", "List", "Explanation", "Guide", "Comparison", "Prediction", 'any'],
}
31
+
leaderboard.jsonl ADDED
The diff for this file is too large to render. See raw diff