diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..4757fc72cdd035b666e98c371a99a93801e2da93 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.jsonl filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..095325ae0338009f1a74d6492d66f84c72aaefae
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,11 @@
+__pycache__/
+*.py[cod]
+*$py.class
+.env
+.venv
+env/
+venv/
+ENV/
+.DS_Store
+*.log
+data/data_viewer.jsonl
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..20ed019d3b9f276f29e708c3b435e545d2a36888
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,11 @@
+FROM python:3.10-slim
+
+WORKDIR /code
+
+COPY . /code/
+
+RUN pip install --no-cache-dir --upgrade pip && \
+ pip install --no-cache-dir -r requirements.txt
+
+# Default command: launch the Gradio app
+CMD ["python", "app.py"]
\ No newline at end of file
diff --git a/README.md b/README.md
index 6a0910a0db9c51b46763c29bce9e5836e8585913..6aa28a0bd2d16e1a3719b9ae7f1da8f86ce8e975 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,25 @@
----
-title: DeepResearch Bench
-emoji: 🏆
-colorFrom: green
-colorTo: red
-sdk: gradio
-sdk_version: 5.31.0
-app_file: app.py
-pinned: false
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# DeepResearch Bench
+
+**DeepResearch Bench: A Comprehensive Benchmark for Deep Research Agents**
+
+This application showcases comprehensive evaluation results for Deep Research Agents. The app includes:
+
+- 🏆 **Leaderboard** - View overall performance metrics across all evaluated models
+- 🔍 **Data Viewer** - Explore detailed results for individual research tasks
+- 📊 **Side-by-Side Comparison** - Compare different models' responses to the same research questions
+
+Visit our [project website](https://deepresearch-bench.github.io) for more information.
+
+## Citation
+```bibtex
+@article{du2025deepresearch,
+ author = {Mingxuan Du and Benfeng Xu and Chiwei Zhu and Xiaorui Wang and Zhendong Mao},
+ title = {DeepResearch Bench: A Comprehensive Benchmark for Deep Research Agents},
+ journal = {arXiv preprint},
+ year = {2025},
+}
+```
+
+## Hugging Face Space Details
+- SDK: Gradio
+- SDK Version: 3.50.0
\ No newline at end of file
diff --git a/__pycache__/create_leaderboard.cpython-38.pyc b/__pycache__/create_leaderboard.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3408c5a29d1c6283bc166e8421f6005cbc232be4
Binary files /dev/null and b/__pycache__/create_leaderboard.cpython-38.pyc differ
diff --git a/__pycache__/create_leaderboard.cpython-39.pyc b/__pycache__/create_leaderboard.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1cf01a49dc75d663c9dada97111764009bfe2e1b
Binary files /dev/null and b/__pycache__/create_leaderboard.cpython-39.pyc differ
diff --git a/__pycache__/gradio.cpython-310.pyc b/__pycache__/gradio.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3a14d699f12e8124a5a65a6c832e0edd38943945
Binary files /dev/null and b/__pycache__/gradio.cpython-310.pyc differ
diff --git a/__pycache__/gradio.cpython-39.pyc b/__pycache__/gradio.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ff2aea33cdd9d956bbcb684ce606e41faf533a6d
Binary files /dev/null and b/__pycache__/gradio.cpython-39.pyc differ
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa612e18bbb18e39e9662df90f2d194a714bd84b
--- /dev/null
+++ b/app.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Entry point for the DeepResearch Bench Hugging Face Space.
+"""
+
+from __future__ import annotations
+from create_leaderboard import demo
+
+# 在Hugging Face Space中运行
+if __name__ == "__main__":
+ demo.launch(
+ server_name="0.0.0.0", # 必须这样设置以允许外部访问
+ share=False, # HF Space 自己有分享功能,无需额外分享
+ show_api=False, # 隐藏API文档页面
+ )
\ No newline at end of file
diff --git a/create_leaderboard.py b/create_leaderboard.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c6003fe69d529d74590f76ec4f234d20015602c
--- /dev/null
+++ b/create_leaderboard.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Gradio UI – v2.1 (Leaderboard · Data Viewer · Prompt-to-Leaderboard)
+"""
+
+from __future__ import annotations
+from pathlib import Path
+import gradio as gr
+
+# ---- Tab 组件 ----
+from tabs.leaderboard_tab import create_leaderboard_tab
+from tabs.data_viewer_tab import create_data_viewer_tab
+from tabs.data_viewer_side_by_side_tab import create_data_viewer_side_by_side_tab
+
+# ---------------------------------------------------------------------------
+# UI
+# ---------------------------------------------------------------------------
+
+with gr.Blocks(title="DeepResearch Bench") as demo:
+
+ # ========= 全局 CSS(仅作用于自定义标题 & 简介) =========
+ gr.HTML("""
+
+ """)
+
+ # ========= 顶部标题 & 简介(不使用 Markdown 标题语法) =========
+ gr.HTML("""
+
+ DeepResearch Bench: A Comprehensive Benchmark for Deep Research Agents
+
+
+
+ The research aims to comprehensively evaluate the capabilities of Deep Research Agents.
+
Code |
+
Website |
+
Paper |
+
Eval Dataset |
+ Total models: 16 | Last Update: 28 May 2025
+
+ """)
+
+ # ========= 主 Tabs =========
+ with gr.Tabs():
+ create_leaderboard_tab() # 🏆 Leaderboard
+ create_data_viewer_tab() # 🔍 Data Viewer
+ create_data_viewer_side_by_side_tab()
+
+ with gr.Tab("💬Prompt-to-Leaderboard"):
+ gr.Markdown(
+ """
+🚧 **Prompt-to-Leaderboard** module not implemented yet.
+Planned: inspect how individual prompts affect overall model ranking.
+"""
+ )
+
+# ---------------------------------------------------------------------------
+# Entrypoint
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+ demo.launch()
diff --git a/data/data_viewer.jsonl b/data/data_viewer.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..5b61da5f814aa722858757c27b9b9ac5c2d9e2f6
--- /dev/null
+++ b/data/data_viewer.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7ab11f250f4ffd6bf9c74ff8dc1e68f86d7abbf4f6319164bb476177ad7bf6e
+size 28044256
diff --git a/data/leaderboard.csv b/data/leaderboard.csv
new file mode 100644
index 0000000000000000000000000000000000000000..facdb1b2f115eb9ea4627520110806a07ae91d9b
--- /dev/null
+++ b/data/leaderboard.csv
@@ -0,0 +1,17 @@
+model,overall_score,comprehensiveness,insight,instruction_following,readability,citation_accuracy,effective_citations
+gemini-2.5-pro-deepresearch,48.88,48.53,48.50,49.18,49.44,81.44,111.21
+openai-deepresearch,46.98,46.87,45.25,49.27,47.14,77.96,40.79
+perplexity-Research,42.25,40.69,39.39,46.40,44.28,90.24,31.26
+claude-3-7-sonnet-with-search,40.67,38.99,37.66,45.77,41.46,93.68,32.48
+grok-deeper-search,40.24,37.97,35.37,46.30,44.05,83.59,8.15
+perplexity-sonar-reasoning-pro,40.22,37.38,36.11,45.66,44.74,39.36,8.35
+perplexity-sonar-reasoning,40.18,37.14,36.73,45.15,44.35,48.67,11.34
+perplexity-sonar-pro,38.93,36.38,34.26,44.70,43.35,78.66,14.74
+gemini-2.5-pro-with-grounding,35.12,34.06,29.79,41.67,37.16,81.81,32.88
+gpt-4o-search-preview,35.10,31.99,27.57,43.17,41.23,88.41,4.79
+perplexity-sonar,34.54,30.95,27.51,42.33,41.60,74.42,8.67
+gpt-4.1-with-search,33.46,29.42,25.38,42.33,40.77,87.83,4.42
+gemini-2.5-flash-preview-04-17,32.39,31.63,26.73,38.82,34.48,81.92,31.08
+gpt-4o-mini-search-preview,31.55,27.38,22.64,40.67,39.91,84.98,4.95
+gpt-4.1-mini-with-search,30.26,26.05,20.75,39.65,39.33,84.58,4.35
+claude-3-5-sonnet-with-search,28.48,24.82,22.82,35.12,35.08,94.04,9.78
\ No newline at end of file
diff --git a/data/raw_data/claude-3-5-sonnet-with-search.jsonl b/data/raw_data/claude-3-5-sonnet-with-search.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..2698143a0a013e0ecd092c4b827c2c5c88391abb
--- /dev/null
+++ b/data/raw_data/claude-3-5-sonnet-with-search.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8769fc2e0cf4f059da6e34839f9df09a6fdab9e2872faa467eafa1aa42316a69
+size 505860
diff --git a/data/raw_data/claude-3-7-sonnet-with-search.jsonl b/data/raw_data/claude-3-7-sonnet-with-search.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..d9bf8dbb623cff46ddc103378447dea42545bc3a
--- /dev/null
+++ b/data/raw_data/claude-3-7-sonnet-with-search.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc16f997d3ecd09bccf6d9e756d9ad36d2834d2ed0827b8f39579b6321b98837
+size 2281964
diff --git a/data/raw_data/gemini-2.5-flash-with-grounding.jsonl b/data/raw_data/gemini-2.5-flash-with-grounding.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..3098842d987f197d817be9cc6e2f9424ddbfaf0a
--- /dev/null
+++ b/data/raw_data/gemini-2.5-flash-with-grounding.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:908295fc145ac2f8833396b56eac8913726d93288ad6b93a9c01a69cbdbbf78a
+size 1016172
diff --git a/data/raw_data/gemini-2.5-pro-deepresearch.jsonl b/data/raw_data/gemini-2.5-pro-deepresearch.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..06ebc42cb73fab6016d60139b4c774971103e52a
--- /dev/null
+++ b/data/raw_data/gemini-2.5-pro-deepresearch.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:33c5d28e76595f22fae1b0fbbe2700958bfe707dafe53f7c5842d3067ccfddef
+size 8523353
diff --git a/data/raw_data/gemini-2.5-pro-with-grounding.jsonl b/data/raw_data/gemini-2.5-pro-with-grounding.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..3a30903d7569ba5818041b83da67a1193ee7b99e
--- /dev/null
+++ b/data/raw_data/gemini-2.5-pro-with-grounding.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b409b1031cff2876fd20cd8e9fc95501f2a95ad0154d3634b6538c165373447
+size 1050267
diff --git a/data/raw_data/gpt-4.1-mini-with-search.jsonl b/data/raw_data/gpt-4.1-mini-with-search.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..547d5d473cf909ca082d8e23ffa43b22561bb415
--- /dev/null
+++ b/data/raw_data/gpt-4.1-mini-with-search.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf809806c294364bb45cb337355d360b0e5e023c8e4ffdbf9557880a02137bab
+size 463012
diff --git a/data/raw_data/gpt-4.1-with-search.jsonl b/data/raw_data/gpt-4.1-with-search.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..03ca8b1b5a43fc274c4749e82c2e175ec226cc84
--- /dev/null
+++ b/data/raw_data/gpt-4.1-with-search.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0228624b09e9d6c25c72156f4dd7f5702e3adcdd71a1f309094c2913eb50639
+size 492406
diff --git a/data/raw_data/gpt-4o-mini-search-preview.jsonl b/data/raw_data/gpt-4o-mini-search-preview.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..0634529ba7980f4609ac6a8e9ced2d03de2e849c
--- /dev/null
+++ b/data/raw_data/gpt-4o-mini-search-preview.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd49e75b1e7eb6ff40cd4c030032459d727987dd298863b488b9657ae18815a1
+size 541532
diff --git a/data/raw_data/gpt-4o-search-preview.jsonl b/data/raw_data/gpt-4o-search-preview.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..210a27c21e059df675a2fb32a496da6df1d3b475
--- /dev/null
+++ b/data/raw_data/gpt-4o-search-preview.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfb9de873345d6789197013f0cd60fb2d888957bc123447f2a8486e81c296f04
+size 565183
diff --git a/data/raw_data/grok-deeper-search.jsonl b/data/raw_data/grok-deeper-search.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..3af0f1a3a4366ea4f7f66c3972de52547d81795d
--- /dev/null
+++ b/data/raw_data/grok-deeper-search.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea6428dcf2e729d84f019c302fb3862a85cefbea08282b5ffcc5c400306ab077
+size 1149933
diff --git a/data/raw_data/openai-deepresearch.jsonl b/data/raw_data/openai-deepresearch.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..89f6322b4e7f791282100772ae2f47300509288a
--- /dev/null
+++ b/data/raw_data/openai-deepresearch.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:77d31b8ea1abd9aa8e924303451dc6a0f334f2e9d4d61ec71847c4db004ac62a
+size 6903938
diff --git a/data/raw_data/perplexity-Research.jsonl b/data/raw_data/perplexity-Research.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..14b989cca9757ec346ad21bdf63a15077a02b5af
--- /dev/null
+++ b/data/raw_data/perplexity-Research.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f27cb31cbab84f60efc3286592e84690fd117355dd84f9e4a9299108245c2a5
+size 1747979
diff --git a/data/raw_data/perplexity-sonar-pro.jsonl b/data/raw_data/perplexity-sonar-pro.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..56c0d03f64c5cd25b4907cd7254d2930afefb2d0
--- /dev/null
+++ b/data/raw_data/perplexity-sonar-pro.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d577c0a208b35eb2c0454c00c70b12759cd8a1687f730f2133d8f392c1831ee
+size 750234
diff --git a/data/raw_data/perplexity-sonar-reasoning-pro.jsonl b/data/raw_data/perplexity-sonar-reasoning-pro.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..b649b3f834897f26017bb8c2515f629b2824473c
--- /dev/null
+++ b/data/raw_data/perplexity-sonar-reasoning-pro.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e08c6c4094bbf0aa1749e7b1a45e856a6635b2df6afdf0de8eeafea99e7477fc
+size 495156
diff --git a/data/raw_data/perplexity-sonar-reasoning.jsonl b/data/raw_data/perplexity-sonar-reasoning.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..6d3dfeb5fd88942e0ba8d997512ea02d282cf980
--- /dev/null
+++ b/data/raw_data/perplexity-sonar-reasoning.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5aecbee30882b3ccd2d65470526fe48c7c016869f00593933a35e7096fe4fb74
+size 659883
diff --git a/data/raw_data/perplexity-sonar.jsonl b/data/raw_data/perplexity-sonar.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..0052be2f8957ae0cef0794674113e9035e5c428c
--- /dev/null
+++ b/data/raw_data/perplexity-sonar.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc0ef26282e404b700d56e158644f44228c49a3d5126fa12c8068e053444131e
+size 574856
diff --git a/data/raw_results/claude-3-5-sonnet-with-search.jsonl b/data/raw_results/claude-3-5-sonnet-with-search.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..6cbf4daaed0703e6af45f79f84ec8b7de9b73551
--- /dev/null
+++ b/data/raw_results/claude-3-5-sonnet-with-search.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0c47d1bab126886420bd53bb41a8905cdfb97f105711bcc2f5a27e3d53652ea
+size 1992421
diff --git a/data/raw_results/claude-3-7-sonnet-with-search.jsonl b/data/raw_results/claude-3-7-sonnet-with-search.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..4d4b9e9e111c47cc90050482a374c1c1ddfb3893
--- /dev/null
+++ b/data/raw_results/claude-3-7-sonnet-with-search.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b3a6bf74400c89d24fa47853ab034ed3696ee0694c2d190ba83c3f5dcd8a0ef
+size 2002379
diff --git a/data/raw_results/gemini-2.5-flash-with-grounding.jsonl b/data/raw_results/gemini-2.5-flash-with-grounding.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..06b805115fa66dfdac2aa885a6d5fe5d09129c37
--- /dev/null
+++ b/data/raw_results/gemini-2.5-flash-with-grounding.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:43b9f71819babb5c00f65f0dd71d707323fb803c585bd74976f49cdc34ab80aa
+size 1951481
diff --git a/data/raw_results/gemini-2.5-pro-deepresearch.jsonl b/data/raw_results/gemini-2.5-pro-deepresearch.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..d0cfc1e3352805fd32bbec4397b7945440da9337
--- /dev/null
+++ b/data/raw_results/gemini-2.5-pro-deepresearch.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac2fc53c99697e3276c98d735ed630df6fa49d2972c70a5409adc1958ecaa7b7
+size 1937730
diff --git a/data/raw_results/gemini-2.5-pro-with-grounding.jsonl b/data/raw_results/gemini-2.5-pro-with-grounding.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..64b458d9c012cecfd581bbef650f5a5fea526a8d
--- /dev/null
+++ b/data/raw_results/gemini-2.5-pro-with-grounding.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e911a18cf8b8a8207eb45584ac650e4640f79db7352055ca5e92356de37f911
+size 1944815
diff --git a/data/raw_results/gpt-4.1-mini-with-search.jsonl b/data/raw_results/gpt-4.1-mini-with-search.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..afb748e6922224b0767dd4944dc7ff0e242c118b
--- /dev/null
+++ b/data/raw_results/gpt-4.1-mini-with-search.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:948a403d12bcf6b0e3ce6664f83afeb95413684ab0b7912003ed756a4df15c5e
+size 1992345
diff --git a/data/raw_results/gpt-4.1-with-search.jsonl b/data/raw_results/gpt-4.1-with-search.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..a2b0e22d3e3e4d03bca5442dbbdf52fe402705d8
--- /dev/null
+++ b/data/raw_results/gpt-4.1-with-search.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:908a5989af337e381bf2bce6795438edd21966f313b5194f532feb1f47e5b812
+size 2090582
diff --git a/data/raw_results/gpt-4o-mini-search-preview.jsonl b/data/raw_results/gpt-4o-mini-search-preview.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..7279175fb94fb3cdb888c7285edeebf9d1967f07
--- /dev/null
+++ b/data/raw_results/gpt-4o-mini-search-preview.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4277a9a91fcdaaeff1afe948c1088095d5f01092404fcd1a62407b7a58b7906e
+size 2074673
diff --git a/data/raw_results/gpt-4o-search-preview.jsonl b/data/raw_results/gpt-4o-search-preview.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..58cae9bc4dfa1f4d9d8aff2f7f388c399c34f14b
--- /dev/null
+++ b/data/raw_results/gpt-4o-search-preview.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7adcd70d49d3b5dd6050201aa4fcd31f51288945f4a23de14432a301cbf295a7
+size 2063854
diff --git a/data/raw_results/grok-deeper-search.jsonl b/data/raw_results/grok-deeper-search.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..00b5fe934660df3f6a024886f4e51e5dfac0ed94
--- /dev/null
+++ b/data/raw_results/grok-deeper-search.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b19fb7ec93872317eae94abeb02ed9c19912057acfa82600167ca853b750f476
+size 1968989
diff --git a/data/raw_results/openai-deepresearch.jsonl b/data/raw_results/openai-deepresearch.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..603263d5b3d798707cc8950b76c5877c18943e31
--- /dev/null
+++ b/data/raw_results/openai-deepresearch.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae45c25f5b5c56a772331543e4eefe7c80e63f33b441dfe83cb4a5c830c88a35
+size 2007501
diff --git a/data/raw_results/perplexity-Research.jsonl b/data/raw_results/perplexity-Research.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..7aaab6c2c3979022f50cf4b639be8de8a36ae40e
--- /dev/null
+++ b/data/raw_results/perplexity-Research.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b7715271d17cc344873653464ae3fef884e0f3c6bec89deee347ed7a0651beb9
+size 2030483
diff --git a/data/raw_results/perplexity-sonar-pro.jsonl b/data/raw_results/perplexity-sonar-pro.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..26188e17b7dd8c314c23e1022dc93c2a4ac581fe
--- /dev/null
+++ b/data/raw_results/perplexity-sonar-pro.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a453f5b29492f684f53364121e7c79eeb81aee2737a383e2748830a4e4453afb
+size 1975770
diff --git a/data/raw_results/perplexity-sonar-reasoning-pro.jsonl b/data/raw_results/perplexity-sonar-reasoning-pro.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..de695f14d14d614715ff9b2acb6ad6fd4801f506
--- /dev/null
+++ b/data/raw_results/perplexity-sonar-reasoning-pro.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:669a4a14232c63c716de766af7be050f8712f74a6d5437cc8fa637ded39f3c40
+size 1957092
diff --git a/data/raw_results/perplexity-sonar-reasoning.jsonl b/data/raw_results/perplexity-sonar-reasoning.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..ca14b740222c630835f04801a5f485cf3fa22fb2
--- /dev/null
+++ b/data/raw_results/perplexity-sonar-reasoning.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bceb5637a9d0092af5ddcca49557a4f8f3604be9ebb430be32e820fa4d6723b3
+size 1951258
diff --git a/data/raw_results/perplexity-sonar.jsonl b/data/raw_results/perplexity-sonar.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..240df6ffd5c6e230e1e22264cd4f476fe8634b8a
--- /dev/null
+++ b/data/raw_results/perplexity-sonar.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36ecd1540447863f66bfe1a43905070f9c9b0d40de803348c3450a396df3d8fc
+size 2016838
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8cee142d5a03cf259e99fbc614a0a2a8e0531baa
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+gradio>=3.50.0
+pandas
+numpy
+plotly
+# NOTE: 'pathlib' is in the standard library since Python 3.4 — do NOT pip-install the PyPI backport
+requests
\ No newline at end of file
diff --git a/tabs/__pycache__/data_viewer_side_by_side_tab.cpython-38.pyc b/tabs/__pycache__/data_viewer_side_by_side_tab.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b4a7f5c744a2907eb63fa47661447fb2d198915
Binary files /dev/null and b/tabs/__pycache__/data_viewer_side_by_side_tab.cpython-38.pyc differ
diff --git a/tabs/__pycache__/data_viewer_side_by_side_tab.cpython-39.pyc b/tabs/__pycache__/data_viewer_side_by_side_tab.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e270d0925ce03410ce8a2c0a7068e6411961ec6
Binary files /dev/null and b/tabs/__pycache__/data_viewer_side_by_side_tab.cpython-39.pyc differ
diff --git a/tabs/__pycache__/data_viewer_tab.cpython-38.pyc b/tabs/__pycache__/data_viewer_tab.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af2f1c98ecf56a0ffa76cdc501d0042f33a14987
Binary files /dev/null and b/tabs/__pycache__/data_viewer_tab.cpython-38.pyc differ
diff --git a/tabs/__pycache__/data_viewer_tab.cpython-39.pyc b/tabs/__pycache__/data_viewer_tab.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..12635444030a641b5f94640e95a6fd08c0f53403
Binary files /dev/null and b/tabs/__pycache__/data_viewer_tab.cpython-39.pyc differ
diff --git a/tabs/__pycache__/leaderboard_tab.cpython-38.pyc b/tabs/__pycache__/leaderboard_tab.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..18bc7d60f8ff1506010e9c5d9cef2a1ff0d9ba45
Binary files /dev/null and b/tabs/__pycache__/leaderboard_tab.cpython-38.pyc differ
diff --git a/tabs/__pycache__/leaderboard_tab.cpython-39.pyc b/tabs/__pycache__/leaderboard_tab.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..023a1de7375cf91442f251f8b6e750989ef0fe16
Binary files /dev/null and b/tabs/__pycache__/leaderboard_tab.cpython-39.pyc differ
diff --git a/tabs/data_viewer_side_by_side_tab.py b/tabs/data_viewer_side_by_side_tab.py
new file mode 100644
index 0000000000000000000000000000000000000000..5684276ba85f9277aa78d47dcecd38ff95815282
--- /dev/null
+++ b/tabs/data_viewer_side_by_side_tab.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Data-Viewer Side-by-Side tab
+"""
+
+import gradio as gr
+import pandas as pd
+import json, random
+from pathlib import Path
+import re
+
+# ---------- 路径 ----------
+BASE_DIR = Path(__file__).resolve().parent.parent
+DATA_VIEWER_FILE = BASE_DIR / "data" / "data_viewer.jsonl"
+
+# ---------- 工具 (与data_viewer_tab.py共享或可复用) ----------
+def load_data_viewer_data() -> pd.DataFrame:
+ records = []
+ if DATA_VIEWER_FILE.exists():
+ for line in DATA_VIEWER_FILE.read_text(encoding="utf-8").splitlines():
+ try:
+ records.append(json.loads(line))
+ except json.JSONDecodeError:
+ continue
+ df = pd.DataFrame(records)
+ req = ["model_name", "id", "prompt", "article", "overall_score",
+ "comprehensiveness_score", "insight_score",
+ "instruction_following_score", "readability_score"]
+ if df.empty or not all(c in df.columns for c in req):
+ return pd.DataFrame(columns=req)
+ df["id"] = df["id"].astype(str)
+ return df
+
+def make_user_task_markdown(item_id, prompt):
+ return f"""### User Task 🎯
+
+**Task ID:** {item_id}
+
+**Description:** {prompt}"""
+
+def make_article_markdown(article: str) -> str:
+ if article and isinstance(article, str):
+ processed_article = re.sub(r'\n{2,}', '\n\n', article)
+ table_pattern = r'(\|[^\n]*\n(?:[\|\s\-:]+\n)?(?:\|[^\n]*\n)*)'
+ tables = []
+ def replace_table(match):
+ tables.append(match.group(1))
+ return f'__TABLE_PLACEHOLDER_{len(tables)-1}__'
+ processed_article = re.sub(table_pattern, replace_table, processed_article)
+ processed_article = re.sub(r'(?Score", overall),
+ ("Comprehen-
siveness", comprehensiveness),
+ ("Insight
Score", insight),
+ ("Instruction
Following", instruction),
+ ("Readability
Score", readability)
+ ]
+ html_items_str = ""
+ for title, score in scores_data:
+ score_value = score if score is not None else "N/A"
+ html_items_str += f"""
+
+
{title}
+
{score_value}
+
+ """
+ return f"""
+"""
+
+# ---------- 生成 Tab ----------
+def create_data_viewer_side_by_side_tab():
+ with gr.Tab("⚔️Side-by-Side Viewer"):
+ gr.HTML(
+ """"""
+ )
+
+ df = load_data_viewer_data()
+ if df.empty:
+ gr.Markdown("## ⚠️ 没有可用数据 \n请确认 `data/data_viewer.jsonl` 存在且字段齐全(包括所有分数)。")
+ return
+
+ all_models = sorted(df["model_name"].unique())
+ tasks_df = df[["id", "prompt"]].drop_duplicates().assign(id_num=lambda x: x["id"].astype(int)).sort_values("id_num")
+ task_choices = [f"{row['id']}. {row['prompt'][:60] + ('…' if len(row['prompt']) > 60 else '')}" for _, row in tasks_df.iterrows()]
+
+ init_task = random.choice(task_choices) if task_choices else None
+ init_model_a = random.choice(all_models) if all_models else None
+ init_model_b = random.choice([m for m in all_models if m != init_model_a]) if len(all_models) > 1 else None
+ if init_model_b is None and len(all_models) > 0 : init_model_b = all_models[0] # Fallback for single model case
+
+ # --- UI 组件定义 ---
+ with gr.Row():
+ task_dd = gr.Dropdown(label="Select Task", choices=task_choices, value=init_task, interactive=True)
+
+ user_task_display_md = gr.Markdown(elem_classes=["card", "scrollable-sm"]) # 统一显示任务描述
+
+ with gr.Row():
+ with gr.Column(scale=1):
+ model_a_dd = gr.Dropdown(label="Select Model A", choices=all_models, value=init_model_a, interactive=True)
+ article_a_md = gr.Markdown(elem_classes=["card", "scrollable-lg"])
+ scores_a_html = gr.HTML()
+ with gr.Column(scale=1):
+ model_b_dd = gr.Dropdown(label="Select Model B", choices=all_models, value=init_model_b, interactive=True)
+ article_b_md = gr.Markdown(elem_classes=["card", "scrollable-lg"])
+ scores_b_html = gr.HTML()
+
+ # --- 回调函数 ---
+ def fetch_side_by_side_data(selected_task_display, model_a_name, model_b_name):
+ if not selected_task_display:
+ no_task_msg = "请选择一个任务。"
+ empty_article = make_article_markdown("")
+ empty_scores = make_scores_html(None,None,None,None,None)
+ return make_user_task_markdown("--", no_task_msg), \
+ empty_article, empty_scores, \
+ empty_article, empty_scores
+
+ item_id_str = selected_task_display.split(".", 1)[0].strip()
+ task_entry = df[df["id"] == item_id_str]
+ user_task_md_content = make_user_task_markdown(item_id_str, task_entry["prompt"].iloc[0] if not task_entry.empty else "任务描述未找到。")
+
+ outputs_a = [make_article_markdown("模型A未选择或数据未找到"), make_scores_html(None,None,None,None,None)]
+ outputs_b = [make_article_markdown("模型B未选择或数据未找到"), make_scores_html(None,None,None,None,None)]
+
+ if model_a_name:
+ entry_a = df[(df["model_name"] == model_a_name) & (df["id"] == item_id_str)]
+ if not entry_a.empty:
+ outputs_a[0] = make_article_markdown(entry_a["article"].iloc[0])
+ outputs_a[1] = make_scores_html(entry_a["overall_score"].iloc[0], entry_a["comprehensiveness_score"].iloc[0],
+ entry_a["insight_score"].iloc[0], entry_a["instruction_following_score"].iloc[0],
+ entry_a["readability_score"].iloc[0])
+
+ if model_b_name:
+ entry_b = df[(df["model_name"] == model_b_name) & (df["id"] == item_id_str)]
+ if not entry_b.empty:
+ outputs_b[0] = make_article_markdown(entry_b["article"].iloc[0])
+ outputs_b[1] = make_scores_html(entry_b["overall_score"].iloc[0], entry_b["comprehensiveness_score"].iloc[0],
+ entry_b["insight_score"].iloc[0], entry_b["instruction_following_score"].iloc[0],
+ entry_b["readability_score"].iloc[0])
+
+ return user_task_md_content, outputs_a[0], outputs_a[1], outputs_b[0], outputs_b[1]
+
+ # --- 初始加载与事件绑定 ---
+ if init_task:
+ initial_data = fetch_side_by_side_data(init_task, init_model_a, init_model_b)
+ user_task_display_md.value = initial_data[0]
+ article_a_md.value = initial_data[1]
+ scores_a_html.value = initial_data[2]
+ article_b_md.value = initial_data[3]
+ scores_b_html.value = initial_data[4]
+ else:
+ no_task_msg = "请选择一个任务进行比较。"
+ user_task_display_md.value = make_user_task_markdown("--", no_task_msg)
+ article_a_md.value = make_article_markdown("")
+ scores_a_html.value = make_scores_html(None,None,None,None,None)
+ article_b_md.value = make_article_markdown("")
+ scores_b_html.value = make_scores_html(None,None,None,None,None)
+
+ task_dd.change(fetch_side_by_side_data, inputs=[task_dd, model_a_dd, model_b_dd], outputs=[user_task_display_md, article_a_md, scores_a_html, article_b_md, scores_b_html])
+ model_a_dd.change(fetch_side_by_side_data, inputs=[task_dd, model_a_dd, model_b_dd], outputs=[user_task_display_md, article_a_md, scores_a_html, article_b_md, scores_b_html])
+ model_b_dd.change(fetch_side_by_side_data, inputs=[task_dd, model_a_dd, model_b_dd], outputs=[user_task_display_md, article_a_md, scores_a_html, article_b_md, scores_b_html])
\ No newline at end of file
diff --git a/tabs/data_viewer_tab.py b/tabs/data_viewer_tab.py
new file mode 100644
index 0000000000000000000000000000000000000000..e421432c1c18364a803c86cc42e3df51540d8f35
--- /dev/null
+++ b/tabs/data_viewer_tab.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Data-Viewer tab ---- 美化·修正版
+"""
+
+import gradio as gr
+import pandas as pd
+import json, random
+from pathlib import Path
+import re # 导入re模块
+
+# ---------- 路径 ----------
+BASE_DIR = Path(__file__).resolve().parent.parent
+DATA_VIEWER_FILE = BASE_DIR / "data" / "data_viewer.jsonl"
+
+# ---------- 工具 ----------
def load_data_viewer_data() -> pd.DataFrame:
    """Load data/data_viewer.jsonl into a DataFrame.

    Lines that are not valid JSON are skipped. If the file is missing, empty,
    or any required column is absent, an empty DataFrame carrying the expected
    columns is returned so downstream code never hits a KeyError.
    """
    expected = ["model_name", "id", "prompt", "article", "overall_score",
                "comprehensiveness_score", "insight_score",
                "instruction_following_score", "readability_score"]

    rows = []
    if DATA_VIEWER_FILE.exists():
        for raw_line in DATA_VIEWER_FILE.read_text(encoding="utf-8").splitlines():
            try:
                rows.append(json.loads(raw_line))
            except json.JSONDecodeError:
                pass  # tolerate the odd malformed line

    frame = pd.DataFrame(rows)
    if frame.empty or any(col not in frame.columns for col in expected):
        # Well-formed but empty frame keeps later column access safe.
        return pd.DataFrame(columns=expected)

    # IDs are compared as strings throughout the UI.
    frame["id"] = frame["id"].astype(str)
    return frame
+
def make_user_task_markdown(item_id, prompt):
    """Render the selected task's id and description as a Markdown card."""
    parts = ["### User Task 🎯",
             f"**Task ID:** {item_id}",
             f"**Description:** {prompt}"]
    return "\n\n".join(parts)
+
def make_article_markdown(article: str) -> str:
    # NOTE(review): this block appears truncated/garbled by the patch
    # extraction -- the re.sub pattern below is cut off mid-expression and an
    # unrelated HTML fragment (a score-card template, presumably the tail of
    # make_scores_html) follows it. Restore from the original file before
    # making any code change here.
    if article and isinstance(article, str):
        # First, collapse runs of 2+ newlines down to exactly two.
        processed_article = re.sub(r'\n{2,}', '\n\n', article)

        # Protect table regions so later substitutions don't mangle them.
        table_pattern = r'(\|[^\n]*\n(?:[\|\s\-:]+\n)?(?:\|[^\n]*\n)*)'
        tables = []
        def replace_table(match):
            tables.append(match.group(1))
            return f'__TABLE_PLACEHOLDER_{len(tables)-1}__'

        processed_article = re.sub(table_pattern, replace_table, processed_article)

        # Handle list formatting: detect "* **" patterns and make sure a
        # newline precedes them.  Pattern: * **Title:** content
        processed_article = re.sub(r'(?
    {title}
    {score_value}

    """

    # Outer container styled to mimic the .card class from the main CSS block
    return f"""
"""
+
+# ---------- 生成 Tab ----------
def create_data_viewer_tab():
    """Build the "Data Viewer" tab.

    Loads data/data_viewer.jsonl, offers model/task dropdowns and renders the
    selected task prompt, the model's article and its per-dimension scores.
    All Gradio components register themselves as a side effect of being
    created inside the gr.Tab context; nothing is returned.
    """
    with gr.Tab("🔍Data Viewer"):
        # NOTE(review): the CSS payload of this gr.HTML call appears to have
        # been stripped by the patch extraction; only an empty string remains.
        gr.HTML(
            """

"""
        )

        df = load_data_viewer_data()
        if df.empty:
            # Abort tab construction with a visible warning when the backing
            # JSONL file is missing or lacks required fields.
            gr.Markdown("## ⚠️ 没有可用数据 \n请确认 `data/data_viewer.jsonl` 存在且字段齐全(包括所有分数)。")
            return

        models = sorted(df["model_name"].unique())
        # Unique (id, prompt) pairs, sorted numerically by id.
        tasks_df = (
            df[["id", "prompt"]].drop_duplicates()
            .assign(id_num=lambda x: x["id"].astype(int))
            .sort_values("id_num")
        )

        task_choices = []
        for _, row in tasks_df.iterrows():
            # Shorter preview for the first 50 tasks, longer afterwards.
            limit = 30 if int(row["id"]) <= 50 else 60
            preview = row["prompt"][:limit] + ("…" if len(row["prompt"]) > limit else "")
            task_choices.append(f"{row['id']}. {preview}")

        # Random initial selection so the page is populated on first load.
        init_model = random.choice(models) if models else None
        init_task = random.choice(task_choices) if task_choices else None

        with gr.Row():
            model_dd = gr.Dropdown(label="Select Model", choices=models, value=init_model, interactive=True)
            task_dd = gr.Dropdown(label="Select Task", choices=task_choices, value=init_task, interactive=True)

        user_md = gr.Markdown(elem_classes=["card", "scrollable-sm"])
        article_md = gr.Markdown(elem_classes=["card", "scrollable-lg"])
        scores_html = gr.HTML()  # HTML component used to display the scores

        def fetch(model, task_disp):
            # Return (user-task markdown, article markdown, scores HTML) for
            # the chosen model/task pair.
            if not model or not task_disp:
                msg = "请选择模型和任务。"
                return make_user_task_markdown("--", msg), make_article_markdown(msg), ""

            # Dropdown entries look like "<id>. <preview>".
            item_id = task_disp.split(".", 1)[0].strip()
            entry = df[(df["model_name"] == model) & (df["id"] == item_id)]
            if entry.empty:
                err = f"未找到模型 **{model}** 对应任务 **{item_id}** 的内容或分数。"
                return make_user_task_markdown(item_id, err), make_article_markdown(err), ""

            prompt = entry["prompt"].iloc[0]
            article = entry["article"].iloc[0]

            # Extract the five scores for the card row.
            overall = entry["overall_score"].iloc[0]
            comprehensiveness = entry["comprehensiveness_score"].iloc[0]
            insight = entry["insight_score"].iloc[0]
            instruction = entry["instruction_following_score"].iloc[0]
            readability = entry["readability_score"].iloc[0]

            scores_content = make_scores_html(overall, comprehensiveness, insight, instruction, readability)

            return make_user_task_markdown(item_id, prompt), make_article_markdown(article), scores_content

        # Initial render so the tab is not blank before the first event.
        if init_model and init_task:
            user_md.value, article_md.value, scores_html.value = fetch(init_model, init_task)
        else:
            user_md.value = make_user_task_markdown("--", "请选择模型和任务。")
            article_md.value = make_article_markdown("请选择模型和任务。")
            scores_html.value = ""

        model_dd.change(fetch, inputs=[model_dd, task_dd], outputs=[user_md, article_md, scores_html])
        task_dd.change(fetch, inputs=[model_dd, task_dd], outputs=[user_md, article_md, scores_html])
\ No newline at end of file
diff --git a/tabs/leaderboard_tab.py b/tabs/leaderboard_tab.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fe0876846b97fdf972dce3c6cbdded86db6cfea
--- /dev/null
+++ b/tabs/leaderboard_tab.py
@@ -0,0 +1,176 @@
+import gradio as gr
+import pandas as pd
+from pathlib import Path
+
+# 相对于主脚本的路径调整
+BASE_DIR = Path(__file__).resolve().parent.parent
+DATA_PATH = BASE_DIR / "data" / "leaderboard.csv"
+
+# 新增:用于高亮显示的常量
+HIGHLIGHT_COLOR = "#E6D8FF"
+CATEGORY_TO_HIGHLIGHT = "Deep Research Agent"
+
+# 新增:列名重命名映射
+COLUMN_RENAME_MAP = {
+ 'overall_score': 'overall',
+ 'comprehensiveness': 'comp.',
+ 'insight': 'insight',
+ 'instruction_following': 'inst.',
+ 'readability': 'read.',
+ 'citation_accuracy': 'c.acc.',
+ 'effective_citations': 'eff.c.'
+}
+
+# 模型分类映射
+MODEL_CATEGORIES = {
+ "Deep Research Agent": [
+ "gemini-2.5-pro-deepresearch",
+ "grok-deeper-search",
+ "openai-deepresearch",
+ "perplexity-Research"
+ ],
+ "LLM with Search": [
+ "claude-3-7-sonnet-with-search",
+ "perplexity-sonar-reasoning-pro",
+ "perplexity-sonar-reasoning",
+ "perplexity-sonar-pro",
+ "gemini-2.5-pro-with-grounding",
+ "gpt-4o-search-preview",
+ "perplexity-sonar",
+ "gpt-4.1-with-search",
+ "gemini-2.5-flash-preview-04-17",
+ "gpt-4o-mini-search-preview",
+ "gpt-4.1-mini-with-search",
+ "claude-3-5-sonnet-with-search"
+ ]
+}
+
def load_leaderboard() -> pd.DataFrame:
    """Read data/leaderboard.csv and attach a 'category' column.

    Raises FileNotFoundError with a generation hint when the CSV has not
    been produced yet.
    """
    if not DATA_PATH.exists():
        raise FileNotFoundError(
            f"Leaderboard file not found: {DATA_PATH}.\n"
            "→ 先运行 rank_leaderboard.py 生成 data/leaderboard.csv"
        )

    frame = pd.read_csv(DATA_PATH)
    frame.columns = [name.strip() for name in frame.columns]

    def categorize(model_name):
        # Models outside the predefined groups fall back to "Others".
        for group_name, members in MODEL_CATEGORIES.items():
            if model_name in members:
                return group_name
        return "Others"

    frame['category'] = frame['model'].apply(categorize)
    return frame
+
def make_ranked(df: pd.DataFrame) -> pd.DataFrame:
    """Sort by overall score (descending), prepend a 1-based Rank column and
    shorten the score column names for compact display."""
    ordered = (
        df.sort_values(by='overall_score', ascending=False)
          .reset_index(drop=True)
    )
    ordered.insert(0, "Rank", range(1, len(ordered) + 1))
    # Abbreviated headers (overall, comp., insight, ...) fit the table width.
    return ordered.rename(columns=COLUMN_RENAME_MAP)
+
def filter_data(search_text: str, selected_categories: list):
    """Return the ranked leaderboard restricted by model-name substring and
    by the selected category checkboxes."""
    frame = load_leaderboard()

    query = search_text.strip()
    if query:
        # Case-insensitive substring match on the model name.
        frame = frame[frame['model'].str.contains(query, case=False, na=False)]

    if selected_categories:
        frame = frame[frame['category'].isin(selected_categories)]

    return make_ranked(frame)
+
+# 新增:辅助函数用于样式化DataFrame
def _style_specific_rows(row, category_column_name='category', target_category=CATEGORY_TO_HIGHLIGHT, color=HIGHLIGHT_COLOR):
    """Per-row styler for Styler.apply(axis=1).

    Paints every cell of a row whose category matches *target_category* with
    *color*; other rows get a 'background-color: ' with an empty value, which
    browsers ignore.
    """
    is_target = row.get(category_column_name) == target_category
    cell_css = f'background-color: {color}' if is_target else 'background-color: '
    return [cell_css] * len(row)
+
def _apply_table_styling(df: pd.DataFrame) -> pd.io.formats.style.Styler:
    """Style the leaderboard table for display.

    - Highlights rows whose 'category' equals CATEGORY_TO_HIGHLIGHT.
    - Keeps the 'category' column visible.
    - Formats every numeric score column with two decimals.

    Returns a pandas Styler (a bare `.style` for an empty frame).
    """
    if df.empty:
        return df.style

    styled_df = df.copy()

    # Numeric columns are every column except the identifier ones.
    # pd.api.types.is_numeric_dtype already covers int64/float64, so the
    # previous explicit dtype whitelist was redundant and has been dropped.
    skip_columns = {'Rank', 'model', 'category'}
    numeric_columns = [
        col for col in styled_df.columns
        if col not in skip_columns and pd.api.types.is_numeric_dtype(styled_df[col])
    ]

    # Row-wise highlight of the target category.
    styler = styled_df.style.apply(
        _style_specific_rows,
        axis=1,
        category_column_name='category',
        target_category=CATEGORY_TO_HIGHLIGHT,
        color=HIGHLIGHT_COLOR
    )

    # Two-decimal formatting for all score columns.
    if numeric_columns:
        styler = styler.format({col: '{:.2f}' for col in numeric_columns})

    return styler
+
def create_leaderboard_tab():
    """Build the "Leaderboard" tab.

    Creates the search box, category filter and the styled ranking table,
    wiring both inputs so the table refreshes on any change. Returns the
    search box component.
    """
    with gr.Tab("🏆Leaderboard"):
        with gr.Row():
            search_box = gr.Textbox(
                label="Model Search",
                placeholder="Entering model name to search...",
                value=""
            )
            category_checkboxes = gr.CheckboxGroup(
                label="Model Categories",
                choices=list(MODEL_CATEGORIES.keys()),
                value=list(MODEL_CATEGORIES.keys())
            )

        # Unfiltered, ranked, styled table shown on first load.
        initial_df_raw = make_ranked(load_leaderboard())
        styled_initial_value = _apply_table_styling(initial_df_raw.copy())

        table = gr.Dataframe(
            interactive=False,
            wrap=False,
            value=styled_initial_value,
        )

        def update_display(search_text, selected_categories):
            # Re-filter, re-rank and re-style on every UI change.
            filtered_df_raw = filter_data(search_text, selected_categories)
            styled_updated_value = _apply_table_styling(filtered_df_raw.copy())
            return styled_updated_value

        search_box.change(
            fn=update_display,
            inputs=[search_box, category_checkboxes],
            outputs=table
        )
        category_checkboxes.change(
            fn=update_display,
            inputs=[search_box, category_checkboxes],
            outputs=table
        )

        # Column-abbreviation legend shown below the table.
        gr.Markdown("""
        ### Column Abbreviations
        The leaderboard uses abbreviated column names for compact display: (i) **overall** - Overall Score; (ii) **comp.** - Comprehensiveness; (iii) **insight** - Insight quality; (iv) **inst.** - Instruction Following; (v) **read.** - Readability; (vi) **c.acc.** - Citation Accuracy; (vii) **eff.c.** - Effective Citations.
        """)

    return search_box
\ No newline at end of file
diff --git a/utils/merge_raw_data.py b/utils/merge_raw_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c46dfc8762cda0ed8af5d4801a359f09860b844
--- /dev/null
+++ b/utils/merge_raw_data.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import json
+import os
+from pathlib import Path
+
+
def calculate_dimension_score(target_score, reference_score):
    """Relative score of target vs. reference, matching rank_leaderboard.py.

    Returns target / (target + reference), or 0.0 when both scores are zero
    so we never divide by zero.
    """
    denominator = target_score + reference_score
    return 0.0 if denominator == 0 else target_score / denominator
+
+
def load_scores_for_model(model_results_file_path: Path):
    """Load per-article scores for a single model.

    Reads the model's JSONL results file and returns a dict mapping the
    article id (as a string) to a dict of formatted score strings scaled to
    0-100. Returns an empty dict when the results file is missing.
    Malformed lines are reported and skipped.
    """
    scores_by_id = {}
    if not model_results_file_path.exists():
        print(f"警告: 未找到模型 {model_results_file_path.stem} 的结果文件: {model_results_file_path}")
        return scores_by_id

    print(f" 正在从 {model_results_file_path.name} 加载分数...")
    with open(model_results_file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            try:
                data = json.loads(line.strip())
                # Bug fix: the original did str(data.get('id')) first, which
                # turns a missing id into the truthy string "None", so the
                # skip branch below could never fire. Check the raw value
                # before stringifying.
                raw_id = data.get('id')
                if raw_id is None or str(raw_id) == "":
                    print(f" 警告: {model_results_file_path.name} 第 {i+1} 行缺少ID,已跳过。")
                    continue
                article_id = str(raw_id)

                # overall_score is assumed to be in the 0-1 range; scale to
                # 0-100 for display -- TODO confirm against the raw data.
                overall_score_raw = data.get('overall_score', 0.0)
                overall_score_scaled = overall_score_raw * 100

                # Per-dimension relative scores: target / (target + reference).
                comp_score_raw = calculate_dimension_score(
                    data.get('target_comprehensiveness_weighted_avg', 0),
                    data.get('reference_comprehensiveness_weighted_avg', 0)
                )
                insight_score_raw = calculate_dimension_score(
                    data.get('target_insight_weighted_avg', 0),
                    data.get('reference_insight_weighted_avg', 0)
                )
                instruction_score_raw = calculate_dimension_score(
                    data.get('target_instruction_following_weighted_avg', 0),
                    data.get('reference_instruction_following_weighted_avg', 0)
                )
                readability_score_raw = calculate_dimension_score(
                    data.get('target_readability_weighted_avg', 0),
                    data.get('reference_readability_weighted_avg', 0)
                )

                scores_by_id[article_id] = {
                    'overall_score': f"{overall_score_scaled:.2f}",
                    'comprehensiveness_score': f"{comp_score_raw * 100:.2f}",
                    'insight_score': f"{insight_score_raw * 100:.2f}",
                    'instruction_following_score': f"{instruction_score_raw * 100:.2f}",
                    'readability_score': f"{readability_score_raw * 100:.2f}"
                }
            except json.JSONDecodeError as e:
                print(f" 错误: 解析JSON时出错 (文件: {model_results_file_path.name}, 行: {i+1}): {e}")
            except Exception as e:
                print(f" 错误: 处理数据时出错 (文件: {model_results_file_path.name}, 行: {i+1}): {e}")
    print(f" 为模型 {model_results_file_path.stem} 加载了 {len(scores_by_id)}篇文章的分数")
    return scores_by_id
+
+
def merge_jsonl_files():
    """Merge raw articles with per-model scores into data/data_viewer.jsonl.

    For every model JSONL under data/raw_data, loads the matching score file
    under data/raw_results (same stem) and writes one merged record per
    article. Score fields are None when the article has no entry in the
    results file. The output file is rewritten from scratch on every run.
    """
    project_root = Path(__file__).resolve().parent.parent
    raw_data_dir = project_root / "data" / "raw_data"        # original article content
    raw_results_dir = project_root / "data" / "raw_results"  # evaluation results
    output_file = project_root / "data" / "data_viewer.jsonl"

    input_files = list(raw_data_dir.glob("*.jsonl"))
    print(f"在 {raw_data_dir} 中找到 {len(input_files)} 个模型JSONL文件")

    if not input_files:
        print("未找到任何原始数据文件,已退出。")
        return

    # Note: the final open(output_file, 'w') below already truncates any
    # previous file, so the original separate pre-clearing pass was removed.
    all_merged_data = []

    for raw_data_file in input_files:
        model_name = raw_data_file.stem
        print(f"正在处理原始数据文件: {raw_data_file.name} (模型: {model_name})")

        # Load this model's scores keyed by article id.
        model_results_file = raw_results_dir / f"{model_name}.jsonl"
        scores_for_current_model = load_scores_for_model(model_results_file)

        processed_articles_count = 0
        with open(raw_data_file, 'r', encoding='utf-8') as f_raw:
            for i, line in enumerate(f_raw):
                try:
                    article_data = json.loads(line.strip())
                    # Bug fix: check the raw id before str() -- str(None) is
                    # the truthy "None", so missing ids were never skipped.
                    raw_id = article_data.get('id')
                    if raw_id is None or str(raw_id) == "":
                        print(f" 警告: {raw_data_file.name} 第 {i+1} 行缺少ID,已跳过。")
                        continue
                    article_id = str(raw_id)

                    # Missing scores are tolerated: fields stay None below.
                    article_scores = scores_for_current_model.get(article_id, {})
                    if not article_scores:
                        print(f" 警告: 模型 {model_name} 的文章ID {article_id} 未在结果文件中找到分数。")

                    merged_item = {
                        'model_name': model_name,
                        'id': article_id,
                        'prompt': article_data.get('prompt'),
                        'article': article_data.get('article'),
                        'overall_score': article_scores.get('overall_score'),  # may be None
                        'comprehensiveness_score': article_scores.get('comprehensiveness_score'),
                        'insight_score': article_scores.get('insight_score'),
                        'instruction_following_score': article_scores.get('instruction_following_score'),
                        'readability_score': article_scores.get('readability_score')
                    }
                    all_merged_data.append(merged_item)
                    processed_articles_count += 1
                except json.JSONDecodeError as e:
                    print(f" 错误: 解析原始数据JSON时出错 (文件: {raw_data_file.name}, 行: {i+1}): {e}")
                except Exception as e:
                    print(f" 错误: 处理原始数据时出错 (文件: {raw_data_file.name}, 行: {i+1}): {e}")
        print(f" 为模型 {model_name} 处理了 {processed_articles_count} 篇文章数据。")

    # Single write of all merged records.
    with open(output_file, 'w', encoding='utf-8') as f_out:
        for item in all_merged_data:
            f_out.write(json.dumps(item, ensure_ascii=False) + '\n')

    print(f"\n成功合并并保存到: {output_file}, 共 {len(all_merged_data)} 条记录")
+
+if __name__ == "__main__":
+ merge_jsonl_files()
+ print("所有文件处理完成!")
\ No newline at end of file
diff --git a/utils/rank_leaderboard.py b/utils/rank_leaderboard.py
new file mode 100644
index 0000000000000000000000000000000000000000..86a2754e949f85cb23d98af5593d2d7beada91ce
--- /dev/null
+++ b/utils/rank_leaderboard.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import json
+import os
+import csv
+from pathlib import Path
+from collections import defaultdict
+
+
def calculate_dimension_score(target_score, reference_score):
    """Relative score of target vs. reference: target / (target + reference).

    Bug fix: returns 0.0 when both scores are zero instead of raising
    ZeroDivisionError; this also makes the helper consistent with the
    identical (guarded) function in utils/merge_raw_data.py.
    """
    denominator = target_score + reference_score
    if denominator == 0:
        return 0.0
    return target_score / denominator
+
+
def process_model_data(model_file):
    """Aggregate one model's JSONL results into average scores scaled 0-100.

    Returns a dict with keys: model, overall_score, comprehensiveness,
    insight, instruction_following, readability.

    Fixes two defects of the original:
    - the five score lists could get out of sync when an exception fired
      after overall_scores.append (appends now happen only after all five
      values are computed);
    - an empty/unparseable file crashed with ZeroDivisionError on the mean;
      it now raises ValueError, which the caller's per-file except block
      already handles by skipping the model.
    """
    model_name = model_file.stem
    print(f"正在处理模型: {model_name}")

    overall_scores = []
    comprehensiveness_scores = []
    insight_scores = []
    instruction_following_scores = []
    readability_scores = []

    with open(model_file, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line.strip())

                # Compute everything first; append only on full success so
                # the five lists always stay the same length.
                overall_score = data.get('overall_score', 0)
                comp_score = calculate_dimension_score(
                    data.get('target_comprehensiveness_weighted_avg', 0),
                    data.get('reference_comprehensiveness_weighted_avg', 0))
                insight_score = calculate_dimension_score(
                    data.get('target_insight_weighted_avg', 0),
                    data.get('reference_insight_weighted_avg', 0))
                instruction_score = calculate_dimension_score(
                    data.get('target_instruction_following_weighted_avg', 0),
                    data.get('reference_instruction_following_weighted_avg', 0))
                readability_score = calculate_dimension_score(
                    data.get('target_readability_weighted_avg', 0),
                    data.get('reference_readability_weighted_avg', 0))

                overall_scores.append(overall_score)
                comprehensiveness_scores.append(comp_score)
                insight_scores.append(insight_score)
                instruction_following_scores.append(instruction_score)
                readability_scores.append(readability_score)

            except json.JSONDecodeError as e:
                print(f"解析JSON时出错 (模型: {model_name}): {e}")
                continue
            except Exception as e:
                print(f"处理数据时出错 (模型: {model_name}): {e}")
                continue

    if not overall_scores:
        # Previously this fell through to ZeroDivisionError on the mean.
        raise ValueError(f"模型 {model_name} 的结果文件中没有可用记录")

    # Per-dimension averages, scaled to 0-100 for the leaderboard.
    avg_overall = sum(overall_scores) / len(overall_scores)
    avg_comprehensiveness = sum(comprehensiveness_scores) / len(comprehensiveness_scores)
    avg_insight = sum(insight_scores) / len(insight_scores)
    avg_instruction_following = sum(instruction_following_scores) / len(instruction_following_scores)
    avg_readability = sum(readability_scores) / len(readability_scores)
    print(f"  - 处理了 {len(overall_scores)} 条记录")
    print(f"  - 总分: {avg_overall:.4f}")

    return {
        'model': model_name,
        'overall_score': avg_overall * 100,
        'comprehensiveness': avg_comprehensiveness * 100,
        'insight': avg_insight * 100,
        'instruction_following': avg_instruction_following * 100,
        'readability': avg_readability * 100
    }
+
+
def rank_leaderboard():
    """Compute the leaderboard from data/raw_results and save it as CSV.

    Each *.jsonl file under data/raw_results is one model; files that fail
    to process are reported and skipped. Models are written to
    data/leaderboard.csv sorted by overall_score, descending, with scores
    formatted to two decimals.
    """
    project_root = Path(__file__).parent.parent
    input_dir = project_root / "data" / "raw_results"
    output_file = project_root / "data" / "leaderboard.csv"

    input_files = list(input_dir.glob("*.jsonl"))
    print(f"找到 {len(input_files)} 个模型结果文件")

    if not input_files:
        print("未找到任何JSONL文件")
        return

    # Aggregate every model; a failing file only skips that model.
    model_results = []
    for input_file in input_files:
        try:
            model_results.append(process_model_data(input_file))
        except Exception as e:
            print(f"处理文件 {input_file.name} 时出错: {e}")

    # Best overall score first.
    model_results.sort(key=lambda entry: entry['overall_score'], reverse=True)

    fieldnames = ['model', 'overall_score', 'comprehensiveness', 'insight', 'instruction_following', 'readability']
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for entry in model_results:
            row = {'model': entry['model']}
            # All score columns share the same two-decimal formatting.
            for score_field in fieldnames[1:]:
                row[score_field] = f"{entry[score_field]:.2f}"
            writer.writerow(row)

    print(f"\n排行榜已保存到: {output_file}")
    print(f"共处理了 {len(model_results)} 个模型")
+
+
+if __name__ == "__main__":
+ rank_leaderboard()
+ print("排行榜计算完成!")