diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..4757fc72cdd035b666e98c371a99a93801e2da93 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.jsonl filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..095325ae0338009f1a74d6492d66f84c72aaefae
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,11 @@
+__pycache__/
+*.py[cod]
+*$py.class
+.env
+.venv
+env/
+venv/
+ENV/
+.DS_Store
+*.log
+data/data_viewer.jsonl
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..20ed019d3b9f276f29e708c3b435e545d2a36888
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,11 @@
+FROM python:3.10-slim
+
+WORKDIR /code
+
+COPY . /code/
+
+RUN pip install --no-cache-dir --upgrade pip && \
+ pip install --no-cache-dir -r requirements.txt
+
+# Default command: launch the Gradio app
+CMD ["python", "app.py"]
\ No newline at end of file
diff --git a/README.md b/README.md
index 6a0910a0db9c51b46763c29bce9e5836e8585913..6aa28a0bd2d16e1a3719b9ae7f1da8f86ce8e975 100644
--- a/README.md
+++ b/README.md
@@ -1,12 +1,25 @@
----
-title: DeepResearch Bench
-emoji: 🏆
-colorFrom: green
-colorTo: red
-sdk: gradio
-sdk_version: 5.31.0
-app_file: app.py
-pinned: false
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# DeepResearch Bench
+
+**DeepResearch Bench: A Comprehensive Benchmark for Deep Research Agents**
+
+This application showcases comprehensive evaluation results for Deep Research Agents. The app includes:
+
+- 🏆 **Leaderboard** - View overall performance metrics across all evaluated models
+- 🔍 **Data Viewer** - Explore detailed results for individual research tasks
+- 📊 **Side-by-Side Comparison** - Compare different models' responses to the same research questions
+
+Visit our [project website](https://deepresearch-bench.github.io) for more information.
+
+## Citation
+```bibtex
+@article{du2025deepresearch,
+ author = {Mingxuan Du and Benfeng Xu and Chiwei Zhu and Xiaorui Wang and Zhendong Mao},
+ title = {DeepResearch Bench: A Comprehensive Benchmark for Deep Research Agents},
+ journal = {arXiv preprint},
+ year = {2025},
+}
+```
+
+## Hugging Face Space Details
+- SDK: Gradio
+- SDK Version: 3.50.0
\ No newline at end of file
diff --git a/__pycache__/create_leaderboard.cpython-38.pyc b/__pycache__/create_leaderboard.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3408c5a29d1c6283bc166e8421f6005cbc232be4
Binary files /dev/null and b/__pycache__/create_leaderboard.cpython-38.pyc differ
diff --git a/__pycache__/create_leaderboard.cpython-39.pyc b/__pycache__/create_leaderboard.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1cf01a49dc75d663c9dada97111764009bfe2e1b
Binary files /dev/null and b/__pycache__/create_leaderboard.cpython-39.pyc differ
diff --git a/__pycache__/gradio.cpython-310.pyc b/__pycache__/gradio.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3a14d699f12e8124a5a65a6c832e0edd38943945
Binary files /dev/null and b/__pycache__/gradio.cpython-310.pyc differ
diff --git a/__pycache__/gradio.cpython-39.pyc b/__pycache__/gradio.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ff2aea33cdd9d956bbcb684ce606e41faf533a6d
Binary files /dev/null and b/__pycache__/gradio.cpython-39.pyc differ
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa612e18bbb18e39e9662df90f2d194a714bd84b
--- /dev/null
+++ b/app.py
@@ -0,0 +1,16 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Entry point for the DeepResearch Bench Hugging Face Space.
+"""
+
+from __future__ import annotations
+from create_leaderboard import demo
+
+# 在Hugging Face Space中运行
+if __name__ == "__main__":
+ demo.launch(
+ server_name="0.0.0.0", # 必须这样设置以允许外部访问
+ share=False, # HF Space 自己有分享功能,无需额外分享
+ show_api=False, # 隐藏API文档页面
+ )
\ No newline at end of file
diff --git a/create_leaderboard.py b/create_leaderboard.py
new file mode 100644
index 0000000000000000000000000000000000000000..8c6003fe69d529d74590f76ec4f234d20015602c
--- /dev/null
+++ b/create_leaderboard.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Gradio UI – v2.1 (Leaderboard · Data Viewer · Prompt-to-Leaderboard)
+"""
+
+from __future__ import annotations
+from pathlib import Path
+import gradio as gr
+
+# ---- Tab 组件 ----
+from tabs.leaderboard_tab import create_leaderboard_tab
+from tabs.data_viewer_tab import create_data_viewer_tab
+from tabs.data_viewer_side_by_side_tab import create_data_viewer_side_by_side_tab
+
+# ---------------------------------------------------------------------------
+# UI
+# ---------------------------------------------------------------------------
+
+with gr.Blocks(title="DeepResearch Bench") as demo:
+
+ # ========= 全局 CSS(仅作用于自定义标题 & 简介) =========
+ gr.HTML("""
+
+ """)
+
+ # ========= 顶部标题 & 简介(不使用 Markdown 标题语法) =========
+ gr.HTML("""
+
+ DeepResearch Bench: A Comprehensive Benchmark for Deep Research Agents
+
+
+
+ The research aims to comprehensively evaluate the capabilities of Deep Research Agents.
+
Code |
+
Website |
+
Paper |
+
Eval Dataset |
+ Total models: 16 | Last Update: 28 May 2025
+
+ """)
+
+ # ========= 主 Tabs =========
+ with gr.Tabs():
+ create_leaderboard_tab() # 🏆 Leaderboard
+ create_data_viewer_tab() # 🔍 Data Viewer
+ create_data_viewer_side_by_side_tab()
+
+ with gr.Tab("💬Prompt-to-Leaderboard"):
+ gr.Markdown(
+ """
+🚧 **Prompt-to-Leaderboard** module not implemented yet.
+Planned: inspect how individual prompts affect overall model ranking.
+"""
+ )
+
+# ---------------------------------------------------------------------------
+# Entrypoint
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+ demo.launch()
diff --git a/data/data_viewer.jsonl b/data/data_viewer.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..5b61da5f814aa722858757c27b9b9ac5c2d9e2f6
--- /dev/null
+++ b/data/data_viewer.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7ab11f250f4ffd6bf9c74ff8dc1e68f86d7abbf4f6319164bb476177ad7bf6e
+size 28044256
diff --git a/data/leaderboard.csv b/data/leaderboard.csv
new file mode 100644
index 0000000000000000000000000000000000000000..facdb1b2f115eb9ea4627520110806a07ae91d9b
--- /dev/null
+++ b/data/leaderboard.csv
@@ -0,0 +1,17 @@
+model,overall_score,comprehensiveness,insight,instruction_following,readability,citation_accuracy,effective_citations
+gemini-2.5-pro-deepresearch,48.88,48.53,48.50,49.18,49.44,81.44,111.21
+openai-deepresearch,46.98,46.87,45.25,49.27,47.14,77.96,40.79
+perplexity-Research,42.25,40.69,39.39,46.40,44.28,90.24,31.26
+claude-3-7-sonnet-with-search,40.67,38.99,37.66,45.77,41.46,93.68,32.48
+grok-deeper-search,40.24,37.97,35.37,46.30,44.05,83.59,8.15
+perplexity-sonar-reasoning-pro,40.22,37.38,36.11,45.66,44.74,39.36,8.35
+perplexity-sonar-reasoning,40.18,37.14,36.73,45.15,44.35,48.67,11.34
+perplexity-sonar-pro,38.93,36.38,34.26,44.70,43.35,78.66,14.74
+gemini-2.5-pro-with-grounding,35.12,34.06,29.79,41.67,37.16,81.81,32.88
+gpt-4o-search-preview,35.10,31.99,27.57,43.17,41.23,88.41,4.79
+perplexity-sonar,34.54,30.95,27.51,42.33,41.60,74.42,8.67
+gpt-4.1-with-search,33.46,29.42,25.38,42.33,40.77,87.83,4.42
+gemini-2.5-flash-preview-04-17,32.39,31.63,26.73,38.82,34.48,81.92,31.08
+gpt-4o-mini-search-preview,31.55,27.38,22.64,40.67,39.91,84.98,4.95
+gpt-4.1-mini-with-search,30.26,26.05,20.75,39.65,39.33,84.58,4.35
+claude-3-5-sonnet-with-search,28.48,24.82,22.82,35.12,35.08,94.04,9.78
\ No newline at end of file
diff --git a/data/raw_data/claude-3-5-sonnet-with-search.jsonl b/data/raw_data/claude-3-5-sonnet-with-search.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..2698143a0a013e0ecd092c4b827c2c5c88391abb
--- /dev/null
+++ b/data/raw_data/claude-3-5-sonnet-with-search.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8769fc2e0cf4f059da6e34839f9df09a6fdab9e2872faa467eafa1aa42316a69
+size 505860
diff --git a/data/raw_data/claude-3-7-sonnet-with-search.jsonl b/data/raw_data/claude-3-7-sonnet-with-search.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..d9bf8dbb623cff46ddc103378447dea42545bc3a
--- /dev/null
+++ b/data/raw_data/claude-3-7-sonnet-with-search.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc16f997d3ecd09bccf6d9e756d9ad36d2834d2ed0827b8f39579b6321b98837
+size 2281964
diff --git a/data/raw_data/gemini-2.5-flash-with-grounding.jsonl b/data/raw_data/gemini-2.5-flash-with-grounding.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..3098842d987f197d817be9cc6e2f9424ddbfaf0a
--- /dev/null
+++ b/data/raw_data/gemini-2.5-flash-with-grounding.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:908295fc145ac2f8833396b56eac8913726d93288ad6b93a9c01a69cbdbbf78a
+size 1016172
diff --git a/data/raw_data/gemini-2.5-pro-deepresearch.jsonl b/data/raw_data/gemini-2.5-pro-deepresearch.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..06ebc42cb73fab6016d60139b4c774971103e52a
--- /dev/null
+++ b/data/raw_data/gemini-2.5-pro-deepresearch.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:33c5d28e76595f22fae1b0fbbe2700958bfe707dafe53f7c5842d3067ccfddef
+size 8523353
diff --git a/data/raw_data/gemini-2.5-pro-with-grounding.jsonl b/data/raw_data/gemini-2.5-pro-with-grounding.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..3a30903d7569ba5818041b83da67a1193ee7b99e
--- /dev/null
+++ b/data/raw_data/gemini-2.5-pro-with-grounding.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7b409b1031cff2876fd20cd8e9fc95501f2a95ad0154d3634b6538c165373447
+size 1050267
diff --git a/data/raw_data/gpt-4.1-mini-with-search.jsonl b/data/raw_data/gpt-4.1-mini-with-search.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..547d5d473cf909ca082d8e23ffa43b22561bb415
--- /dev/null
+++ b/data/raw_data/gpt-4.1-mini-with-search.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf809806c294364bb45cb337355d360b0e5e023c8e4ffdbf9557880a02137bab
+size 463012
diff --git a/data/raw_data/gpt-4.1-with-search.jsonl b/data/raw_data/gpt-4.1-with-search.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..03ca8b1b5a43fc274c4749e82c2e175ec226cc84
--- /dev/null
+++ b/data/raw_data/gpt-4.1-with-search.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e0228624b09e9d6c25c72156f4dd7f5702e3adcdd71a1f309094c2913eb50639
+size 492406
diff --git a/data/raw_data/gpt-4o-mini-search-preview.jsonl b/data/raw_data/gpt-4o-mini-search-preview.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..0634529ba7980f4609ac6a8e9ced2d03de2e849c
--- /dev/null
+++ b/data/raw_data/gpt-4o-mini-search-preview.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd49e75b1e7eb6ff40cd4c030032459d727987dd298863b488b9657ae18815a1
+size 541532
diff --git a/data/raw_data/gpt-4o-search-preview.jsonl b/data/raw_data/gpt-4o-search-preview.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..210a27c21e059df675a2fb32a496da6df1d3b475
--- /dev/null
+++ b/data/raw_data/gpt-4o-search-preview.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfb9de873345d6789197013f0cd60fb2d888957bc123447f2a8486e81c296f04
+size 565183
diff --git a/data/raw_data/grok-deeper-search.jsonl b/data/raw_data/grok-deeper-search.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..3af0f1a3a4366ea4f7f66c3972de52547d81795d
--- /dev/null
+++ b/data/raw_data/grok-deeper-search.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea6428dcf2e729d84f019c302fb3862a85cefbea08282b5ffcc5c400306ab077
+size 1149933
diff --git a/data/raw_data/openai-deepresearch.jsonl b/data/raw_data/openai-deepresearch.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..89f6322b4e7f791282100772ae2f47300509288a
--- /dev/null
+++ b/data/raw_data/openai-deepresearch.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:77d31b8ea1abd9aa8e924303451dc6a0f334f2e9d4d61ec71847c4db004ac62a
+size 6903938
diff --git a/data/raw_data/perplexity-Research.jsonl b/data/raw_data/perplexity-Research.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..14b989cca9757ec346ad21bdf63a15077a02b5af
--- /dev/null
+++ b/data/raw_data/perplexity-Research.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f27cb31cbab84f60efc3286592e84690fd117355dd84f9e4a9299108245c2a5
+size 1747979
diff --git a/data/raw_data/perplexity-sonar-pro.jsonl b/data/raw_data/perplexity-sonar-pro.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..56c0d03f64c5cd25b4907cd7254d2930afefb2d0
--- /dev/null
+++ b/data/raw_data/perplexity-sonar-pro.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5d577c0a208b35eb2c0454c00c70b12759cd8a1687f730f2133d8f392c1831ee
+size 750234
diff --git a/data/raw_data/perplexity-sonar-reasoning-pro.jsonl b/data/raw_data/perplexity-sonar-reasoning-pro.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..b649b3f834897f26017bb8c2515f629b2824473c
--- /dev/null
+++ b/data/raw_data/perplexity-sonar-reasoning-pro.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e08c6c4094bbf0aa1749e7b1a45e856a6635b2df6afdf0de8eeafea99e7477fc
+size 495156
diff --git a/data/raw_data/perplexity-sonar-reasoning.jsonl b/data/raw_data/perplexity-sonar-reasoning.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..6d3dfeb5fd88942e0ba8d997512ea02d282cf980
--- /dev/null
+++ b/data/raw_data/perplexity-sonar-reasoning.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5aecbee30882b3ccd2d65470526fe48c7c016869f00593933a35e7096fe4fb74
+size 659883
diff --git a/data/raw_data/perplexity-sonar.jsonl b/data/raw_data/perplexity-sonar.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..0052be2f8957ae0cef0794674113e9035e5c428c
--- /dev/null
+++ b/data/raw_data/perplexity-sonar.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dc0ef26282e404b700d56e158644f44228c49a3d5126fa12c8068e053444131e
+size 574856
diff --git a/data/raw_results/claude-3-5-sonnet-with-search.jsonl b/data/raw_results/claude-3-5-sonnet-with-search.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..6cbf4daaed0703e6af45f79f84ec8b7de9b73551
--- /dev/null
+++ b/data/raw_results/claude-3-5-sonnet-with-search.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0c47d1bab126886420bd53bb41a8905cdfb97f105711bcc2f5a27e3d53652ea
+size 1992421
diff --git a/data/raw_results/claude-3-7-sonnet-with-search.jsonl b/data/raw_results/claude-3-7-sonnet-with-search.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..4d4b9e9e111c47cc90050482a374c1c1ddfb3893
--- /dev/null
+++ b/data/raw_results/claude-3-7-sonnet-with-search.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2b3a6bf74400c89d24fa47853ab034ed3696ee0694c2d190ba83c3f5dcd8a0ef
+size 2002379
diff --git a/data/raw_results/gemini-2.5-flash-with-grounding.jsonl b/data/raw_results/gemini-2.5-flash-with-grounding.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..06b805115fa66dfdac2aa885a6d5fe5d09129c37
--- /dev/null
+++ b/data/raw_results/gemini-2.5-flash-with-grounding.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:43b9f71819babb5c00f65f0dd71d707323fb803c585bd74976f49cdc34ab80aa
+size 1951481
diff --git a/data/raw_results/gemini-2.5-pro-deepresearch.jsonl b/data/raw_results/gemini-2.5-pro-deepresearch.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..d0cfc1e3352805fd32bbec4397b7945440da9337
--- /dev/null
+++ b/data/raw_results/gemini-2.5-pro-deepresearch.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac2fc53c99697e3276c98d735ed630df6fa49d2972c70a5409adc1958ecaa7b7
+size 1937730
diff --git a/data/raw_results/gemini-2.5-pro-with-grounding.jsonl b/data/raw_results/gemini-2.5-pro-with-grounding.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..64b458d9c012cecfd581bbef650f5a5fea526a8d
--- /dev/null
+++ b/data/raw_results/gemini-2.5-pro-with-grounding.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5e911a18cf8b8a8207eb45584ac650e4640f79db7352055ca5e92356de37f911
+size 1944815
diff --git a/data/raw_results/gpt-4.1-mini-with-search.jsonl b/data/raw_results/gpt-4.1-mini-with-search.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..afb748e6922224b0767dd4944dc7ff0e242c118b
--- /dev/null
+++ b/data/raw_results/gpt-4.1-mini-with-search.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:948a403d12bcf6b0e3ce6664f83afeb95413684ab0b7912003ed756a4df15c5e
+size 1992345
diff --git a/data/raw_results/gpt-4.1-with-search.jsonl b/data/raw_results/gpt-4.1-with-search.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..a2b0e22d3e3e4d03bca5442dbbdf52fe402705d8
--- /dev/null
+++ b/data/raw_results/gpt-4.1-with-search.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:908a5989af337e381bf2bce6795438edd21966f313b5194f532feb1f47e5b812
+size 2090582
diff --git a/data/raw_results/gpt-4o-mini-search-preview.jsonl b/data/raw_results/gpt-4o-mini-search-preview.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..7279175fb94fb3cdb888c7285edeebf9d1967f07
--- /dev/null
+++ b/data/raw_results/gpt-4o-mini-search-preview.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4277a9a91fcdaaeff1afe948c1088095d5f01092404fcd1a62407b7a58b7906e
+size 2074673
diff --git a/data/raw_results/gpt-4o-search-preview.jsonl b/data/raw_results/gpt-4o-search-preview.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..58cae9bc4dfa1f4d9d8aff2f7f388c399c34f14b
--- /dev/null
+++ b/data/raw_results/gpt-4o-search-preview.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7adcd70d49d3b5dd6050201aa4fcd31f51288945f4a23de14432a301cbf295a7
+size 2063854
diff --git a/data/raw_results/grok-deeper-search.jsonl b/data/raw_results/grok-deeper-search.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..00b5fe934660df3f6a024886f4e51e5dfac0ed94
--- /dev/null
+++ b/data/raw_results/grok-deeper-search.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b19fb7ec93872317eae94abeb02ed9c19912057acfa82600167ca853b750f476
+size 1968989
diff --git a/data/raw_results/openai-deepresearch.jsonl b/data/raw_results/openai-deepresearch.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..603263d5b3d798707cc8950b76c5877c18943e31
--- /dev/null
+++ b/data/raw_results/openai-deepresearch.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae45c25f5b5c56a772331543e4eefe7c80e63f33b441dfe83cb4a5c830c88a35
+size 2007501
diff --git a/data/raw_results/perplexity-Research.jsonl b/data/raw_results/perplexity-Research.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..7aaab6c2c3979022f50cf4b639be8de8a36ae40e
--- /dev/null
+++ b/data/raw_results/perplexity-Research.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b7715271d17cc344873653464ae3fef884e0f3c6bec89deee347ed7a0651beb9
+size 2030483
diff --git a/data/raw_results/perplexity-sonar-pro.jsonl b/data/raw_results/perplexity-sonar-pro.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..26188e17b7dd8c314c23e1022dc93c2a4ac581fe
--- /dev/null
+++ b/data/raw_results/perplexity-sonar-pro.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a453f5b29492f684f53364121e7c79eeb81aee2737a383e2748830a4e4453afb
+size 1975770
diff --git a/data/raw_results/perplexity-sonar-reasoning-pro.jsonl b/data/raw_results/perplexity-sonar-reasoning-pro.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..de695f14d14d614715ff9b2acb6ad6fd4801f506
--- /dev/null
+++ b/data/raw_results/perplexity-sonar-reasoning-pro.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:669a4a14232c63c716de766af7be050f8712f74a6d5437cc8fa637ded39f3c40
+size 1957092
diff --git a/data/raw_results/perplexity-sonar-reasoning.jsonl b/data/raw_results/perplexity-sonar-reasoning.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..ca14b740222c630835f04801a5f485cf3fa22fb2
--- /dev/null
+++ b/data/raw_results/perplexity-sonar-reasoning.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bceb5637a9d0092af5ddcca49557a4f8f3604be9ebb430be32e820fa4d6723b3
+size 1951258
diff --git a/data/raw_results/perplexity-sonar.jsonl b/data/raw_results/perplexity-sonar.jsonl
new file mode 100644
index 0000000000000000000000000000000000000000..240df6ffd5c6e230e1e22264cd4f476fe8634b8a
--- /dev/null
+++ b/data/raw_results/perplexity-sonar.jsonl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:36ecd1540447863f66bfe1a43905070f9c9b0d40de803348c3450a396df3d8fc
+size 2016838
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8cee142d5a03cf259e99fbc614a0a2a8e0531baa
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+gradio>=3.50.0
+pandas
+numpy
+plotly
+# NOTE: 'pathlib' is in the standard library since Python 3.4 — do NOT pip-install the PyPI backport
+requests
\ No newline at end of file
diff --git a/tabs/__pycache__/data_viewer_side_by_side_tab.cpython-38.pyc b/tabs/__pycache__/data_viewer_side_by_side_tab.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b4a7f5c744a2907eb63fa47661447fb2d198915
Binary files /dev/null and b/tabs/__pycache__/data_viewer_side_by_side_tab.cpython-38.pyc differ
diff --git a/tabs/__pycache__/data_viewer_side_by_side_tab.cpython-39.pyc b/tabs/__pycache__/data_viewer_side_by_side_tab.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e270d0925ce03410ce8a2c0a7068e6411961ec6
Binary files /dev/null and b/tabs/__pycache__/data_viewer_side_by_side_tab.cpython-39.pyc differ
diff --git a/tabs/__pycache__/data_viewer_tab.cpython-38.pyc b/tabs/__pycache__/data_viewer_tab.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af2f1c98ecf56a0ffa76cdc501d0042f33a14987
Binary files /dev/null and b/tabs/__pycache__/data_viewer_tab.cpython-38.pyc differ
diff --git a/tabs/__pycache__/data_viewer_tab.cpython-39.pyc b/tabs/__pycache__/data_viewer_tab.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..12635444030a641b5f94640e95a6fd08c0f53403
Binary files /dev/null and b/tabs/__pycache__/data_viewer_tab.cpython-39.pyc differ
diff --git a/tabs/__pycache__/leaderboard_tab.cpython-38.pyc b/tabs/__pycache__/leaderboard_tab.cpython-38.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..18bc7d60f8ff1506010e9c5d9cef2a1ff0d9ba45
Binary files /dev/null and b/tabs/__pycache__/leaderboard_tab.cpython-38.pyc differ
diff --git a/tabs/__pycache__/leaderboard_tab.cpython-39.pyc b/tabs/__pycache__/leaderboard_tab.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..023a1de7375cf91442f251f8b6e750989ef0fe16
Binary files /dev/null and b/tabs/__pycache__/leaderboard_tab.cpython-39.pyc differ
diff --git a/tabs/data_viewer_side_by_side_tab.py b/tabs/data_viewer_side_by_side_tab.py
new file mode 100644
index 0000000000000000000000000000000000000000..5684276ba85f9277aa78d47dcecd38ff95815282
--- /dev/null
+++ b/tabs/data_viewer_side_by_side_tab.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Data-Viewer Side-by-Side tab
+"""
+
+import gradio as gr
+import pandas as pd
+import json, random
+from pathlib import Path
+import re
+
+# ---------- 路径 ----------
+BASE_DIR = Path(__file__).resolve().parent.parent
+DATA_VIEWER_FILE = BASE_DIR / "data" / "data_viewer.jsonl"
+
+# ---------- 工具 (与data_viewer_tab.py共享或可复用) ----------
+def load_data_viewer_data() -> pd.DataFrame:
+ records = []
+ if DATA_VIEWER_FILE.exists():
+ for line in DATA_VIEWER_FILE.read_text(encoding="utf-8").splitlines():
+ try:
+ records.append(json.loads(line))
+ except json.JSONDecodeError:
+ continue
+ df = pd.DataFrame(records)
+ req = ["model_name", "id", "prompt", "article", "overall_score",
+ "comprehensiveness_score", "insight_score",
+ "instruction_following_score", "readability_score"]
+ if df.empty or not all(c in df.columns for c in req):
+ return pd.DataFrame(columns=req)
+ df["id"] = df["id"].astype(str)
+ return df
+
+def make_user_task_markdown(item_id, prompt):
+ return f"""### User Task 🎯
+
+**Task ID:** {item_id}
+
+**Description:** {prompt}"""
+
+def make_article_markdown(article: str) -> str:
+ if article and isinstance(article, str):
+ processed_article = re.sub(r'\n{2,}', '\n\n', article)
+ table_pattern = r'(\|[^\n]*\n(?:[\|\s\-:]+\n)?(?:\|[^\n]*\n)*)'
+ tables = []
+ def replace_table(match):
+ tables.append(match.group(1))
+ return f'__TABLE_PLACEHOLDER_{len(tables)-1}__'
+ processed_article = re.sub(table_pattern, replace_table, processed_article)
+ processed_article = re.sub(r'(?Score", overall),
+ ("Comprehen-
siveness", comprehensiveness),
+ ("Insight
Score", insight),
+ ("Instruction
Following", instruction),
+ ("Readability
Score", readability)
+ ]
+ html_items_str = ""
+ for title, score in scores_data:
+ score_value = score if score is not None else "N/A"
+ html_items_str += f"""
+
+
{title}
+
{score_value}
+
+ """
+ return f"""
+"""
+
+# ---------- 生成 Tab ----------
+def create_data_viewer_side_by_side_tab():
+ with gr.Tab("⚔️Side-by-Side Viewer"):
+ gr.HTML(
+ """"""
+ )
+
+ df = load_data_viewer_data()
+ if df.empty:
+ gr.Markdown("## ⚠️ 没有可用数据 \n请确认 `data/data_viewer.jsonl` 存在且字段齐全(包括所有分数)。")
+ return
+
+ all_models = sorted(df["model_name"].unique())
+ tasks_df = df[["id", "prompt"]].drop_duplicates().assign(id_num=lambda x: x["id"].astype(int)).sort_values("id_num")
+ task_choices = [f"{row['id']}. {row['prompt'][:60] + ('…' if len(row['prompt']) > 60 else '')}" for _, row in tasks_df.iterrows()]
+
+ init_task = random.choice(task_choices) if task_choices else None
+ init_model_a = random.choice(all_models) if all_models else None
+ init_model_b = random.choice([m for m in all_models if m != init_model_a]) if len(all_models) > 1 else None
+ if init_model_b is None and len(all_models) > 0 : init_model_b = all_models[0] # Fallback for single model case
+
+ # --- UI 组件定义 ---
+ with gr.Row():
+ task_dd = gr.Dropdown(label="Select Task", choices=task_choices, value=init_task, interactive=True)
+
+ user_task_display_md = gr.Markdown(elem_classes=["card", "scrollable-sm"]) # 统一显示任务描述
+
+ with gr.Row():
+ with gr.Column(scale=1):
+ model_a_dd = gr.Dropdown(label="Select Model A", choices=all_models, value=init_model_a, interactive=True)
+ article_a_md = gr.Markdown(elem_classes=["card", "scrollable-lg"])
+ scores_a_html = gr.HTML()
+ with gr.Column(scale=1):
+ model_b_dd = gr.Dropdown(label="Select Model B", choices=all_models, value=init_model_b, interactive=True)
+ article_b_md = gr.Markdown(elem_classes=["card", "scrollable-lg"])
+ scores_b_html = gr.HTML()
+
+ # --- 回调函数 ---
+ def fetch_side_by_side_data(selected_task_display, model_a_name, model_b_name):
+ if not selected_task_display:
+ no_task_msg = "请选择一个任务。"
+ empty_article = make_article_markdown("")
+ empty_scores = make_scores_html(None,None,None,None,None)
+ return make_user_task_markdown("--", no_task_msg), \
+ empty_article, empty_scores, \
+ empty_article, empty_scores
+
+ item_id_str = selected_task_display.split(".", 1)[0].strip()
+ task_entry = df[df["id"] == item_id_str]
+ user_task_md_content = make_user_task_markdown(item_id_str, task_entry["prompt"].iloc[0] if not task_entry.empty else "任务描述未找到。")
+
+ outputs_a = [make_article_markdown("模型A未选择或数据未找到"), make_scores_html(None,None,None,None,None)]
+ outputs_b = [make_article_markdown("模型B未选择或数据未找到"), make_scores_html(None,None,None,None,None)]
+
+ if model_a_name:
+ entry_a = df[(df["model_name"] == model_a_name) & (df["id"] == item_id_str)]
+ if not entry_a.empty:
+ outputs_a[0] = make_article_markdown(entry_a["article"].iloc[0])
+ outputs_a[1] = make_scores_html(entry_a["overall_score"].iloc[0], entry_a["comprehensiveness_score"].iloc[0],
+ entry_a["insight_score"].iloc[0], entry_a["instruction_following_score"].iloc[0],
+ entry_a["readability_score"].iloc[0])
+
+ if model_b_name:
+ entry_b = df[(df["model_name"] == model_b_name) & (df["id"] == item_id_str)]
+ if not entry_b.empty:
+ outputs_b[0] = make_article_markdown(entry_b["article"].iloc[0])
+ outputs_b[1] = make_scores_html(entry_b["overall_score"].iloc[0], entry_b["comprehensiveness_score"].iloc[0],
+ entry_b["insight_score"].iloc[0], entry_b["instruction_following_score"].iloc[0],
+ entry_b["readability_score"].iloc[0])
+
+ return user_task_md_content, outputs_a[0], outputs_a[1], outputs_b[0], outputs_b[1]
+
+ # --- 初始加载与事件绑定 ---
+ if init_task:
+ initial_data = fetch_side_by_side_data(init_task, init_model_a, init_model_b)
+ user_task_display_md.value = initial_data[0]
+ article_a_md.value = initial_data[1]
+ scores_a_html.value = initial_data[2]
+ article_b_md.value = initial_data[3]
+ scores_b_html.value = initial_data[4]
+ else:
+ no_task_msg = "请选择一个任务进行比较。"
+ user_task_display_md.value = make_user_task_markdown("--", no_task_msg)
+ article_a_md.value = make_article_markdown("")
+ scores_a_html.value = make_scores_html(None,None,None,None,None)
+ article_b_md.value = make_article_markdown("")
+ scores_b_html.value = make_scores_html(None,None,None,None,None)
+
+ task_dd.change(fetch_side_by_side_data, inputs=[task_dd, model_a_dd, model_b_dd], outputs=[user_task_display_md, article_a_md, scores_a_html, article_b_md, scores_b_html])
+ model_a_dd.change(fetch_side_by_side_data, inputs=[task_dd, model_a_dd, model_b_dd], outputs=[user_task_display_md, article_a_md, scores_a_html, article_b_md, scores_b_html])
+ model_b_dd.change(fetch_side_by_side_data, inputs=[task_dd, model_a_dd, model_b_dd], outputs=[user_task_display_md, article_a_md, scores_a_html, article_b_md, scores_b_html])
\ No newline at end of file
diff --git a/tabs/data_viewer_tab.py b/tabs/data_viewer_tab.py
new file mode 100644
index 0000000000000000000000000000000000000000..e421432c1c18364a803c86cc42e3df51540d8f35
--- /dev/null
+++ b/tabs/data_viewer_tab.py
@@ -0,0 +1,211 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Data-Viewer tab ---- 美化·修正版
+"""
+
+import gradio as gr
+import pandas as pd
+import json, random
+from pathlib import Path
+import re # 导入re模块
+
+# ---------- 路径 ----------
+BASE_DIR = Path(__file__).resolve().parent.parent
+DATA_VIEWER_FILE = BASE_DIR / "data" / "data_viewer.jsonl"
+
+# ---------- 工具 ----------
def load_data_viewer_data() -> pd.DataFrame:
    """Load data/data_viewer.jsonl into a DataFrame.

    Lines that are not valid JSON are skipped. If the file is missing, empty,
    or any required column is absent, an empty DataFrame carrying the expected
    columns is returned so downstream code never hits a KeyError.
    """
    expected = ["model_name", "id", "prompt", "article", "overall_score",
                "comprehensiveness_score", "insight_score",
                "instruction_following_score", "readability_score"]

    rows = []
    if DATA_VIEWER_FILE.exists():
        for raw_line in DATA_VIEWER_FILE.read_text(encoding="utf-8").splitlines():
            try:
                rows.append(json.loads(raw_line))
            except json.JSONDecodeError:
                pass  # tolerate the odd malformed line

    frame = pd.DataFrame(rows)
    if frame.empty or any(col not in frame.columns for col in expected):
        # Well-formed but empty frame keeps later column access safe.
        return pd.DataFrame(columns=expected)

    # IDs are compared as strings throughout the UI.
    frame["id"] = frame["id"].astype(str)
    return frame
+
def make_user_task_markdown(item_id, prompt):
    """Render the selected task's id and description as a Markdown card."""
    parts = ["### User Task 🎯",
             f"**Task ID:** {item_id}",
             f"**Description:** {prompt}"]
    return "\n\n".join(parts)
+
def make_article_markdown(article: str) -> str:
    # NOTE(review): this block appears truncated/garbled by the patch
    # extraction -- the re.sub pattern below is cut off mid-expression and an
    # unrelated HTML fragment (a score-card template, presumably the tail of
    # make_scores_html) follows it. Restore from the original file before
    # making any code change here.
    if article and isinstance(article, str):
        # First, collapse runs of 2+ newlines down to exactly two.
        processed_article = re.sub(r'\n{2,}', '\n\n', article)

        # Protect table regions so later substitutions don't mangle them.
        table_pattern = r'(\|[^\n]*\n(?:[\|\s\-:]+\n)?(?:\|[^\n]*\n)*)'
        tables = []
        def replace_table(match):
            tables.append(match.group(1))
            return f'__TABLE_PLACEHOLDER_{len(tables)-1}__'

        processed_article = re.sub(table_pattern, replace_table, processed_article)

        # Handle list formatting: detect "* **" patterns and make sure a
        # newline precedes them.  Pattern: * **Title:** content
        processed_article = re.sub(r'(?
    {title}
    {score_value}

    """

    # Outer container styled to mimic the .card class from the main CSS block
    return f"""
"""
+
+# ---------- 生成 Tab ----------
def create_data_viewer_tab():
    """Build the "Data Viewer" tab.

    Loads data/data_viewer.jsonl, offers model/task dropdowns and renders the
    selected task prompt, the model's article and its per-dimension scores.
    All Gradio components register themselves as a side effect of being
    created inside the gr.Tab context; nothing is returned.
    """
    with gr.Tab("🔍Data Viewer"):
        # NOTE(review): the CSS payload of this gr.HTML call appears to have
        # been stripped by the patch extraction; only an empty string remains.
        gr.HTML(
            """

"""
        )

        df = load_data_viewer_data()
        if df.empty:
            # Abort tab construction with a visible warning when the backing
            # JSONL file is missing or lacks required fields.
            gr.Markdown("## ⚠️ 没有可用数据 \n请确认 `data/data_viewer.jsonl` 存在且字段齐全(包括所有分数)。")
            return

        models = sorted(df["model_name"].unique())
        # Unique (id, prompt) pairs, sorted numerically by id.
        tasks_df = (
            df[["id", "prompt"]].drop_duplicates()
            .assign(id_num=lambda x: x["id"].astype(int))
            .sort_values("id_num")
        )

        task_choices = []
        for _, row in tasks_df.iterrows():
            # Shorter preview for the first 50 tasks, longer afterwards.
            limit = 30 if int(row["id"]) <= 50 else 60
            preview = row["prompt"][:limit] + ("…" if len(row["prompt"]) > limit else "")
            task_choices.append(f"{row['id']}. {preview}")

        # Random initial selection so the page is populated on first load.
        init_model = random.choice(models) if models else None
        init_task = random.choice(task_choices) if task_choices else None

        with gr.Row():
            model_dd = gr.Dropdown(label="Select Model", choices=models, value=init_model, interactive=True)
            task_dd = gr.Dropdown(label="Select Task", choices=task_choices, value=init_task, interactive=True)

        user_md = gr.Markdown(elem_classes=["card", "scrollable-sm"])
        article_md = gr.Markdown(elem_classes=["card", "scrollable-lg"])
        scores_html = gr.HTML()  # HTML component used to display the scores

        def fetch(model, task_disp):
            # Return (user-task markdown, article markdown, scores HTML) for
            # the chosen model/task pair.
            if not model or not task_disp:
                msg = "请选择模型和任务。"
                return make_user_task_markdown("--", msg), make_article_markdown(msg), ""

            # Dropdown entries look like "<id>. <preview>".
            item_id = task_disp.split(".", 1)[0].strip()
            entry = df[(df["model_name"] == model) & (df["id"] == item_id)]
            if entry.empty:
                err = f"未找到模型 **{model}** 对应任务 **{item_id}** 的内容或分数。"
                return make_user_task_markdown(item_id, err), make_article_markdown(err), ""

            prompt = entry["prompt"].iloc[0]
            article = entry["article"].iloc[0]

            # Extract the five scores for the card row.
            overall = entry["overall_score"].iloc[0]
            comprehensiveness = entry["comprehensiveness_score"].iloc[0]
            insight = entry["insight_score"].iloc[0]
            instruction = entry["instruction_following_score"].iloc[0]
            readability = entry["readability_score"].iloc[0]

            scores_content = make_scores_html(overall, comprehensiveness, insight, instruction, readability)

            return make_user_task_markdown(item_id, prompt), make_article_markdown(article), scores_content

        # Initial render so the tab is not blank before the first event.
        if init_model and init_task:
            user_md.value, article_md.value, scores_html.value = fetch(init_model, init_task)
        else:
            user_md.value = make_user_task_markdown("--", "请选择模型和任务。")
            article_md.value = make_article_markdown("请选择模型和任务。")
            scores_html.value = ""

        model_dd.change(fetch, inputs=[model_dd, task_dd], outputs=[user_md, article_md, scores_html])
        task_dd.change(fetch, inputs=[model_dd, task_dd], outputs=[user_md, article_md, scores_html])
\ No newline at end of file
diff --git a/tabs/leaderboard_tab.py b/tabs/leaderboard_tab.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fe0876846b97fdf972dce3c6cbdded86db6cfea
--- /dev/null
+++ b/tabs/leaderboard_tab.py
@@ -0,0 +1,176 @@
+import gradio as gr
+import pandas as pd
+from pathlib import Path
+
+# 相对于主脚本的路径调整
+BASE_DIR = Path(__file__).resolve().parent.parent
+DATA_PATH = BASE_DIR / "data" / "leaderboard.csv"
+
+# 新增:用于高亮显示的常量
+HIGHLIGHT_COLOR = "#E6D8FF"
+CATEGORY_TO_HIGHLIGHT = "Deep Research Agent"
+
+# 新增:列名重命名映射
+COLUMN_RENAME_MAP = {
+ 'overall_score': 'overall',
+ 'comprehensiveness': 'comp.',
+ 'insight': 'insight',
+ 'instruction_following': 'inst.',
+ 'readability': 'read.',
+ 'citation_accuracy': 'c.acc.',
+ 'effective_citations': 'eff.c.'
+}
+
+# 模型分类映射
+MODEL_CATEGORIES = {
+ "Deep Research Agent": [
+ "gemini-2.5-pro-deepresearch",
+ "grok-deeper-search",
+ "openai-deepresearch",
+ "perplexity-Research"
+ ],
+ "LLM with Search": [
+ "claude-3-7-sonnet-with-search",
+ "perplexity-sonar-reasoning-pro",
+ "perplexity-sonar-reasoning",
+ "perplexity-sonar-pro",
+ "gemini-2.5-pro-with-grounding",
+ "gpt-4o-search-preview",
+ "perplexity-sonar",
+ "gpt-4.1-with-search",
+ "gemini-2.5-flash-preview-04-17",
+ "gpt-4o-mini-search-preview",
+ "gpt-4.1-mini-with-search",
+ "claude-3-5-sonnet-with-search"
+ ]
+}
+
def load_leaderboard() -> pd.DataFrame:
    """Read data/leaderboard.csv and attach a 'category' column.

    Raises FileNotFoundError with a generation hint when the CSV has not
    been produced yet.
    """
    if not DATA_PATH.exists():
        raise FileNotFoundError(
            f"Leaderboard file not found: {DATA_PATH}.\n"
            "→ 先运行 rank_leaderboard.py 生成 data/leaderboard.csv"
        )

    frame = pd.read_csv(DATA_PATH)
    frame.columns = [name.strip() for name in frame.columns]

    def categorize(model_name):
        # Models outside the predefined groups fall back to "Others".
        for group_name, members in MODEL_CATEGORIES.items():
            if model_name in members:
                return group_name
        return "Others"

    frame['category'] = frame['model'].apply(categorize)
    return frame
+
def make_ranked(df: pd.DataFrame) -> pd.DataFrame:
    """Sort by overall score (descending), prepend a 1-based Rank column and
    shorten the score column names for compact display."""
    ordered = (
        df.sort_values(by='overall_score', ascending=False)
          .reset_index(drop=True)
    )
    ordered.insert(0, "Rank", range(1, len(ordered) + 1))
    # Abbreviated headers (overall, comp., insight, ...) fit the table width.
    return ordered.rename(columns=COLUMN_RENAME_MAP)
+
def filter_data(search_text: str, selected_categories: list):
    """Return the ranked leaderboard restricted by model-name substring and
    by the selected category checkboxes."""
    frame = load_leaderboard()

    query = search_text.strip()
    if query:
        # Case-insensitive substring match on the model name.
        frame = frame[frame['model'].str.contains(query, case=False, na=False)]

    if selected_categories:
        frame = frame[frame['category'].isin(selected_categories)]

    return make_ranked(frame)
+
+# 新增:辅助函数用于样式化DataFrame
def _style_specific_rows(row, category_column_name='category', target_category=CATEGORY_TO_HIGHLIGHT, color=HIGHLIGHT_COLOR):
    """Per-row styler for Styler.apply(axis=1).

    Paints every cell of a row whose category matches *target_category* with
    *color*; other rows get a 'background-color: ' with an empty value, which
    browsers ignore.
    """
    is_target = row.get(category_column_name) == target_category
    cell_css = f'background-color: {color}' if is_target else 'background-color: '
    return [cell_css] * len(row)
+
def _apply_table_styling(df: pd.DataFrame) -> pd.io.formats.style.Styler:
    """Style the leaderboard table for display.

    - Highlights rows whose 'category' equals CATEGORY_TO_HIGHLIGHT.
    - Keeps the 'category' column visible.
    - Formats every numeric score column with two decimals.

    Returns a pandas Styler (a bare `.style` for an empty frame).
    """
    if df.empty:
        return df.style

    styled_df = df.copy()

    # Numeric columns are every column except the identifier ones.
    # pd.api.types.is_numeric_dtype already covers int64/float64, so the
    # previous explicit dtype whitelist was redundant and has been dropped.
    skip_columns = {'Rank', 'model', 'category'}
    numeric_columns = [
        col for col in styled_df.columns
        if col not in skip_columns and pd.api.types.is_numeric_dtype(styled_df[col])
    ]

    # Row-wise highlight of the target category.
    styler = styled_df.style.apply(
        _style_specific_rows,
        axis=1,
        category_column_name='category',
        target_category=CATEGORY_TO_HIGHLIGHT,
        color=HIGHLIGHT_COLOR
    )

    # Two-decimal formatting for all score columns.
    if numeric_columns:
        styler = styler.format({col: '{:.2f}' for col in numeric_columns})

    return styler
+
def create_leaderboard_tab():
    """Build the "Leaderboard" tab.

    Creates the search box, category filter and the styled ranking table,
    wiring both inputs so the table refreshes on any change. Returns the
    search box component.
    """
    with gr.Tab("🏆Leaderboard"):
        with gr.Row():
            search_box = gr.Textbox(
                label="Model Search",
                placeholder="Entering model name to search...",
                value=""
            )
            category_checkboxes = gr.CheckboxGroup(
                label="Model Categories",
                choices=list(MODEL_CATEGORIES.keys()),
                value=list(MODEL_CATEGORIES.keys())
            )

        # Unfiltered, ranked, styled table shown on first load.
        initial_df_raw = make_ranked(load_leaderboard())
        styled_initial_value = _apply_table_styling(initial_df_raw.copy())

        table = gr.Dataframe(
            interactive=False,
            wrap=False,
            value=styled_initial_value,
        )

        def update_display(search_text, selected_categories):
            # Re-filter, re-rank and re-style on every UI change.
            filtered_df_raw = filter_data(search_text, selected_categories)
            styled_updated_value = _apply_table_styling(filtered_df_raw.copy())
            return styled_updated_value

        search_box.change(
            fn=update_display,
            inputs=[search_box, category_checkboxes],
            outputs=table
        )
        category_checkboxes.change(
            fn=update_display,
            inputs=[search_box, category_checkboxes],
            outputs=table
        )

        # Column-abbreviation legend shown below the table.
        gr.Markdown("""
        ### Column Abbreviations
        The leaderboard uses abbreviated column names for compact display: (i) **overall** - Overall Score; (ii) **comp.** - Comprehensiveness; (iii) **insight** - Insight quality; (iv) **inst.** - Instruction Following; (v) **read.** - Readability; (vi) **c.acc.** - Citation Accuracy; (vii) **eff.c.** - Effective Citations.
        """)

    return search_box
\ No newline at end of file
diff --git a/utils/merge_raw_data.py b/utils/merge_raw_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c46dfc8762cda0ed8af5d4801a359f09860b844
--- /dev/null
+++ b/utils/merge_raw_data.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import json
+import os
+from pathlib import Path
+
+
def calculate_dimension_score(target_score, reference_score):
    """Relative score of target vs. reference, matching rank_leaderboard.py.

    Returns target / (target + reference), or 0.0 when both scores are zero
    so we never divide by zero.
    """
    denominator = target_score + reference_score
    return 0.0 if denominator == 0 else target_score / denominator
+
+
def load_scores_for_model(model_results_file_path: Path):
    """Load per-article scores for a single model.

    Reads the model's JSONL results file and returns a dict mapping the
    article id (as a string) to a dict of formatted score strings scaled to
    0-100. Returns an empty dict when the results file is missing.
    Malformed lines are reported and skipped.
    """
    scores_by_id = {}
    if not model_results_file_path.exists():
        print(f"警告: 未找到模型 {model_results_file_path.stem} 的结果文件: {model_results_file_path}")
        return scores_by_id

    print(f" 正在从 {model_results_file_path.name} 加载分数...")
    with open(model_results_file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            try:
                data = json.loads(line.strip())
                # Bug fix: the original did str(data.get('id')) first, which
                # turns a missing id into the truthy string "None", so the
                # skip branch below could never fire. Check the raw value
                # before stringifying.
                raw_id = data.get('id')
                if raw_id is None or str(raw_id) == "":
                    print(f" 警告: {model_results_file_path.name} 第 {i+1} 行缺少ID,已跳过。")
                    continue
                article_id = str(raw_id)

                # overall_score is assumed to be in the 0-1 range; scale to
                # 0-100 for display -- TODO confirm against the raw data.
                overall_score_raw = data.get('overall_score', 0.0)
                overall_score_scaled = overall_score_raw * 100

                # Per-dimension relative scores: target / (target + reference).
                comp_score_raw = calculate_dimension_score(
                    data.get('target_comprehensiveness_weighted_avg', 0),
                    data.get('reference_comprehensiveness_weighted_avg', 0)
                )
                insight_score_raw = calculate_dimension_score(
                    data.get('target_insight_weighted_avg', 0),
                    data.get('reference_insight_weighted_avg', 0)
                )
                instruction_score_raw = calculate_dimension_score(
                    data.get('target_instruction_following_weighted_avg', 0),
                    data.get('reference_instruction_following_weighted_avg', 0)
                )
                readability_score_raw = calculate_dimension_score(
                    data.get('target_readability_weighted_avg', 0),
                    data.get('reference_readability_weighted_avg', 0)
                )

                scores_by_id[article_id] = {
                    'overall_score': f"{overall_score_scaled:.2f}",
                    'comprehensiveness_score': f"{comp_score_raw * 100:.2f}",
                    'insight_score': f"{insight_score_raw * 100:.2f}",
                    'instruction_following_score': f"{instruction_score_raw * 100:.2f}",
                    'readability_score': f"{readability_score_raw * 100:.2f}"
                }
            except json.JSONDecodeError as e:
                print(f" 错误: 解析JSON时出错 (文件: {model_results_file_path.name}, 行: {i+1}): {e}")
            except Exception as e:
                print(f" 错误: 处理数据时出错 (文件: {model_results_file_path.name}, 行: {i+1}): {e}")
    print(f" 为模型 {model_results_file_path.stem} 加载了 {len(scores_by_id)}篇文章的分数")
    return scores_by_id
+
+
def merge_jsonl_files():
    """Merge raw articles with per-model scores into data/data_viewer.jsonl.

    For every model JSONL under data/raw_data, loads the matching score file
    under data/raw_results (same stem) and writes one merged record per
    article. Score fields are None when the article has no entry in the
    results file. The output file is rewritten from scratch on every run.
    """
    project_root = Path(__file__).resolve().parent.parent
    raw_data_dir = project_root / "data" / "raw_data"        # original article content
    raw_results_dir = project_root / "data" / "raw_results"  # evaluation results
    output_file = project_root / "data" / "data_viewer.jsonl"

    input_files = list(raw_data_dir.glob("*.jsonl"))
    print(f"在 {raw_data_dir} 中找到 {len(input_files)} 个模型JSONL文件")

    if not input_files:
        print("未找到任何原始数据文件,已退出。")
        return

    # Note: the final open(output_file, 'w') below already truncates any
    # previous file, so the original separate pre-clearing pass was removed.
    all_merged_data = []

    for raw_data_file in input_files:
        model_name = raw_data_file.stem
        print(f"正在处理原始数据文件: {raw_data_file.name} (模型: {model_name})")

        # Load this model's scores keyed by article id.
        model_results_file = raw_results_dir / f"{model_name}.jsonl"
        scores_for_current_model = load_scores_for_model(model_results_file)

        processed_articles_count = 0
        with open(raw_data_file, 'r', encoding='utf-8') as f_raw:
            for i, line in enumerate(f_raw):
                try:
                    article_data = json.loads(line.strip())
                    # Bug fix: check the raw id before str() -- str(None) is
                    # the truthy "None", so missing ids were never skipped.
                    raw_id = article_data.get('id')
                    if raw_id is None or str(raw_id) == "":
                        print(f" 警告: {raw_data_file.name} 第 {i+1} 行缺少ID,已跳过。")
                        continue
                    article_id = str(raw_id)

                    # Missing scores are tolerated: fields stay None below.
                    article_scores = scores_for_current_model.get(article_id, {})
                    if not article_scores:
                        print(f" 警告: 模型 {model_name} 的文章ID {article_id} 未在结果文件中找到分数。")

                    merged_item = {
                        'model_name': model_name,
                        'id': article_id,
                        'prompt': article_data.get('prompt'),
                        'article': article_data.get('article'),
                        'overall_score': article_scores.get('overall_score'),  # may be None
                        'comprehensiveness_score': article_scores.get('comprehensiveness_score'),
                        'insight_score': article_scores.get('insight_score'),
                        'instruction_following_score': article_scores.get('instruction_following_score'),
                        'readability_score': article_scores.get('readability_score')
                    }
                    all_merged_data.append(merged_item)
                    processed_articles_count += 1
                except json.JSONDecodeError as e:
                    print(f" 错误: 解析原始数据JSON时出错 (文件: {raw_data_file.name}, 行: {i+1}): {e}")
                except Exception as e:
                    print(f" 错误: 处理原始数据时出错 (文件: {raw_data_file.name}, 行: {i+1}): {e}")
        print(f" 为模型 {model_name} 处理了 {processed_articles_count} 篇文章数据。")

    # Single write of all merged records.
    with open(output_file, 'w', encoding='utf-8') as f_out:
        for item in all_merged_data:
            f_out.write(json.dumps(item, ensure_ascii=False) + '\n')

    print(f"\n成功合并并保存到: {output_file}, 共 {len(all_merged_data)} 条记录")
+
+if __name__ == "__main__":
+ merge_jsonl_files()
+ print("所有文件处理完成!")
\ No newline at end of file
diff --git a/utils/rank_leaderboard.py b/utils/rank_leaderboard.py
new file mode 100644
index 0000000000000000000000000000000000000000..86a2754e949f85cb23d98af5593d2d7beada91ce
--- /dev/null
+++ b/utils/rank_leaderboard.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import json
+import os
+import csv
+from pathlib import Path
+from collections import defaultdict
+
+
def calculate_dimension_score(target_score, reference_score):
    """Relative score of target vs. reference: target / (target + reference).

    Bug fix: returns 0.0 when both scores are zero instead of raising
    ZeroDivisionError; this also makes the helper consistent with the
    identical (guarded) function in utils/merge_raw_data.py.
    """
    denominator = target_score + reference_score
    if denominator == 0:
        return 0.0
    return target_score / denominator
+
+
def process_model_data(model_file):
    """Aggregate one model's JSONL results into average scores scaled 0-100.

    Returns a dict with keys: model, overall_score, comprehensiveness,
    insight, instruction_following, readability.

    Fixes two defects of the original:
    - the five score lists could get out of sync when an exception fired
      after overall_scores.append (appends now happen only after all five
      values are computed);
    - an empty/unparseable file crashed with ZeroDivisionError on the mean;
      it now raises ValueError, which the caller's per-file except block
      already handles by skipping the model.
    """
    model_name = model_file.stem
    print(f"正在处理模型: {model_name}")

    overall_scores = []
    comprehensiveness_scores = []
    insight_scores = []
    instruction_following_scores = []
    readability_scores = []

    with open(model_file, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line.strip())

                # Compute everything first; append only on full success so
                # the five lists always stay the same length.
                overall_score = data.get('overall_score', 0)
                comp_score = calculate_dimension_score(
                    data.get('target_comprehensiveness_weighted_avg', 0),
                    data.get('reference_comprehensiveness_weighted_avg', 0))
                insight_score = calculate_dimension_score(
                    data.get('target_insight_weighted_avg', 0),
                    data.get('reference_insight_weighted_avg', 0))
                instruction_score = calculate_dimension_score(
                    data.get('target_instruction_following_weighted_avg', 0),
                    data.get('reference_instruction_following_weighted_avg', 0))
                readability_score = calculate_dimension_score(
                    data.get('target_readability_weighted_avg', 0),
                    data.get('reference_readability_weighted_avg', 0))

                overall_scores.append(overall_score)
                comprehensiveness_scores.append(comp_score)
                insight_scores.append(insight_score)
                instruction_following_scores.append(instruction_score)
                readability_scores.append(readability_score)

            except json.JSONDecodeError as e:
                print(f"解析JSON时出错 (模型: {model_name}): {e}")
                continue
            except Exception as e:
                print(f"处理数据时出错 (模型: {model_name}): {e}")
                continue

    if not overall_scores:
        # Previously this fell through to ZeroDivisionError on the mean.
        raise ValueError(f"模型 {model_name} 的结果文件中没有可用记录")

    # Per-dimension averages, scaled to 0-100 for the leaderboard.
    avg_overall = sum(overall_scores) / len(overall_scores)
    avg_comprehensiveness = sum(comprehensiveness_scores) / len(comprehensiveness_scores)
    avg_insight = sum(insight_scores) / len(insight_scores)
    avg_instruction_following = sum(instruction_following_scores) / len(instruction_following_scores)
    avg_readability = sum(readability_scores) / len(readability_scores)
    print(f"  - 处理了 {len(overall_scores)} 条记录")
    print(f"  - 总分: {avg_overall:.4f}")

    return {
        'model': model_name,
        'overall_score': avg_overall * 100,
        'comprehensiveness': avg_comprehensiveness * 100,
        'insight': avg_insight * 100,
        'instruction_following': avg_instruction_following * 100,
        'readability': avg_readability * 100
    }
+
+
def rank_leaderboard():
    """Compute the leaderboard from data/raw_results and save it as CSV.

    Each *.jsonl file under data/raw_results is one model; files that fail
    to process are reported and skipped. Models are written to
    data/leaderboard.csv sorted by overall_score, descending, with scores
    formatted to two decimals.
    """
    project_root = Path(__file__).parent.parent
    input_dir = project_root / "data" / "raw_results"
    output_file = project_root / "data" / "leaderboard.csv"

    input_files = list(input_dir.glob("*.jsonl"))
    print(f"找到 {len(input_files)} 个模型结果文件")

    if not input_files:
        print("未找到任何JSONL文件")
        return

    # Aggregate every model; a failing file only skips that model.
    model_results = []
    for input_file in input_files:
        try:
            model_results.append(process_model_data(input_file))
        except Exception as e:
            print(f"处理文件 {input_file.name} 时出错: {e}")

    # Best overall score first.
    model_results.sort(key=lambda entry: entry['overall_score'], reverse=True)

    fieldnames = ['model', 'overall_score', 'comprehensiveness', 'insight', 'instruction_following', 'readability']
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for entry in model_results:
            row = {'model': entry['model']}
            # All score columns share the same two-decimal formatting.
            for score_field in fieldnames[1:]:
                row[score_field] = f"{entry[score_field]:.2f}"
            writer.writerow(row)

    print(f"\n排行榜已保存到: {output_file}")
    print(f"共处理了 {len(model_results)} 个模型")
+
+
+if __name__ == "__main__":
+ rank_leaderboard()
+ print("排行榜计算完成!")