Spaces:

SUSTech
/

ChineseSafe-Benchmark

Running

App Files Community

hongfu_update_20250701

#19

by Jerry0723 - opened 14 days ago

base: refs/heads/main

←

from: refs/pr/19

Discussion Files changed

+95

-35

Files changed (5) hide show

.idea/workspace.xml +58 -0
app.py +34 -32
data/chinese_benchmark_per.csv +1 -1
data/subclass_gen.csv +1 -1
data/subclass_per.csv +1 -1

.idea/workspace.xml ADDED Viewed

	@@ -0,0 +1,58 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ChangeListManager">
+    <list default="true" id="60da6b73-38f4-48aa-bd78-5731d35b3a7c" name="Changes" comment="" />
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="Git.Settings">
+    <option name="RECENT_GIT_ROOT_PATH" value="$PROJECT_DIR$" />
+  </component>
+  <component name="MarkdownSettingsMigration">
+    <option name="stateVersion" value="1" />
+  </component>
+  <component name="ProjectColorInfo">{
+  &quot;customColor&quot;: &quot;&quot;,
+  &quot;associatedIndex&quot;: 2
+}</component>
+  <component name="ProjectId" id="2zGmpeKAt5GZlNtHRIRD45uRoxd" />
+  <component name="ProjectViewState">
+    <option name="hideEmptyMiddlePackages" value="true" />
+    <option name="showLibraryContents" value="true" />
+  </component>
+  <component name="PropertiesComponent"><![CDATA[{
+  "keyToString": {
+    "RunOnceActivity.OpenProjectViewOnStart": "true",
+    "RunOnceActivity.ShowReadmeOnStart": "true",
+    "git-widget-placeholder": "pr/18",
+    "last_opened_file_path": "E:/pythonProject/ChineseSafe-Benchmark",
+    "nodejs_package_manager_path": "npm",
+    "vue.rearranger.settings.migration": "true"
+  }
+}]]></component>
+  <component name="SharedIndexes">
+    <attachedChunks>
+      <set>
+        <option value="bundled-python-sdk-67fca87a943a-c986f194a52a-com.jetbrains.pycharm.pro.sharedIndexes.bundled-PY-233.11799.259" />
+      </set>
+    </attachedChunks>
+  </component>
+  <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="60da6b73-38f4-48aa-bd78-5731d35b3a7c" name="Changes" comment="" />
+      <created>1751365967779</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1751365967779</updated>
+      <workItem from="1751365968934" duration="39000" />
+      <workItem from="1751366116696" duration="54000" />
+    </task>
+    <servers />
+  </component>
+  <component name="TypeScriptGeneratedFilesManager">
+    <option name="version" value="3" />
+  </component>
+</project>

app.py CHANGED Viewed

@@ -6,11 +6,13 @@ import pandas as pd
 from assets.text import INTRODUCTION_TEXT, METRICS_TEXT, EVALUTION_TEXT, ACKNOWLEDGEMENTS_TEXT, REFERENCE_TEXT
-ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv", sep='\t') # space separated values
-ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv", sep='\t') #
-ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv", sep=',') #
-ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", sep=',')
 METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
@@ -28,7 +30,7 @@ CLASSIFICATION = {
         "5B~10B",
         "API",
     ]
 }
@@ -36,13 +38,13 @@ CLASSIFICATION = {
 _BIBTEX = """
 @misc{zhang2024chinesesafechinesebenchmarkevaluating,
-      title={ChineseSafe: A Chinese Benchmark for Evaluating Safety in Large Language Models},
       author={Hengxiang Zhang and Hongfu Gao and Qiang Hu and Guanhua Chen and Lili Yang and Bingyi Jing and Hongxin Wei and Bing Wang and Haifeng Bai and Lei Yang},
       year={2024},
       eprint={2410.18491},
       archivePrefix={arXiv},
       primaryClass={cs.CL},
-      url={https://arxiv.org/abs/2410.18491},
 }
 """
@@ -62,8 +64,8 @@ def format_csv_numbers(text):
 def format_csv_numbers_second(text):
     """Break ``text`` into a list of whitespace-delimited tokens.

     ``str.split()`` is called with no arguments, so any run of whitespace
     counts as a single delimiter and leading/trailing whitespace produces
     no empty tokens.
     """
     tokens = text.split()
     return tokens
 def format_number(x):
     """Render ``x`` with the ``.3`` format spec and parse the result as a float.

     For a float input this keeps three significant digits.
     """
     # Same spec the f-string form would apply: precision 3, no type code.
     rendered = format(x, ".3")
     return float(rendered)
@@ -73,7 +75,7 @@ def get_dataset_csv(
 ):
     df = ORIGINAL_DF[ORIGINAL_DF['Size'].isin(model_size)]
     df = df.drop(columns="Size")
     leaderboard_table = gr.components.Dataframe(
         value=df,
         interactive=False,
@@ -101,11 +103,11 @@ def get_dataset_csv_sub_gen(
 ):
     df = ORIGINAL_DF_SUB_GEN[ORIGINAL_DF_SUB_GEN['Size'].isin(model_size)]
     df = df.drop(columns="Size")
     # get subclass
     subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
     df = df[subclass_choice_label]
     leaderboard_table = gr.components.Dataframe(
         value=df,
         interactive=False,
@@ -120,11 +122,11 @@ def get_dataset_csv_sub_per(
 ):
     df = ORIGINAL_DF_SUB_PER[ORIGINAL_DF_SUB_PER['Size'].isin(model_size)]
     df = df.drop(columns="Size")
     # get subclass
     subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
     df = df[subclass_choice_label]
     leaderboard_table = gr.components.Dataframe(
         value=df,
         interactive=False,
@@ -143,7 +145,7 @@ def get_dataset_classfier_gen(
         subclass_choice = main_choice
         leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
     return leaderboard_table
 def get_dataset_classfier_per(
     model_size: List[str],
     main_choice: List[str],
@@ -164,10 +166,10 @@ with gr.Blocks() as demo:
     with gr.Row():
         gr.Markdown(METRICS_TEXT, elem_classes="markdown-text")
     with gr.Row():
         gr.Markdown(EVALUTION_TEXT, elem_classes="markdown-text")
     with gr.Row():
         with gr.Column(scale=0.8):
             main_choice = gr.Dropdown(
@@ -176,8 +178,8 @@ with gr.Blocks() as demo:
                 label="Type",
                 info="Please choose the type to display.",
             )
-        with gr.Column(scale=10):
             model_choice = gr.CheckboxGroup(
                 choices=CLASSIFICATION["model_size"],
                 value=CLASSIFICATION["model_size"],  # all be choosed
@@ -188,12 +190,12 @@ with gr.Blocks() as demo:
     #👉 this part is for csv table generatived
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         # ----------------- modify text -----------------
         with gr.TabItem("🏅 Generation", elem_id="od-benchmark-tab-table", id=6):
             dataframe_all_gen = gr.components.Dataframe(
                 elem_id="leaderboard-table",
             )
         with gr.TabItem("🏅 Perplexity", elem_id="od-benchmark-tab-table", id=5):
             dataframe_all_per = gr.components.Dataframe(
                 elem_id="leaderboard-table",
@@ -202,10 +204,10 @@ with gr.Blocks() as demo:
     # ----------------- modify text -----------------
     with gr.Row():
         gr.Markdown(ACKNOWLEDGEMENTS_TEXT, elem_classes="markdown-text")
     with gr.Row():
         gr.Markdown(REFERENCE_TEXT, elem_classes="markdown-text")
     # 👉 this part is for citation
     with gr.Row():
         with gr.Accordion("📙 Citation", open=True):
@@ -216,18 +218,18 @@ with gr.Blocks() as demo:
                 elem_id="citation-button",
                 show_copy_button=True
             )
     gr.Markdown(f"Last updated on **{_LAST_UPDATED}**", elem_classes="markdown-text")
     # --------------------------- all --------------------------------
     # this is  all result Perplexity
     main_choice.change(
         get_dataset_classfier_per,
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_per,
     )
     model_choice.change(
         get_dataset_classfier_per,
         inputs=[model_choice, main_choice],
@@ -239,26 +241,26 @@ with gr.Blocks() as demo:
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_per,
     )
     # this is all result generatived
     main_choice.change(
         get_dataset_classfier_gen,
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_gen,
     )
     model_choice.change(
         get_dataset_classfier_gen,
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_gen,
     )
     demo.load(
         fn=get_dataset_classfier_gen,
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_gen,
     )
 demo.launch(share=True)

 from assets.text import INTRODUCTION_TEXT, METRICS_TEXT, EVALUTION_TEXT, ACKNOWLEDGEMENTS_TEXT, REFERENCE_TEXT
+ORIGINAL_DF = pd.read_csv("./data/chinese_benchmark_gen.csv", encoding='utf-8') # space separated values
+ORIGINAL_DF_PER = pd.read_csv("./data/chinese_benchmark_per.csv", encoding='utf-8') #
+ORIGINAL_DF_SUB_GEN = pd.read_csv("./data/subclass_gen.csv", encoding='utf-8') #
+ORIGINAL_DF_SUB_PER = pd.read_csv("./data/subclass_per.csv", encoding='utf-8')
 METRICS = ["Accuracy", "Precision_Unsafe", "Recall_Unsafe", "Precision_Safe", "Recall_Safe", "None"]
         "5B~10B",
         "API",
     ]
 }
 _BIBTEX = """
 @misc{zhang2024chinesesafechinesebenchmarkevaluating,
+      title={ChineseSafe: A Chinese Benchmark for Evaluating Safety in Large Language Models},
       author={Hengxiang Zhang and Hongfu Gao and Qiang Hu and Guanhua Chen and Lili Yang and Bingyi Jing and Hongxin Wei and Bing Wang and Haifeng Bai and Lei Yang},
       year={2024},
       eprint={2410.18491},
       archivePrefix={arXiv},
       primaryClass={cs.CL},
+      url={https://arxiv.org/abs/2410.18491},
 }
 """
 def format_csv_numbers_second(text):
     """Break ``text`` into a list of whitespace-delimited tokens.

     ``str.split()`` is called with no arguments, so any run of whitespace
     counts as a single delimiter and leading/trailing whitespace produces
     no empty tokens.
     """
     tokens = text.split()
     return tokens
 def format_number(x):
     """Render ``x`` with the ``.3`` format spec and parse the result as a float.

     For a float input this keeps three significant digits.
     """
     # Same spec the f-string form would apply: precision 3, no type code.
     rendered = format(x, ".3")
     return float(rendered)
 ):
     df = ORIGINAL_DF[ORIGINAL_DF['Size'].isin(model_size)]
     df = df.drop(columns="Size")
     leaderboard_table = gr.components.Dataframe(
         value=df,
         interactive=False,
 ):
     df = ORIGINAL_DF_SUB_GEN[ORIGINAL_DF_SUB_GEN['Size'].isin(model_size)]
     df = df.drop(columns="Size")
     # get subclass
     subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
     df = df[subclass_choice_label]
     leaderboard_table = gr.components.Dataframe(
         value=df,
         interactive=False,
 ):
     df = ORIGINAL_DF_SUB_PER[ORIGINAL_DF_SUB_PER['Size'].isin(model_size)]
     df = df.drop(columns="Size")
     # get subclass
     subclass_choice_label = ["Model", subclass_choice+"_Accuracy", subclass_choice+"_Precision", subclass_choice+"_Recall"]
     df = df[subclass_choice_label]
     leaderboard_table = gr.components.Dataframe(
         value=df,
         interactive=False,
         subclass_choice = main_choice
         leaderboard_table = get_dataset_csv_sub_gen(model_size, subclass_choice)
     return leaderboard_table
 def get_dataset_classfier_per(
     model_size: List[str],
     main_choice: List[str],
     with gr.Row():
         gr.Markdown(METRICS_TEXT, elem_classes="markdown-text")
     with gr.Row():
         gr.Markdown(EVALUTION_TEXT, elem_classes="markdown-text")
     with gr.Row():
         with gr.Column(scale=0.8):
             main_choice = gr.Dropdown(
                 label="Type",
                 info="Please choose the type to display.",
             )
+        with gr.Column(scale=10):
             model_choice = gr.CheckboxGroup(
                 choices=CLASSIFICATION["model_size"],
                 value=CLASSIFICATION["model_size"],  # all be choosed
     #👉 this part is for csv table generatived
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         # ----------------- modify text -----------------
         with gr.TabItem("🏅 Generation", elem_id="od-benchmark-tab-table", id=6):
             dataframe_all_gen = gr.components.Dataframe(
                 elem_id="leaderboard-table",
             )
         with gr.TabItem("🏅 Perplexity", elem_id="od-benchmark-tab-table", id=5):
             dataframe_all_per = gr.components.Dataframe(
                 elem_id="leaderboard-table",
     # ----------------- modify text -----------------
     with gr.Row():
         gr.Markdown(ACKNOWLEDGEMENTS_TEXT, elem_classes="markdown-text")
     with gr.Row():
         gr.Markdown(REFERENCE_TEXT, elem_classes="markdown-text")
     # 👉 this part is for citation
     with gr.Row():
         with gr.Accordion("📙 Citation", open=True):
                 elem_id="citation-button",
                 show_copy_button=True
             )
     gr.Markdown(f"Last updated on **{_LAST_UPDATED}**", elem_classes="markdown-text")
     # --------------------------- all --------------------------------
     # this is  all result Perplexity
     main_choice.change(
         get_dataset_classfier_per,
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_per,
     )
     model_choice.change(
         get_dataset_classfier_per,
         inputs=[model_choice, main_choice],
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_per,
     )
     # this is all result generatived
     main_choice.change(
         get_dataset_classfier_gen,
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_gen,
     )
     model_choice.change(
         get_dataset_classfier_gen,
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_gen,
     )
     demo.load(
         fn=get_dataset_classfier_gen,
         inputs=[model_choice, main_choice],
         outputs=dataframe_all_gen,
     )
 demo.launch(share=True)

data/chinese_benchmark_per.csv CHANGED Viewed

@@ -43,4 +43,4 @@ Opt-6.7B,5B~10B,48.54/0.43,49.24/0.31,86.62/1.03,43.40/1.18,10.30/0.55
 Mistral-7B-Instruct-v0.3,5B~10B,42.99/0.06,39.54/0.47,26.01/0.69,44.69/0.11,60.05/0.50
 Llama3-ChatQA-1.5-8B,5B~10B,42.11/0.29,37.46/0.85,23.20/0.89,44.20/0.09,61.11/0.57
 Qwen3-4B,5B~10B,46.04/0.00,47.79/0.00,85.94/0.00,30.39/0.00,6.14/0.00
-Gemma-3-4B-it,5B~10B,50.00/0.00,0.00/0.00,0.00/0.00,50.00/0.00,100.00/0.00

 Mistral-7B-Instruct-v0.3,5B~10B,42.99/0.06,39.54/0.47,26.01/0.69,44.69/0.11,60.05/0.50
 Llama3-ChatQA-1.5-8B,5B~10B,42.11/0.29,37.46/0.85,23.20/0.89,44.20/0.09,61.11/0.57
 Qwen3-4B,5B~10B,46.04/0.00,47.79/0.00,85.94/0.00,30.39/0.00,6.14/0.00
+Gemma-3-4B-it,5B~10B,50.00/0.00,0.00/0.00,0.00/0.00,50.00/0.00,100.00/0.00

data/subclass_gen.csv CHANGED Viewed

@@ -48,4 +48,4 @@ Opt-30B,~30B,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.53
 QwQ-32B-Preview,~30B,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516,0.8198,0.6977,0.8121,0.823,0.8081,0.847,0.8208,0.8801,0.6113,0.6736,0.3973,0.605,0.67,0.3873,0.7492,0.7768,0.6783,0.4656,0.3791,0.1124
 Qwen3-32B,~30B,0.5416,0.5902,0.2095,0.5495,0.6557,0.2531,0.477,0.3724,0.0843,0.6293,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192
 Gemma-3-27b-it,~30B,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314
-OpenThinker2-32B,~30B,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516

 QwQ-32B-Preview,~30B,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516,0.8198,0.6977,0.8121,0.823,0.8081,0.847,0.8208,0.8801,0.6113,0.6736,0.3973,0.605,0.67,0.3873,0.7492,0.7768,0.6783,0.4656,0.3791,0.1124
 Qwen3-32B,~30B,0.5416,0.5902,0.2095,0.5495,0.6557,0.2531,0.477,0.3724,0.0843,0.6293,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192
 Gemma-3-27b-it,~30B,0.66,0.6114,0.8339,0.7311,0.6644,0.9577,0.3309,0.2379,0.1626,0.6958,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314
+OpenThinker2-32B,~30B,0.6204,0.8741,0.2629,0.9049,0.9606,0.8489,0.5103,0.547,0.0453,0.8192,0.4672,0.4683,0.6648,0.5002,0.5082,0.7109,0.5044,0.4987,0.7354,0.5314,0.6837,0.7403,0.547,0.812,0.8219,0.8084,0.606,0.6749,0.3914,0.7516

data/subclass_per.csv CHANGED Viewed

@@ -41,4 +41,4 @@ Opt-30B,~30B,0.5831,0.5754,0.5565,0.3952,0.338,0.1915,0.6784,0.6507,0.7506,0.579
 QwQ-32B-Preview,~30B,0.5231,0.5061,0.9839,0.5519,0.5328,1,0.4141,0.4443,0.7537,0.5814,0.565,0.9989,0.5529,0.534,0.9993,0.5318,0.5111,0.9993,0.5083,0.4978,0.9542,0.4392,0.4593,0.808,0.5238,0.5042,0.9922,0.5269,0.5128,0.9743
 Mistral-Small-24B-Instruct-2501,~30B,0.5897,0.5714,0.6393,0.7706,0.6931,0.9888,0.3109,0.1339,0.0727,0.7308,0.6984,0.8887,0.7454,0.683,0.9385,0.7584,0.6732,0.9835,0.585,0.5671,0.6297,0.3646,0.2744,0.1803,0.7088,0.645,0.8855,0.3839,0.3257,0.2233
 OpenThinker2-32B,~30B,0.7139 ,0.8341 ,0.5176 ,0.7722 ,0.8735 ,0.6482 ,0.4750 ,0.2581 ,0.0357 ,0.7162 ,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798
-Qwen3-32B,~30B,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798 ,0.5231 ,0.5061 ,0.9839 ,0.5519 ,0.5328 ,1.0000 ,0.4141 ,0.4443 ,0.7537 ,0.5814

 QwQ-32B-Preview,~30B,0.5231,0.5061,0.9839,0.5519,0.5328,1,0.4141,0.4443,0.7537,0.5814,0.565,0.9989,0.5529,0.534,0.9993,0.5318,0.5111,0.9993,0.5083,0.4978,0.9542,0.4392,0.4593,0.808,0.5238,0.5042,0.9922,0.5269,0.5128,0.9743
 Mistral-Small-24B-Instruct-2501,~30B,0.5897,0.5714,0.6393,0.7706,0.6931,0.9888,0.3109,0.1339,0.0727,0.7308,0.6984,0.8887,0.7454,0.683,0.9385,0.7584,0.6732,0.9835,0.585,0.5671,0.6297,0.3646,0.2744,0.1803,0.7088,0.645,0.8855,0.3839,0.3257,0.2233
 OpenThinker2-32B,~30B,0.7139 ,0.8341 ,0.5176 ,0.7722 ,0.8735 ,0.6482 ,0.4750 ,0.2581 ,0.0357 ,0.7162 ,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798
+Qwen3-32B,~30B,0.6749 ,0.6366 ,0.7789 ,0.7893 ,0.7099 ,0.9938 ,0.4372 ,0.4025 ,0.2943 ,0.7921 ,0.5831 ,0.5754 ,0.5565 ,0.3952 ,0.3380 ,0.1915 ,0.6784 ,0.6507 ,0.7506 ,0.5798 ,0.5231 ,0.5061 ,0.9839 ,0.5519 ,0.5328 ,1.0000 ,0.4141 ,0.4443 ,0.7537 ,0.5814