Spaces:

mixed-modality-search
/

Evaluation_for_MixBench

Running

App Files Files Community

mixed-modality-search committed on Jun 20

Commit

a2f941d

1 Parent(s): acc872d

update

Browse files

Files changed (2) hide show

data/answer.enc +0 -0
main.py +17 -52

data/answer.enc ADDED Viewed

The diff for this file is too large to render. See raw diff

main.py CHANGED Viewed

@@ -14,7 +14,7 @@ def load_and_decrypt_qrel(secret_key):
         decrypted_data = cipher.decrypt(encrypted_data).decode("utf-8")
         raw_data = json.loads(decrypted_data)
-        # qrel_dict: dataset -> query_id -> {corpus_id: score}
         qrel_dict = defaultdict(lambda: defaultdict(dict))
         for dataset, records in raw_data.items():
             for item in records:
@@ -22,18 +22,15 @@ def load_and_decrypt_qrel(secret_key):
                 qrel_dict[dataset][qid][cid] = score
         return qrel_dict
     except Exception as e:
-        raise ValueError(f"Failed to decrypt answer file: {str(e)}")
 def recall_at_k(corpus_top_100_list, relevant_ids, k=1):
     """Return 1 if any of the first ``k`` retrieved ids is relevant, else 0.

     Args:
         corpus_top_100_list: Ranked sequence of retrieved corpus ids.
         relevant_ids: Container supporting ``in`` that holds the relevant ids.
         k: Number of top-ranked ids to inspect (defaults to 1).

     Returns:
         ``1`` when a relevant id is found within the top ``k``, otherwise ``0``.
     """
     for candidate in corpus_top_100_list[:k]:
         if candidate in relevant_ids:
             return 1
     return 0
 def ndcg_at_k(corpus_top_100_list, rel_dict, k):
     """Compute NDCG@k for one query by delegating to ``ndcg_score``.

     Args:
         corpus_top_100_list: Ranked list of retrieved corpus ids.
         rel_dict: Mapping of corpus id to its relevance score.
         k: Rank cutoff forwarded to ``ndcg_score``.

     Returns:
         Whatever ``ndcg_score`` returns for the single query built here.
     """
     # Retrieved ids come first (in rank order), followed by any judged ids
     # that were not retrieved; dict.fromkeys drops duplicates, keeping order.
     ordered_ids = list(dict.fromkeys(corpus_top_100_list + list(rel_dict.keys())))
     total = len(ordered_ids)
     # Ids without a judgement count as relevance 0.
     gains = [rel_dict.get(doc_id, 0) for doc_id in ordered_ids]
     # Strictly decreasing synthetic scores encode the list position as the rank.
     ranks = [total - position for position in range(total)]
     return ndcg_score([gains], [ranks], k=k)
 def evaluate(pred_data, qrel_dict):
@@ -43,7 +40,6 @@ def evaluate(pred_data, qrel_dict):
             continue
         recall_1, ndcg_10, ndcg_100 = [], [], []
         for item in queries:
             qid = item["query_id"]
             corpus_top_100_list = item["corpus_top_100_list"].split(",")
@@ -63,15 +59,17 @@ def evaluate(pred_data, qrel_dict):
     return results
-# ==== Gradio Wrapper ====
 def process_json(file):
     try:
         pred_data = json.load(open(file))
     except Exception as e:
-        return f"Invalid JSON format: {str(e)}"
     try:
-        secret_key = os.getenv("SECRET_KEY")
         qrel_dict = load_and_decrypt_qrel(secret_key)
     except Exception as e:
         return str(e)
@@ -80,68 +78,35 @@ def process_json(file):
         metrics = evaluate(pred_data, qrel_dict)
         return json.dumps(metrics, indent=2)
     except Exception as e:
-        return f"Error during evaluation: {str(e)}"
-# ==== Launch Gradio App ====
-# def main_gradio():
-#     example_json = '''{
-#   "Google_WIT": [
-#     {"query_id": "1", "corpus_top_100_list": "5, 2, 8, ..."},
-#     {"query_id": "2", "corpus_top_100_list": "90, 13, 3, ..."}
-#   ],
-#   "MSCOCO": [
-#     {"query_id": "3", "corpus_top_100_list": "122, 35, 22, ..."},
-#     {"query_id": "2", "corpus_top_100_list": "90, 19, 3, ..."}
-#   ]
-#   "OVEN": [
-#     {"query_id": "3", "corpus_top_100_list": "11, 15, 22, ..."}
-#   ]
-#   "VisualNews": [
-#     {"query_id": "3", "corpus_top_100_list": "101, 35, 22, ..."}
-#   ]
-# }'''
-#     gr.Interface(
-#         fn=process_json,
-#         inputs=gr.File(label="Upload Retrieval Result (JSON)"),
-#         outputs=gr.Textbox(label="Results"),
-#         title="Automated Evaluation of MixBench",
-#         description="Upload a prediction JSON to evaluate Recall@1, NDCG@10, and NDCG@100 against encrypted qrels.\n\nExample input:\n" + example_json
-#     ).launch(share=True)
 def main_gradio():
     example_json_html = (
-        '{<br>'
         '&nbsp;&nbsp;"Google_WIT": [<br>'
         '&nbsp;&nbsp;&nbsp;&nbsp;{"query_id": "1", "corpus_top_100_list": "5, 2, 8, ..."},<br>'
         '&nbsp;&nbsp;&nbsp;&nbsp;{"query_id": "2", "corpus_top_100_list": "90, 13, 3, ..."}<br>'
         '&nbsp;&nbsp;],<br>'
         '&nbsp;&nbsp;"MSCOCO": [<br>'
-        '&nbsp;&nbsp;&nbsp;&nbsp;{"query_id": "3", "corpus_top_100_list": "122, 35, 22, ..."},<br>'
-        '&nbsp;&nbsp;&nbsp;&nbsp;{"query_id": "2", "corpus_top_100_list": "90, 19, 3, ..."}<br>'
-        '&nbsp;&nbsp;],<br>'
-        '&nbsp;&nbsp;"OVEN": [<br>'
-        '&nbsp;&nbsp;&nbsp;&nbsp;{"query_id": "3", "corpus_top_100_list": "11, 15, 22, ..."}<br>'
-        '&nbsp;&nbsp;],<br>'
-        '&nbsp;&nbsp;"VisualNews": [<br>'
-        '&nbsp;&nbsp;&nbsp;&nbsp;{"query_id": "3", "corpus_top_100_list": "101, 35, 22, ..."}<br>'
         '&nbsp;&nbsp;]<br>'
-        '}'
     )
     gr.Interface(
         fn=process_json,
         inputs=gr.File(label="Upload Retrieval Result (JSON)"),
-        outputs=gr.Textbox(label="Results"),
-        title="Automated Evaluation of MixBench",
         description=(
             "Please upload your model's retrieval result on MixBench (in JSON format) to automatically evaluate its performance.<br><br>"
             "For each subset (e.g., <code>MSCOCO</code>, <code>Google_WIT</code>, <code>VisualNews</code>, <code>OVEN</code>), "
-            "we will compute the following metrics based on the top-100 retrieved results:<br>"
             "- <strong>Recall@1</strong><br>"
             "- <strong>NDCG@10</strong><br>"
             "- <strong>NDCG@100</strong><br><br>"
-            "Expected input JSON format:<br><br>"
-            + example_json_html +
-            "<br>For reference query IDs, see the "
             "<a href='https://huggingface.co/datasets/mixed-modality-search/MixBench2025/viewer/Google_WIT/mixed_corpus' target='_blank'>MixBench2025 dataset viewer</a>."
         )
     ).launch(share=True)

         decrypted_data = cipher.decrypt(encrypted_data).decode("utf-8")
         raw_data = json.loads(decrypted_data)
+        # Convert to: dataset -> query_id -> {corpus_id: score}
         qrel_dict = defaultdict(lambda: defaultdict(dict))
         for dataset, records in raw_data.items():
             for item in records:
                 qrel_dict[dataset][qid][cid] = score
         return qrel_dict
     except Exception as e:
+        raise ValueError(f"❌ Failed to decrypt answer file: {str(e)}")
 def recall_at_k(corpus_top_100_list, relevant_ids, k=1):
     """Return 1 if any of the first ``k`` retrieved ids is relevant, else 0.

     Args:
         corpus_top_100_list: Ranked sequence of retrieved corpus ids.
         relevant_ids: Container supporting ``in`` that holds the relevant ids.
         k: Number of top-ranked ids to inspect (defaults to 1).

     Returns:
         ``1`` when a relevant id is found within the top ``k``, otherwise ``0``.
     """
     for candidate in corpus_top_100_list[:k]:
         if candidate in relevant_ids:
             return 1
     return 0
 def ndcg_at_k(corpus_top_100_list, rel_dict, k):
     """Compute NDCG@k for one query by delegating to ``ndcg_score``.

     Args:
         corpus_top_100_list: Ranked list of retrieved corpus ids.
         rel_dict: Mapping of corpus id to its relevance score.
         k: Rank cutoff forwarded to ``ndcg_score``.

     Returns:
         Whatever ``ndcg_score`` returns for the single query built here.
     """
     # Retrieved ids come first (in rank order), followed by any judged ids
     # that were not retrieved; dict.fromkeys drops duplicates, keeping order.
     ordered_ids = list(dict.fromkeys(corpus_top_100_list + list(rel_dict.keys())))
     total = len(ordered_ids)
     # Ids without a judgement count as relevance 0.
     gains = [rel_dict.get(doc_id, 0) for doc_id in ordered_ids]
     # Strictly decreasing synthetic scores encode the list position as the rank.
     ranks = [total - position for position in range(total)]
     return ndcg_score([gains], [ranks], k=k)
 def evaluate(pred_data, qrel_dict):
             continue
         recall_1, ndcg_10, ndcg_100 = [], [], []
         for item in queries:
             qid = item["query_id"]
             corpus_top_100_list = item["corpus_top_100_list"].split(",")
     return results
 def process_json(file):
     try:
         pred_data = json.load(open(file))
     except Exception as e:
+        return f"❌ Invalid JSON format: {str(e)}"
+    secret_key = os.getenv("SECRET_KEY")
+    if not secret_key:
+        return "❌ SECRET_KEY environment variable not set. Please configure it in your Hugging Face Space."
     try:
         qrel_dict = load_and_decrypt_qrel(secret_key)
     except Exception as e:
         return str(e)
         metrics = evaluate(pred_data, qrel_dict)
         return json.dumps(metrics, indent=2)
     except Exception as e:
+        return f"❌ Error during evaluation: {str(e)}"
 def main_gradio():
     example_json_html = (
+        '<pre><code>{<br>'
         '&nbsp;&nbsp;"Google_WIT": [<br>'
         '&nbsp;&nbsp;&nbsp;&nbsp;{"query_id": "1", "corpus_top_100_list": "5, 2, 8, ..."},<br>'
         '&nbsp;&nbsp;&nbsp;&nbsp;{"query_id": "2", "corpus_top_100_list": "90, 13, 3, ..."}<br>'
         '&nbsp;&nbsp;],<br>'
         '&nbsp;&nbsp;"MSCOCO": [<br>'
+        '&nbsp;&nbsp;&nbsp;&nbsp;{"query_id": "3", "corpus_top_100_list": "122, 35, 22, ..."}<br>'
         '&nbsp;&nbsp;]<br>'
+        '}</code></pre>'
     )
     gr.Interface(
         fn=process_json,
         inputs=gr.File(label="Upload Retrieval Result (JSON)"),
+        outputs=gr.Textbox(label="Evaluation Results"),
+        title="🔍 Automated Evaluation of MixBench",
         description=(
             "Please upload your model's retrieval result on MixBench (in JSON format) to automatically evaluate its performance.<br><br>"
             "For each subset (e.g., <code>MSCOCO</code>, <code>Google_WIT</code>, <code>VisualNews</code>, <code>OVEN</code>), "
+            "we compute:<br>"
             "- <strong>Recall@1</strong><br>"
             "- <strong>NDCG@10</strong><br>"
             "- <strong>NDCG@100</strong><br><br>"
+            "Expected input JSON format:<br><br>" + example_json_html +
+            "<br>To find valid query IDs, see the "
             "<a href='https://huggingface.co/datasets/mixed-modality-search/MixBench2025/viewer/Google_WIT/mixed_corpus' target='_blank'>MixBench2025 dataset viewer</a>."
         )
     ).launch(share=True)