AI4H Group committed on
Commit df490c1 · 1 Parent(s): 6bd59f7

Initial commit

Files changed (8)
  1. OpenVLM_subset.json +656 -0
  2. README.md +24 -8
  3. ShoppingMMLU.json +64 -0
  4. ShoppingMMLU_overall.json +653 -0
  5. app.py +170 -0
  6. gen_table.py +204 -0
  7. meta_data.py +169 -0
  8. requirements.txt +3 -0
OpenVLM_subset.json ADDED
@@ -0,0 +1,656 @@
+ {
+ "time": "241031154353",
+ "results": {
+ "GPT-4o (0513, detail-high)": {
+ "META": {
+ "Method": [
+ "GPT-4o (0513, detail-high)",
+ "https://openai.com/index/hello-gpt-4o/"
+ ],
+ "Parameters": "",
+ "Language Model": "",
+ "Vision Model": "",
+ "Org": "OpenAI",
+ "Time": "2024/05/31",
+ "Verified": "Yes",
+ "OpenSource": "No",
+ "key": 270,
+ "dir_name": "GPT4o_HIGH"
+ },
+ "SEEDBench_IMG": {
+ "Overall": 77.1,
+ "Instance Attributes": 79.3,
+ "Instance Identity": 81.0,
+ "Instance Interaction": 80.4,
+ "Instance Location": 72.9,
+ "Instances Counting": 69.5,
+ "Scene Understanding": 80.1,
+ "Spatial Relation": 67.9,
+ "Text Understanding": 72.6,
+ "Visual Reasoning": 83.1,
+ "Overall (official)": "N/A"
+ },
+ "CCBench": {
+ "Overall": 71.2,
+ "Sketch Reasoning": 91.1,
+ "Historical Figure": 37.1,
+ "Calligraphy Painting": 70.2,
+ "Scenery Building": 89.5,
+ "Food Clothes": 62.6,
+ "Cultural Relic": 67.0,
+ "Traditional Show": 71.2
+ },
+ "MMBench_TEST_EN": {
+ "Overall": 83.4,
+ "CP": 87.4,
+ "FP-S": 78.9,
+ "FP-C": 83.8,
+ "AR": 86.5,
+ "LR": 80.3,
+ "RR": 80.6
+ },
+ "MMBench_TEST_CN": {
+ "Overall": 82.1,
+ "CP": 87.6,
+ "FP-S": 76.6,
+ "FP-C": 83.4,
+ "AR": 83.7,
+ "LR": 78.0,
+ "RR": 80.1
+ },
+ "MMBench_TEST_EN_V11": {
+ "Overall": 83.0,
+ "AR": 90.2,
+ "CP": 81.3,
+ "FP-C": 86.1,
+ "FP-S": 81.4,
+ "LR": 78.8,
+ "RR": 82.2,
+ "Action Recognition": 93.2,
+ "Attribute Comparison": 82.7,
+ "Attribute Recognition": 91.0,
+ "Celebrity Recognition": 62.6,
+ "Function Reasoning": 93.3,
+ "Future Prediction": 82.7,
+ "Identity Reasoning": 98.7,
+ "Image Emotion": 81.1,
+ "Image Quality": 59.7,
+ "Image Scene": 88.2,
+ "Image Style": 83.7,
+ "Image Topic": 97.8,
+ "Nature Relation": 92.4,
+ "Object Localization": 84.8,
+ "Ocr": 98.9,
+ "Physical Property Reasoning": 78.5,
+ "Physical Relation": 61.3,
+ "Social Relation": 89.0,
+ "Spatial Relationship": 78.7,
+ "Structuralized Imagetext Understanding": 76.1
+ },
+ "MMBench_TEST_CN_V11": {
+ "Overall": 81.5,
+ "AR": 86.5,
+ "CP": 81.5,
+ "FP-C": 85.0,
+ "FP-S": 79.1,
+ "LR": 77.2,
+ "RR": 79.8,
+ "Action Recognition": 94.0,
+ "Attribute Comparison": 81.3,
+ "Attribute Recognition": 91.0,
+ "Celebrity Recognition": 57.8,
+ "Function Reasoning": 94.4,
+ "Future Prediction": 82.7,
+ "Identity Reasoning": 97.4,
+ "Image Emotion": 85.6,
+ "Image Quality": 58.9,
+ "Image Scene": 88.2,
+ "Image Style": 80.4,
+ "Image Topic": 98.9,
+ "Nature Relation": 94.6,
+ "Object Localization": 82.9,
+ "Ocr": 97.8,
+ "Physical Property Reasoning": 67.1,
+ "Physical Relation": 53.3,
+ "Social Relation": 86.8,
+ "Spatial Relationship": 74.7,
+ "Structuralized Imagetext Understanding": 73.4
+ },
+ "MME": {
+ "Overall": 2310.3,
+ "Perception": 1614.2,
+ "Cognition": 696.1,
+ "OCR": 192.5,
+ "Artwork": 145.2,
+ "Celebrity": 67.9,
+ "Code Reasoning": 177.5,
+ "Color": 185.0,
+ "Commonsense Reasoning": 178.6,
+ "Count": 185.0,
+ "Existence": 185.0,
+ "Landmark": 182.0,
+ "Numerical Calculation": 147.5,
+ "Position": 133.3,
+ "Posters": 191.2,
+ "Scene": 147.0,
+ "Text Translation": 192.5
+ },
+ "MMVet": {
+ "Rec": 67.8,
+ "Ocr": 76.8,
+ "Know": 58.3,
+ "Gen": 56.9,
+ "Spat": 74.3,
+ "Math": 76.2,
+ "Overall": 69.1,
+ "Overall (official)": "N/A"
+ },
+ "MMMU_VAL": {
+ "Overall": 69.2,
+ "Art & Design": 72.5,
+ "Business": 73.3,
+ "Science": 64.7,
+ "Health & Medicine": 74.0,
+ "Humanities & Social Science": 80.8,
+ "Tech & Engineering": 57.6
+ },
+ "MathVista": {
+ "Overall": 61.3,
+ "SCI": 64.8,
+ "TQA": 70.3,
+ "NUM": 44.4,
+ "ARI": 58.4,
+ "VQA": 47.5,
+ "GEO": 61.5,
+ "ALG": 62.3,
+ "GPS": 60.1,
+ "MWP": 69.9,
+ "LOG": 43.2,
+ "FQA": 60.2,
+ "STA": 68.4
+ },
+ "HallusionBench": {
+ "aAcc": 70.2,
+ "fAcc": 49.1,
+ "qAcc": 45.5,
+ "Overall": 55.0
+ },
+ "LLaVABench": {
+ "Overall": 102.0,
+ "Conv": 93.6,
+ "Complex": 111.2,
+ "Detail": 93.6,
+ "Overall (official)": "N/A"
+ },
+ "AI2D": {
+ "Overall": 84.6,
+ "atomStructure": 75.0,
+ "eclipses": 90.3,
+ "faultsEarthquakes": 78.6,
+ "foodChainsWebs": 92.2,
+ "lifeCycles": 83.5,
+ "moonPhaseEquinox": 68.2,
+ "partsOfA": 80.9,
+ "partsOfTheEarth": 82.7,
+ "photosynthesisRespiration": 83.5,
+ "rockCycle": 73.1,
+ "rockStrata": 87.8,
+ "solarSystem": 97.2,
+ "typesOf": 81.0,
+ "volcano": 100.0,
+ "waterCNPCycle": 68.2
+ },
+ "ScienceQA_VAL": {
+ "Overall": 89.7,
+ "Adaptations": 97.9,
+ "Adaptations and natural selection": 100.0,
+ "Age of Exploration": 100.0,
+ "Ancient Egypt and Kush": 100.0,
+ "Ancient Mesopotamia": 100.0,
+ "Animals": 100.0,
+ "Astronomy": 100.0,
+ "Atoms and molecules": 100.0,
+ "Basic economic principles": 32.8,
+ "Chemical reactions": 100.0,
+ "Cities": 87.5,
+ "Classification": 98.8,
+ "Classification and scientific names": 100.0,
+ "Climate change": 100.0,
+ "Colonial America": 90.5,
+ "Context clues": 100.0,
+ "Descriptive details": 100.0,
+ "Designing experiments": 100.0,
+ "Domain-specific vocabulary": 60.0,
+ "Early 19th century American history": 100.0,
+ "Early Americas": 50.0,
+ "Earth events": 100.0,
+ "Ecological interactions": 76.0,
+ "Ecosystems": 95.5,
+ "Engineering practices": 100.0,
+ "English colonies in North America": 74.4,
+ "Force and motion": 84.0,
+ "Fossils": 82.4,
+ "Genes to traits": 83.0,
+ "Geography": 98.6,
+ "Government": 100.0,
+ "Independent reading comprehension": 100.0,
+ "Informational texts: level 1": 100.0,
+ "Magnets": 72.2,
+ "Maps": 96.8,
+ "Materials": 96.6,
+ "Medieval Asia": 100.0,
+ "Natural resources and human impacts": 100.0,
+ "Oceania: geography": 59.6,
+ "Oceans and continents": 100.0,
+ "Oceans and continents\t": 100.0,
+ "Particle motion and energy": 92.6,
+ "Persuasive strategies": 100.0,
+ "Physical Geography": 83.7,
+ "Plant reproduction": 90.0,
+ "Plants": 100.0,
+ "Plate tectonics": 100.0,
+ "Read-alone texts": 100.0,
+ "Rocks and minerals": 100.0,
+ "Rome and the Byzantine Empire": 100.0,
+ "Scientific names": 100.0,
+ "Solutions": 65.7,
+ "State capitals": 100.0,
+ "States": 100.0,
+ "States of matter": 97.4,
+ "The American Revolution": 100.0,
+ "The Americas: geography": 83.3,
+ "The Antebellum period": 100.0,
+ "The Civil War and Reconstruction": 100.0,
+ "The Silk Road": 100.0,
+ "Thermal energy": 100.0,
+ "Velocity, acceleration, and forces": 68.6,
+ "Visual elements": 100.0,
+ "Water cycle": 100.0,
+ "Weather and climate": 90.6,
+ "World religions": 100.0
+ },
+ "ScienceQA_TEST": {
+ "Overall": 90.7,
+ "Adaptations": 100.0,
+ "Ancient Egypt and Kush": 100.0,
+ "Ancient Mesopotamia": 100.0,
+ "Animals": 100.0,
+ "Astronomy": 100.0,
+ "Atoms and molecules": 100.0,
+ "Basic economic principles": 38.0,
+ "Cells": 100.0,
+ "Chemical reactions": 100.0,
+ "Cities": 91.7,
+ "Classification": 100.0,
+ "Classification and scientific names": 100.0,
+ "Climate change": 100.0,
+ "Colonial America": 81.6,
+ "Context clues": 100.0,
+ "Descriptive details": 100.0,
+ "Designing experiments": 100.0,
+ "Domain-specific vocabulary": 100.0,
+ "Early 19th century American history": 100.0,
+ "Earth events": 100.0,
+ "Ecological interactions": 66.7,
+ "Ecosystems": 90.4,
+ "Engineering practices": 98.2,
+ "English colonies in North America": 92.3,
+ "Force and motion": 100.0,
+ "Fossils": 100.0,
+ "Genes to traits": 76.3,
+ "Geography": 95.2,
+ "Government": 100.0,
+ "Greece": 100.0,
+ "Independent reading comprehension": 100.0,
+ "Informational texts: level 1": 100.0,
+ "Kinetic and potential energy": 100.0,
+ "Magnets": 77.3,
+ "Maps": 97.8,
+ "Materials": 96.5,
+ "Medieval Asia": 100.0,
+ "Oceania: geography": 76.5,
+ "Oceans and continents": 100.0,
+ "Oceans and continents\t": 100.0,
+ "Particle motion and energy": 97.6,
+ "Persuasive strategies": 100.0,
+ "Photosynthesis": 100.0,
+ "Physical Geography": 92.2,
+ "Plant reproduction": 100.0,
+ "Plants": 66.7,
+ "Plate tectonics": 100.0,
+ "Read-alone texts": 100.0,
+ "Rocks and minerals": 100.0,
+ "Scientific names": 100.0,
+ "Solutions": 72.2,
+ "State capitals": 100.0,
+ "States": 94.4,
+ "States of matter": 100.0,
+ "The American Revolution": 100.0,
+ "The Americas: geography": 71.1,
+ "The Antebellum period": 100.0,
+ "The Civil War and Reconstruction": 100.0,
+ "Thermal energy": 95.5,
+ "Topographic maps": 100.0,
+ "Velocity, acceleration, and forces": 67.7,
+ "Visual elements": 100.0,
+ "Water cycle": 100.0,
+ "Weather and climate": 91.4,
+ "World religions": 100.0
+ },
+ "OCRBench": {
+ "Text Recognition": 199,
+ "Scene Text-centric VQA": 181,
+ "Doc-oriented VQA": 168,
+ "Key Information Extraction": 170,
+ "Handwritten Mathematical Expression Recognition": 18,
+ "Final Score": 736
+ },
+ "MMStar": {
+ "Overall": 63.9,
+ "coarse perception": 73.6,
+ "fine-grained perception": 54.8,
+ "instance reasoning": 66.4,
+ "logical reasoning": 72.0,
+ "math": 66.4,
+ "science & technology": 50.0
+ },
+ "RealWorldQA": {
+ "Overall": 75.4
+ },
+ "POPE": {
+ "Overall": 85.6,
+ "acc": 86.7,
+ "precision": 93.0,
+ "recall": 79.3
+ },
+ "SEEDBench2_Plus": {
+ "Overall": 72.0,
+ "chart": 71.4,
+ "map": 62.0,
+ "web": 85.2
+ },
+ "MMT-Bench_VAL": {
+ "Overall": 67.3,
+ "VR": 85.3,
+ "Loc": 68.1,
+ "OCR": 82.5,
+ "Count": 57.2,
+ "HLN": 75.0,
+ "IR": 85.0,
+ "3D": 57.5,
+ "VC": 87.9,
+ "VG": 46.2,
+ "DU": 72.9,
+ "AR": 51.0,
+ "PLP": 43.5,
+ "I2IT": 50.0,
+ "RR": 76.2,
+ "IQT": 15.0,
+ "Emo": 58.3,
+ "VI": 33.9,
+ "MemU": 87.5,
+ "VPU": 84.9,
+ "AND": 57.0,
+ "KD": 57.1,
+ "VCR": 80.0,
+ "IEJ": 40.0,
+ "MIA": 42.5,
+ "CIM": 61.7,
+ "TU": 49.5,
+ "VP": 66.7,
+ "MedU": 74.0,
+ "AUD": 58.0,
+ "DKR": 64.6,
+ "EA": 90.0,
+ "GN": 46.2,
+ "abstract_visual_recognition": 85.0,
+ "action_quality_assessment": 15.0,
+ "age_gender_race_recognition": 60.0,
+ "anatomy_identification": 75.0,
+ "animal_keypoint_detection": 35.0,
+ "animals_recognition": 100.0,
+ "animated_character_recognition": 90.0,
+ "art_design": 81.8,
+ "artwork_emotion_recognition": 55.0,
+ "astronomical_recognition": 100.0,
+ "attribute_hallucination": 80.0,
+ "behavior_anomaly_detection": 30.0,
+ "body_emotion_recognition": 40.0,
+ "building_recognition": 90.0,
+ "business": 66.7,
+ "camouflage_object_detection": 55.0,
+ "celebrity_recognition": 0.0,
+ "chart_to_table": 95.0,
+ "chart_to_text": 90.0,
+ "chart_vqa": 70.0,
+ "chemical_apparatusn_recognition": 80.0,
+ "clock_reading": 30.0,
+ "clothes_keypoint_detection": 70.0,
+ "color_assimilation": 35.0,
+ "color_constancy": 14.3,
+ "color_contrast": 40.0,
+ "color_recognition": 95.0,
+ "counting_by_category": 33.8,
+ "counting_by_reasoning": 95.0,
+ "counting_by_visual_prompting": 50.0,
+ "crowd_counting": 50.0,
+ "deepfake_detection": 60.0,
+ "depth_estimation": 40.0,
+ "disaster_recognition": 85.0,
+ "disease_diagnose": 60.0,
+ "doc_vqa": 80.0,
+ "electronic_object_recognition": 100.0,
+ "eqn2latex": 90.0,
+ "exist_hallucination": 90.0,
+ "facail_expression_change_recognition": 95.0,
+ "face_detection": 90.0,
+ "face_mask_anomaly_dectection": 70.0,
+ "face_retrieval": 100.0,
+ "facial_expression_recognition": 75.0,
+ "fashion_recognition": 75.0,
+ "film_and_television_recognition": 95.0,
+ "font_recognition": 50.0,
+ "food_recognition": 100.0,
+ "furniture_keypoint_detection": 55.0,
+ "gaze_estimation": 10.0,
+ "general_action_recognition": 95.0,
+ "geometrical_perspective": 50.0,
+ "geometrical_relativity": 30.0,
+ "gesture_recognition": 65.0,
+ "google_apps": 50.0,
+ "gui_general": 45.0,
+ "gui_install": 50.0,
+ "handwritten_mathematical_expression_recognition": 90.0,
+ "handwritten_retrieval": 90.0,
+ "handwritten_text_recognition": 100.0,
+ "health_medicine": 92.9,
+ "helmet_anomaly_detection": 90.0,
+ "human_interaction_understanding": 95.0,
+ "human_keypoint_detection": 70.0,
+ "human_object_interaction_recognition": 75.0,
+ "humanitites_social_science": 54.5,
+ "image2image_retrieval": 75.0,
+ "image_based_action_recognition": 95.0,
+ "image_captioning": 100.0,
+ "image_captioning_paragraph": 95.0,
+ "image_colorization": 60.0,
+ "image_dense_captioning": 68.4,
+ "image_matting": 15.0,
+ "image_quality_assessment": 35.0,
+ "image_season_recognition": 80.0,
+ "industrial_produce_anomaly_detection": 40.0,
+ "instance_captioning": 95.0,
+ "interactive_segmentation": 85.7,
+ "jigsaw_puzzle_solving": 40.0,
+ "landmark_recognition": 100.0,
+ "lesion_grading": 90.0,
+ "logo_and_brand_recognition": 95.0,
+ "lvlm_response_judgement": 45.0,
+ "medical_modality_recognition": 100.0,
+ "meme_image_understanding": 95.0,
+ "meme_vedio_understanding": 80.0,
+ "mevis": 30.0,
+ "micro_expression_recognition": 20.0,
+ "multiple_image_captioning": 95.0,
+ "multiple_instance_captioning": 95.0,
+ "multiple_view_image_understanding": 10.0,
+ "muscial_instrument_recognition": 95.0,
+ "national_flag_recognition": 100.0,
+ "navigation": 90.0,
+ "next_img_prediction": 65.0,
+ "object_detection": 90.0,
+ "one_shot_detection": 85.0,
+ "order_hallucination": 50.0,
+ "other_biological_attributes": 45.0,
+ "painting_recognition": 90.0,
+ "person_reid": 95.0,
+ "pixel_localization": 25.0,
+ "pixel_recognition": 55.0,
+ "plant_recognition": 90.0,
+ "point_tracking": 35.0,
+ "polygon_localization": 40.0,
+ "profession_recognition": 90.0,
+ "ravens_progressive_matrices": 15.0,
+ "reason_seg": 47.4,
+ "referring_detection": 45.0,
+ "relation_hallucination": 80.0,
+ "religious_recognition": 75.0,
+ "remote_sensing_object_detection": 60.0,
+ "rock_recognition": 80.0,
+ "rotated_object_detection": 77.8,
+ "salient_object_detection_rgb": 55.0,
+ "salient_object_detection_rgbd": 50.0,
+ "scene_emotion_recognition": 65.0,
+ "scene_graph_recognition": 85.0,
+ "scene_recognition": 65.0,
+ "scene_text_recognition": 90.0,
+ "science": 58.3,
+ "screenshot2code": 60.0,
+ "sculpture_recognition": 80.0,
+ "shape_recognition": 95.0,
+ "sign_language_recognition": 40.0,
+ "single_object_tracking": 65.0,
+ "sketch2code": 50.0,
+ "sketch2image_retrieval": 95.0,
+ "small_object_detection": 60.0,
+ "social_relation_recognition": 50.0,
+ "som_recognition": 94.7,
+ "sports_recognition": 95.0,
+ "spot_the_diff": 10.0,
+ "spot_the_similarity": 75.0,
+ "table_structure_recognition": 50.0,
+ "tech_engineering": 33.3,
+ "temporal_anticipation": 75.0,
+ "temporal_localization": 52.6,
+ "temporal_ordering": 25.0,
+ "temporal_sequence_understanding": 25.0,
+ "text2image_retrieval": 55.0,
+ "texture_material_recognition": 75.0,
+ "threed_cad_recognition": 70.0,
+ "threed_indoor_recognition": 45.0,
+ "traffic_anomaly_detection": 55.0,
+ "traffic_light_understanding": 100.0,
+ "traffic_participants_understanding": 60.0,
+ "traffic_sign_understanding": 95.0,
+ "transparent_object_detection": 75.0,
+ "vehicle_keypoint_detection": 55.6,
+ "vehicle_recognition": 100.0,
+ "vehicle_retrieval": 85.0,
+ "video_captioning": 95.0,
+ "visual_document_information_extraction": 95.0,
+ "visual_prompt_understanding": 75.0,
+ "waste_recognition": 100.0,
+ "weapon_recognition": 100.0,
+ "weather_recognition": 100.0,
+ "web_shopping": 40.0,
+ "whoops": 80.0,
+ "writing_poetry_from_image": 60.0
+ },
+ "BLINK": {
+ "Overall": 68.0,
+ "Art_Style": 82.9,
+ "Counting": 66.7,
+ "Forensic_Detection": 90.9,
+ "Functional_Correspondence": 43.1,
+ "IQ_Test": 32.0,
+ "Jigsaw": 76.7,
+ "Multi-view_Reasoning": 58.6,
+ "Object_Localization": 69.7,
+ "Relative_Depth": 75.8,
+ "Relative_Reflectance": 32.8,
+ "Semantic_Correspondence": 61.2,
+ "Spatial_Relation": 83.2,
+ "Visual_Correspondence": 92.4,
+ "Visual_Similarity": 83.0
+ },
+ "QBench": {
+ "Overall": 78.9,
+ "type_0_concern_0": 82.4,
+ "type_0_concern_1": 82.3,
+ "type_0_concern_2": 81.2,
+ "type_0_concern_3": 87.1,
+ "type_1_concern_0": 76.7,
+ "type_1_concern_1": 84.8,
+ "type_1_concern_2": 87.0,
+ "type_1_concern_3": 88.9,
+ "type_2_concern_0": 66.5,
+ "type_2_concern_1": 72.4,
+ "type_2_concern_2": 66.7,
+ "type_2_concern_3": 80.0
+ },
+ "ABench": {
+ "Overall": 79.2,
+ "part1 -> bag_of_words -> attribute": 92.7,
+ "part1 -> bag_of_words -> composition -> arrangement": 86.7,
+ "part1 -> bag_of_words -> composition -> occlusion": 60.0,
+ "part1 -> bag_of_words -> composition -> orientation": 76.9,
+ "part1 -> bag_of_words -> composition -> size": 71.4,
+ "part1 -> bag_of_words -> counting": 79.6,
+ "part1 -> bag_of_words -> noun_as_adjective": 81.4,
+ "part1 -> basic_recognition -> major": 92.9,
+ "part1 -> basic_recognition -> minor": 93.2,
+ "part1 -> outside_knowledge -> contradiction overcome": 70.8,
+ "part1 -> outside_knowledge -> specific-terms -> company": 100.0,
+ "part1 -> outside_knowledge -> specific-terms -> creature": 83.3,
+ "part1 -> outside_knowledge -> specific-terms -> daily": 94.1,
+ "part1 -> outside_knowledge -> specific-terms -> food": 95.5,
+ "part1 -> outside_knowledge -> specific-terms -> geography": 81.0,
+ "part1 -> outside_knowledge -> specific-terms -> material": 95.2,
+ "part1 -> outside_knowledge -> specific-terms -> science": 100.0,
+ "part1 -> outside_knowledge -> specific-terms -> sports": 68.2,
+ "part1 -> outside_knowledge -> specific-terms -> style -> abstract": 100.0,
+ "part1 -> outside_knowledge -> specific-terms -> style -> art": 100.0,
+ "part1 -> outside_knowledge -> specific-terms -> style -> art_deco": 100.0,
+ "part1 -> outside_knowledge -> specific-terms -> style -> cubism": 100.0,
+ "part1 -> outside_knowledge -> specific-terms -> style -> dadaism": 100.0,
+ "part1 -> outside_knowledge -> specific-terms -> style -> deco": 100.0,
+ "part1 -> outside_knowledge -> specific-terms -> style -> expressionism": 100.0,
+ "part1 -> outside_knowledge -> specific-terms -> style -> fauvism": 100.0,
+ "part1 -> outside_knowledge -> specific-terms -> style -> futurism": 66.7,
+ "part1 -> outside_knowledge -> specific-terms -> style -> minimalism": 100.0,
+ "part1 -> outside_knowledge -> specific-terms -> style -> pop": 100.0,
+ "part1 -> outside_knowledge -> specific-terms -> style -> psychedelic": 100.0,
+ "part1 -> outside_knowledge -> specific-terms -> style -> steampunk": 100.0,
+ "part1 -> outside_knowledge -> specific-terms -> style -> surrealism": 100.0,
+ "part1 -> outside_knowledge -> specific-terms -> style -> victorian": 0.0,
+ "part1 -> outside_knowledge -> specific-terms -> vehicle": 94.7,
+ "part1 -> outside_knowledge -> specific-terms -> weather": 92.3,
+ "part2 -> aesthetic": 62.6,
+ "part2 -> generative": 72.4,
+ "part2 -> technical": 74.9
+ },
+ "MTVQA": {
+ "Overall": 31.2,
+ "AR": 21.3,
+ "DE": 35.1,
+ "FR": 42.2,
+ "IT": 37.2,
+ "JA": 19.9,
+ "KR": 35.1,
+ "RU": 15.9,
+ "TH": 26.0,
+ "VI": 39.6
+ }
+ }
+ }
README.md CHANGED
@@ -1,14 +1,30 @@
  ---
- title: Medical Llm Leaderboard
- emoji: 🌖
- colorFrom: purple
- colorTo: gray
+ title: Shopping MMLU Leaderboard
+ emoji: 🌎
+ colorFrom: blue
+ colorTo: green
  sdk: gradio
- sdk_version: 5.5.0
+ sdk_version: 4.44.1
  app_file: app.py
- pinned: false
+ pinned: true
  license: apache-2.0
- short_description: A Benchmark of Large Language Models in the Clinic
+ tags:
+ - leaderboard
+ short_description: 'Massive Multi-Task LLM Benchmark for Online Shopping'
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ In this leaderboard, we display evaluation results obtained with Shopping MMLU. The space provides an overall leaderboard covering 4 main online shopping skills:
+ - Shopping Concept Understanding
+ - Shopping Knowledge Reasoning
+ - User Behavior Alignment
+ - Multi-lingual Abilities
+
+ GitHub: https://github.com/KL4805/ShoppingMMLU
+ Report: https://arxiv.org/abs/2410.20745
+
+ Please consider citing the report if this resource is useful to your research:
+
+ ```BibTex
+ @article{jin2024shopping,
+   title={Shopping MMLU: A Massive Multi-Task Online Shopping Benchmark for Large Language Models},
+   author={Jin, Yilun and Li, Zheng and Zhang, Chenwei and Cao, Tianyu and Gao, Yifan and Jayarao, Pratik and Li, Mao and Liu, Xin and Sarkhel, Ritesh and Tang, Xianfeng and others},
+   journal={arXiv preprint arXiv:2410.20745},
+   year={2024}
+ }
+ ```
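Editor's note: for a rough sense of how the result files in this commit are consumed, here is a minimal sketch (not part of the commit) that ranks models by their mean score over the four skills. It assumes the `ShoppingMMLU_overall.json` schema added below, where each model entry holds a `META` block plus one block per skill with an `Overall` score.

```python
# Minimal sketch: rank models by mean score across the four skills,
# assuming the ShoppingMMLU_overall.json schema added in this commit.
import json

SKILLS = ['Shopping Concept Understanding', 'Shopping Knowledge Reasoning',
          'User Behavior Alignment', 'Multi-lingual Abilities']

with open('ShoppingMMLU_overall.json') as f:
    results = json.load(f)['results']

avg = {model: sum(entry[s]['Overall'] for s in SKILLS) / len(SKILLS)
       for model, entry in results.items()}
for model, score in sorted(avg.items(), key=lambda kv: -kv[1]):
    print(f'{model:25s} {score:6.2f}')
```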
ShoppingMMLU.json ADDED
@@ -0,0 +1,64 @@
+ {
+ "time": "241031154353",
+ "results": {
+ "GPT-4o (0513, detail-high)": {
+ "META": {
+ "Method": [
+ "GPT-4o (0513, detail-high)",
+ "https://openai.com/index/hello-gpt-4o/"
+ ],
+ "Parameters": "",
+ "Language Model": "",
+ "Vision Model": "",
+ "Org": "OpenAI",
+ "Time": "2024/05/31",
+ "Verified": "Yes",
+ "OpenSource": "No",
+ "key": 270,
+ "dir_name": "GPT4o_HIGH"
+ },
+ "Shopping Concept Understanding": {
+ "Rec": 67.8,
+ "Ocr": 76.8,
+ "Know": 58.3,
+ "Gen": 56.9,
+ "Spat": 74.3,
+ "Math": 76.2,
+ "Overall": 69.1,
+ "Overall (official)": "N/A"
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 61.3,
+ "SCI": 64.8,
+ "TQA": 70.3,
+ "NUM": 44.4,
+ "ARI": 58.4,
+ "VQA": 47.5,
+ "GEO": 61.5,
+ "ALG": 62.3,
+ "GPS": 60.1,
+ "MWP": 69.9,
+ "LOG": 43.2,
+ "FQA": 60.2,
+ "STA": 68.4
+ },
+ "User Behavior Alignment": {
+ "Text Recognition": 199,
+ "Scene Text-centric VQA": 181,
+ "Doc-oriented VQA": 168,
+ "Key Information Extraction": 170,
+ "Handwritten Mathematical Expression Recognition": 18,
+ "Overall": 736
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 63.9,
+ "coarse perception": 73.6,
+ "fine-grained perception": 54.8,
+ "instance reasoning": 66.4,
+ "logical reasoning": 72.0,
+ "math": 66.4,
+ "science & technology": 50.0
+ }
+ }
+ }
+ }
@@ -0,0 +1,653 @@
+ {
+ "time": "241031154353",
+ "results": {
+ "Claude3-Sonnet": {
+ "META": {
+ "Method": [
+ "Claude3-Sonnet",
+ "https://aws.amazon.com/bedrock/claude/"
+ ],
+ "Parameters": "",
+ "Org": "Anthropic",
+ "OpenSource": "No",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 80.75
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 71.63
+ },
+ "User Behavior Alignment": {
+ "Overall": 70.17
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 67.76
+ }
+ },
+ "Claude2": {
+ "META": {
+ "Method": [
+ "Claude2",
+ "https://aws.amazon.com/bedrock/claude/"
+ ],
+ "Parameters": "",
+ "Org": "Anthropic",
+ "OpenSource": "No",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 75.46
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 65.5
+ },
+ "User Behavior Alignment": {
+ "Overall": 63.53
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 65.24
+ }
+ },
+ "ChatGPT": {
+ "META": {
+ "Method": [
+ "ChatGPT",
+ "https://platform.openai.com/docs/models#gpt-3-5-turbo"
+ ],
+ "Parameters": "",
+ "Org": "OpenAI",
+ "OpenSource": "No",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 75.63
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 64.97
+ },
+ "User Behavior Alignment": {
+ "Overall": 59.79
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 60.81
+ }
+ },
+ "LLaMA3-70B-Instruct": {
+ "META": {
+ "Method": [
+ "LLaMA3-70B-Instruct",
+ "https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct"
+ ],
+ "Parameters": "70B",
+ "Org": "Meta",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 75.24
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 69.29
+ },
+ "User Behavior Alignment": {
+ "Overall": 67.67
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 62.0
+ }
+ },
+ "QWen1.5-72B": {
+ "META": {
+ "Method": [
+ "QWen1.5-72B",
+ "https://huggingface.co/Qwen/Qwen1.5-72B"
+ ],
+ "Parameters": "72B",
+ "Org": "Alibaba",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 71.67
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 68.92
+ },
+ "User Behavior Alignment": {
+ "Overall": 64.12
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 64.84
+ }
+ },
+ "LLaMA3-70B": {
+ "META": {
+ "Method": [
+ "LLaMA3-70B",
+ "https://huggingface.co/meta-llama/Meta-Llama-3-70B"
+ ],
+ "Parameters": "70B",
+ "Org": "Meta",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 69.59
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 63.56
+ },
+ "User Behavior Alignment": {
+ "Overall": 55.77
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 58.95
+ }
+ },
+ "LLaMA2-70B-Chat": {
+ "META": {
+ "Method": [
+ "LLaMA2-70B-Chat",
+ "https://huggingface.co/meta-llama/Llama-2-70b-chat-hf"
+ ],
+ "Parameters": "70B",
+ "Org": "Meta",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 61.84
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 40.73
+ },
+ "User Behavior Alignment": {
+ "Overall": 44.2
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 47.04
+ }
+ },
+ "LLaMA2-70B": {
+ "META": {
+ "Method": [
+ "LLaMA2-70B",
+ "https://huggingface.co/meta-llama/Llama-2-70b-hf"
+ ],
+ "Parameters": "70B",
+ "Org": "Meta",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 61.05
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 55.87
+ },
+ "User Behavior Alignment": {
+ "Overall": 43.24
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 47.85
+ }
+ },
+ "Mixtral-8x7B": {
+ "META": {
+ "Method": [
+ "Mixtral-8x7B",
+ "https://huggingface.co/mistralai/Mixtral-8x7B-v0.1"
+ ],
+ "Parameters": "46.7B",
+ "Org": "MistralAI",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 59.43
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 54.32
+ },
+ "User Behavior Alignment": {
+ "Overall": 55.31
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 44.69
+ }
+ },
+ "QWen1.5-14B": {
+ "META": {
+ "Method": [
+ "QWen1.5-14B",
+ "https://huggingface.co/Qwen/Qwen1.5-14B"
+ ],
+ "Parameters": "14B",
+ "Org": "Alibaba",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 67.22
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 60.92
+ },
+ "User Behavior Alignment": {
+ "Overall": 54.92
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 55.21
+ }
+ },
+ "eCeLLM-L": {
+ "META": {
+ "Method": [
+ "eCeLLM-L",
+ "https://huggingface.co/NingLab/eCeLLM-L"
+ ],
+ "Parameters": "13B",
+ "Org": "OSU NingLab",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 61.54
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 54.84
+ },
+ "User Behavior Alignment": {
+ "Overall": 54.55
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 59.64
+ }
+ },
+ "Vicuna-13B-v1.5": {
+ "META": {
+ "Method": [
+ "Vicuna-13B-v1.5",
+ "https://huggingface.co/lmsys/vicuna-13b-v1.5"
+ ],
+ "Parameters": "13B",
+ "Org": "LMSys",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 59.64
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 52.63
+ },
+ "User Behavior Alignment": {
+ "Overall": 49.81
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 49.64
+ }
+ },
+ "LLaMA2-13B-Chat": {
+ "META": {
+ "Method": [
+ "LLaMA2-13B-Chat",
+ "https://huggingface.co/meta-llama/Llama-2-13b-chat-hf"
+ ],
+ "Parameters": "13B",
+ "Org": "Meta",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 51.79
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 45.01
+ },
+ "User Behavior Alignment": {
+ "Overall": 39.95
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 42.99
+ }
+ },
+ "LLaMA2-13B": {
+ "META": {
+ "Method": [
+ "LLaMA2-13B",
+ "https://huggingface.co/meta-llama/Llama-2-13b-hf"
+ ],
+ "Parameters": "13B",
+ "Org": "Meta",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 45.86
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 39.47
+ },
+ "User Behavior Alignment": {
+ "Overall": 39.43
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 44.23
+ }
+ },
+ "LLaMA3-8B-Instruct": {
+ "META": {
+ "Method": [
+ "LLaMA3-8B-Instruct",
+ "https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"
+ ],
+ "Parameters": "8B",
+ "Org": "Meta",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 65.26
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 56.84
+ },
+ "User Behavior Alignment": {
+ "Overall": 54.88
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 55.37
+ }
+ },
+ "LLaMA3-8B": {
+ "META": {
+ "Method": [
+ "LLaMA3-8B",
+ "https://huggingface.co/meta-llama/Meta-Llama-3-8B"
+ ],
+ "Parameters": "8B",
+ "Org": "Meta",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 58.02
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 49.74
+ },
+ "User Behavior Alignment": {
+ "Overall": 44.16
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 51.03
+ }
+ },
+ "QWen1.5-7B": {
+ "META": {
+ "Method": [
+ "QWen1.5-7B",
+ "https://huggingface.co/Qwen/Qwen1.5-7B"
+ ],
+ "Parameters": "7B",
+ "Org": "Alibaba",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 58.89
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 52.34
+ },
+ "User Behavior Alignment": {
+ "Overall": 49.81
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 50.14
+ }
+ },
+ "eCeLLM-M": {
+ "META": {
+ "Method": [
+ "eCeLLM-M",
+ "https://huggingface.co/NingLab/eCeLLM-M"
+ ],
+ "Parameters": "7B",
+ "Org": "OSU NingLab",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 63.29
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 48.94
+ },
+ "User Behavior Alignment": {
+ "Overall": 53.78
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 56.08
+ }
+ },
+ "Zephyr-Beta": {
+ "META": {
+ "Method": [
+ "Zephyr-Beta",
+ "https://huggingface.co/HuggingFaceH4/zephyr-7b-beta"
+ ],
+ "Parameters": "7B",
+ "Org": "HuggingFace H4",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 61.65
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 52.57
+ },
+ "User Behavior Alignment": {
+ "Overall": 44.73
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 45.35
+ }
+ },
+ "Mistral-7B-Instruct": {
+ "META": {
+ "Method": [
+ "Mistral-7B-Instruct",
+ "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2"
+ ],
+ "Parameters": "7B",
+ "Org": "MistralAI",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 62.03
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 46.36
+ },
+ "User Behavior Alignment": {
+ "Overall": 42.21
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 43.32
+ }
+ },
+ "Mistral-7B": {
+ "META": {
+ "Method": [
+ "Mistral-7B",
+ "https://huggingface.co/mistralai/Mistral-7B-v0.1"
+ ],
+ "Parameters": "7B",
+ "Org": "MistralAI",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 55.82
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 46.69
+ },
+ "User Behavior Alignment": {
+ "Overall": 46.27
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 41.47
+ }
+ },
+ "Vicuna-7B-v1.5": {
+ "META": {
+ "Method": [
+ "Vicuna-7B-v1.5",
+ "https://huggingface.co/lmsys/vicuna-7b-v1.5"
+ ],
+ "Parameters": "7B",
+ "Org": "LMSys",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 53.46
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 45.06
+ },
+ "User Behavior Alignment": {
+ "Overall": 41.11
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 43.82
+ }
+ },
+ "LLaMA2-7B-Chat": {
+ "META": {
+ "Method": [
+ "LLaMA2-7B-Chat",
+ "https://huggingface.co/meta-llama/Llama-2-7b-chat-hf"
+ ],
+ "Parameters": "7B",
+ "Org": "Meta",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 51.67
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 43.48
+ },
+ "User Behavior Alignment": {
+ "Overall": 41.42
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 40.43
+ }
+ },
+ "LLaMA2-7B": {
+ "META": {
+ "Method": [
+ "LLaMA2-7B",
+ "https://huggingface.co/meta-llama/Llama-2-7b-hf"
+ ],
+ "Parameters": "7B",
+ "Org": "Meta",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 38.22
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 32.81
+ },
+ "User Behavior Alignment": {
+ "Overall": 32.56
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 27.71
+ }
+ },
+ "QWen1.5-4B": {
+ "META": {
+ "Method": [
+ "QWen1.5-4B",
+ "https://huggingface.co/Qwen/Qwen1.5-4B"
+ ],
+ "Parameters": "4B",
+ "Org": "Alibaba",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 57.21
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 52.56
+ },
+ "User Behavior Alignment": {
+ "Overall": 42.74
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 49.78
+ }
+ },
+ "Phi-2": {
+ "META": {
+ "Method": [
+ "Phi-2",
+ "https://huggingface.co/microsoft/phi-2"
+ ],
+ "Parameters": "2.8B",
+ "Org": "Microsoft",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 49.34
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 42.83
+ },
+ "User Behavior Alignment": {
+ "Overall": 36.38
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 32.91
+ }
+ },
+ "eCeLLM-S": {
+ "META": {
+ "Method": [
+ "eCeLLM-S",
+ "https://huggingface.co/NingLab/eCeLLM-S"
+ ],
+ "Parameters": "2.8B",
+ "Org": "OSU NingLab",
+ "OpenSource": "Yes",
+ "Verified": "Yes"
+ },
+ "Shopping Concept Understanding": {
+ "Overall": 49.4
+ },
+ "Shopping Knowledge Reasoning": {
+ "Overall": 39.06
+ },
+ "User Behavior Alignment": {
+ "Overall": 36.33
+ },
+ "Multi-lingual Abilities": {
+ "Overall": 32.79
+ }
+ }
+ }
+ }
app.py ADDED
@@ -0,0 +1,170 @@
+ from types import SimpleNamespace
+
+ import gradio as gr
+
+ from gen_table import *
+ from meta_data import *
+
+ with gr.Blocks() as demo:
+     struct = load_results_local()
+     timestamp = struct['time']
+     EVAL_TIME = format_timestamp(timestamp)
+     results = struct['results']
+     N_MODEL = len(results)
+     N_DATA = len(results['Claude3-Sonnet']) - 1  # each entry holds META plus one block per skill
+     DATASETS = list(results['Claude3-Sonnet'])
+     DATASETS.remove('META')
+     print(DATASETS)
+
+     gr.Markdown(LEADERBOARD_INTRODUCTION.format(N_MODEL, N_DATA, EVAL_TIME))
+     structs = [SimpleNamespace() for _ in range(N_DATA)]  # one attribute bag of UI state per skill tab
+
+     with gr.Tabs(elem_classes='tab-buttons') as tabs:
+         with gr.TabItem('🏅 Shopping MMLU Leaderboard', elem_id='main', id=0):
+             gr.Markdown(LEADERBOARD_MD['MAIN'])
+             _, check_box = BUILD_L1_DF(results, MAIN_FIELDS)
+             table = generate_table(results, DEFAULT_BENCH)
+             table['Rank'] = list(range(1, len(table) + 1))
+
+             type_map = check_box['type_map']
+             type_map['Rank'] = 'number'
+
+             checkbox_group = gr.CheckboxGroup(
+                 choices=check_box['all'],
+                 value=check_box['required'],
+                 label='Evaluation Dimension',
+                 interactive=True,
+             )
+
+             headers = ['Rank'] + check_box['essential'] + checkbox_group.value
+             with gr.Row():
+                 model_size = gr.CheckboxGroup(
+                     choices=MODEL_SIZE,
+                     value=MODEL_SIZE,
+                     label='Model Size',
+                     interactive=True
+                 )
+                 model_type = gr.CheckboxGroup(
+                     choices=MODEL_TYPE,
+                     value=MODEL_TYPE,
+                     label='Model Type',
+                     interactive=True
+                 )
+             print(headers)
+             print(check_box['essential'])
+             data_component = gr.components.DataFrame(
+                 value=table[headers],
+                 type='pandas',
+                 datatype=[type_map[x] for x in headers],
+                 interactive=False,
+                 visible=True)
+
+             def filter_df(fields, model_size, model_type):
+                 # rebuild the table with the selected score columns, then filter by size and type
+                 filter_list = ['Avg Score', 'Avg Rank', 'OpenSource', 'Verified']
+                 headers = ['Rank'] + check_box['essential'] + fields
+
+                 new_fields = [field for field in fields if field not in filter_list]
+                 df = generate_table(results, new_fields)
+
+                 df['flag'] = [model_size_flag(x, model_size) for x in df['Param (B)']]
+                 df = df[df['flag']]
+                 df.pop('flag')
+                 if len(df):
+                     df['flag'] = [model_type_flag(df.iloc[i], model_type) for i in range(len(df))]
+                     df = df[df['flag']]
+                     df.pop('flag')
+                 df['Rank'] = list(range(1, len(df) + 1))
+
+                 comp = gr.components.DataFrame(
+                     value=df[headers],
+                     type='pandas',
+                     datatype=[type_map[x] for x in headers],
+                     interactive=False,
+                     visible=True)
+                 return comp
+
+             for cbox in [checkbox_group, model_size, model_type]:
+                 cbox.change(fn=filter_df, inputs=[checkbox_group, model_size, model_type], outputs=data_component)
+
+         with gr.TabItem('🔍 About', elem_id='about', id=1):
+             gr.Markdown(urlopen(SHOPPINGMMLU_README).read().decode())
+
+         for i, dataset in enumerate(DATASETS):
+             with gr.TabItem(f'📊 {dataset} Leaderboard', elem_id=dataset, id=i + 2):
+                 if dataset in LEADERBOARD_MD:
+                     gr.Markdown(LEADERBOARD_MD[dataset])
+
+                 s = structs[i]
+                 s.table, s.check_box = BUILD_L2_DF(results, dataset)
+                 s.type_map = s.check_box['type_map']
+                 s.type_map['Rank'] = 'number'
+
+                 s.checkbox_group = gr.CheckboxGroup(
+                     choices=s.check_box['all'],
+                     value=s.check_box['required'],
+                     label=f'{dataset} CheckBoxes',
+                     interactive=True,
+                 )
+                 s.headers = ['Rank'] + s.check_box['essential'] + s.checkbox_group.value
+                 print(s.check_box['essential'])
+                 print(s.checkbox_group.value)
+                 s.table['Rank'] = list(range(1, len(s.table) + 1))
+                 print(s.headers)
+                 with gr.Row():
+                     s.model_size = gr.CheckboxGroup(
+                         choices=MODEL_SIZE,
+                         value=MODEL_SIZE,
+                         label='Model Size',
+                         interactive=True
+                     )
+                     s.model_type = gr.CheckboxGroup(
+                         choices=MODEL_TYPE,
+                         value=MODEL_TYPE,
+                         label='Model Type',
+                         interactive=True
+                     )
+
+                 s.data_component = gr.components.DataFrame(
+                     value=s.table[s.headers],
+                     type='pandas',
+                     datatype=[s.type_map[x] for x in s.headers],
+                     interactive=False,
+                     visible=True)
+                 s.dataset = gr.Textbox(value=dataset, label=dataset, visible=False)
+
+                 def filter_df_l2(dataset_name, fields, model_size, model_type):
+                     # the hidden Textbox tells the shared callback which skill tab fired it
+                     s = structs[DATASETS.index(dataset_name)]
+                     headers = ['Rank'] + s.check_box['essential'] + fields
+                     df = cp.deepcopy(s.table)
+                     df['flag'] = [model_size_flag(x, model_size) for x in df['Param (B)']]
+                     df = df[df['flag']]
+                     df.pop('flag')
+                     if len(df):
+                         df['flag'] = [model_type_flag(df.iloc[i], model_type) for i in range(len(df))]
+                         df = df[df['flag']]
+                         df.pop('flag')
+                     df['Rank'] = list(range(1, len(df) + 1))
+
+                     comp = gr.components.DataFrame(
+                         value=df[headers],
+                         type='pandas',
+                         datatype=[s.type_map[x] for x in headers],
+                         interactive=False,
+                         visible=True)
+                     return comp
+
+                 for cbox in [s.checkbox_group, s.model_size, s.model_type]:
+                     cbox.change(
+                         fn=filter_df_l2,
+                         inputs=[s.dataset, s.checkbox_group, s.model_size, s.model_type],
+                         outputs=s.data_component)
+
+     with gr.Row():
+         with gr.Accordion('Citation', open=False):
+             citation_button = gr.Textbox(
+                 value=CITATION_BUTTON_TEXT,
+                 label=CITATION_BUTTON_LABEL,
+                 elem_id='citation-button')
+
+ if __name__ == '__main__':
+     demo.launch(server_name='0.0.0.0')
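Editor's note: the size/type checkboxes above funnel into `model_size_flag` and `model_type_flag` (defined in `gen_table.py` below). A standalone sketch of that filtering step on a toy frame, for reference; the toy rows and column values are illustrative, but the column names match those built by `gen_table.py`:

```python
# Standalone sketch of the checkbox filtering used by the callbacks above
# (assumes the 'Param (B)', 'OpenSource' and 'Verified' columns from gen_table.py).
import pandas as pd
from gen_table import model_size_flag, model_type_flag

df = pd.DataFrame({
    'Method': ['LLaMA3-70B-Instruct', 'ChatGPT'],
    'Param (B)': [70.0, None],      # None falls into the 'Unknown' size bucket
    'OpenSource': ['Yes', 'No'],
    'Verified': ['Yes', 'Yes'],     # OpenSource=No + Verified=Yes counts as 'API'
})
keep_size = df['Param (B)'].map(lambda sz: model_size_flag(sz, ['>40B', 'Unknown']))
keep_type = df.apply(lambda row: model_type_flag(row, ['OpenSource', 'API']), axis=1)
print(df[keep_size & keep_type])   # both rows survive these filter settings
```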
gen_table.py ADDED
@@ -0,0 +1,204 @@
+ import copy as cp
+ import json
+ from collections import defaultdict
+ from urllib.request import urlopen
+
+ import gradio as gr
+ import numpy as np
+ import pandas as pd
+
+ from meta_data import DEFAULT_BENCH, META_FIELDS, URL, RESULTS
+
+
+ def listinstr(lst, s):
+     # True if any item of lst occurs as a substring of s
+     assert isinstance(lst, list)
+     for item in lst:
+         if item in s:
+             return True
+     return False
+
+
+ def load_results():
+     data = json.loads(urlopen(URL).read())
+     return data
+
+
+ def load_results_local():
+     with open(RESULTS, 'r') as infile:
+         data = json.load(infile)
+     return data
+
+
+ def nth_large(val, vals):
+     # 1-based rank of val among vals (higher values rank first)
+     return sum([1 for v in vals if v > val]) + 1
+
+
+ def format_timestamp(timestamp):
+     # 'YYMMDDhhmmss' -> 'YY.MM.DD hh:mm:ss'
+     date = timestamp[:2] + '.' + timestamp[2:4] + '.' + timestamp[4:6]
+     time = timestamp[6:8] + ':' + timestamp[8:10] + ':' + timestamp[10:12]
+     return date + ' ' + time
+
+
+ def model_size_flag(sz, FIELDS):
+     if pd.isna(sz) and 'Unknown' in FIELDS:
+         return True
+     if pd.isna(sz):
+         return False
+     if '<4B' in FIELDS and sz < 4:
+         return True
+     if '4B-10B' in FIELDS and sz >= 4 and sz < 10:
+         return True
+     if '10B-20B' in FIELDS and sz >= 10 and sz < 20:
+         return True
+     if '20B-40B' in FIELDS and sz >= 20 and sz < 40:
+         return True
+     if '>40B' in FIELDS and sz >= 40:
+         return True
+     return False
+
+
+ def model_type_flag(line, FIELDS):
+     if 'OpenSource' in FIELDS and line['OpenSource'] == 'Yes':
+         return True
+     if 'API' in FIELDS and line['OpenSource'] == 'No' and line['Verified'] == 'Yes':
+         return True
+     if 'Proprietary' in FIELDS and line['OpenSource'] == 'No' and line['Verified'] == 'No':
+         return True
+     return False
+
+
+ def BUILD_L1_DF(results, fields):
+     check_box = {}
+     check_box['essential'] = ['Method', 'Param (B)']
+     # revise here to set the default datasets
+     check_box['required'] = ['Avg Score', 'Avg Rank'] + DEFAULT_BENCH
+     check_box['avg'] = ['Avg Score', 'Avg Rank']
+     check_box['all'] = check_box['avg'] + fields
+     type_map = defaultdict(lambda: 'number')
+     type_map['Method'] = 'html'
+     type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
+     check_box['type_map'] = type_map
+
+     df = generate_table(results, fields)
+     return df, check_box
+
+
+ def BUILD_L2_DF(results, dataset):
+     res = defaultdict(list)
+     sub = [v for v in results.values() if dataset in v]
+     assert len(sub)
+     fields = list(sub[0][dataset].keys())
+
+     non_overall_fields = [x for x in fields if 'Overall' not in x]
+     overall_fields = [x for x in fields if 'Overall' in x]
+     # special cases carried over from the OpenVLM leaderboard; harmless for Shopping MMLU data
+     if dataset == 'MME':
+         non_overall_fields = [x for x in non_overall_fields if not listinstr(['Perception', 'Cognition'], x)]
+         overall_fields = overall_fields + ['Perception', 'Cognition']
+     if dataset == 'OCRBench':
+         non_overall_fields = [x for x in non_overall_fields if not listinstr(['Final Score'], x)]
+         overall_fields = ['Final Score']
+     print(overall_fields)
+     print(non_overall_fields)
+
+     for m in results:
+         item = results[m]
+         if dataset not in item:
+             continue
+         meta = item['META']
+         for k in META_FIELDS:
+             if k == 'Param (B)':
+                 param = meta['Parameters']
+                 res[k].append(float(param.replace('B', '')) if param != '' else None)
+             elif k == 'Method':
+                 name, url = meta['Method']
+                 res[k].append(f'<a href="{url}">{name}</a>')
+             else:
+                 res[k].append(meta[k])
+
+         for d in non_overall_fields:
+             res[d].append(item[dataset][d])
+         for d in overall_fields:
+             res[d].append(item[dataset][d])
+
+     df = pd.DataFrame(res)
+     print(df)
+     all_fields = overall_fields + non_overall_fields
+     # Use the first 5 non-overall fields as required fields
+     # required_fields = overall_fields if len(overall_fields) else non_overall_fields[:5]
+     required_fields = all_fields
+
+     if dataset == 'OCRBench':
+         df = df.sort_values('Final Score')
+     elif dataset == 'COCO_VAL':
+         df = df.sort_values('CIDEr')
+     else:
+         df = df.sort_values('Overall')
+     df = df.iloc[::-1]
+
+     check_box = {}
+     check_box['essential'] = ['Method', 'Param (B)']
+     check_box['required'] = required_fields
+     check_box['all'] = all_fields
+     type_map = defaultdict(lambda: 'number')
+     type_map['Method'] = 'html'
+     type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
+     check_box['type_map'] = type_map
+     return df, check_box
+
+
+ def generate_table(results, fields):
+
+     def get_mmbench_v11(item):
+         assert 'MMBench_TEST_CN_V11' in item and 'MMBench_TEST_EN_V11' in item
+         val = (item['MMBench_TEST_CN_V11']['Overall'] + item['MMBench_TEST_EN_V11']['Overall']) / 2
+         val = float(f'{val:.1f}')
+         return val
+
+     res = defaultdict(list)
+     for i, m in enumerate(results):
+         item = results[m]
+         meta = item['META']
+         for k in META_FIELDS:
+             if k == 'Param (B)':
+                 param = meta['Parameters']
+                 res[k].append(float(param.replace('B', '')) if param != '' else None)
+             elif k == 'Method':
+                 name, url = meta['Method']
+                 res[k].append(f'<a href="{url}">{name}</a>')
+                 res['name'].append(name)
+             else:
+                 res[k].append(meta[k])
+         scores, ranks = [], []
+         for d in fields:
+             key_name = 'Overall' if d != 'OCRBench' else 'Final Score'
+             # MMBench_V11 / MME / OCRBench branches are OpenVLM leftovers, unused for Shopping MMLU fields
+             if d == 'MMBench_V11':
+                 val = get_mmbench_v11(item)
+                 res[d].append(val)
+                 scores.append(val)
+                 ranks.append(nth_large(val, [get_mmbench_v11(x) for x in results.values()]))
+             elif d in item:
+                 res[d].append(item[d][key_name])
+                 if d == 'MME':
+                     scores.append(item[d][key_name] / 28)
+                 elif d == 'OCRBench':
+                     scores.append(item[d][key_name] / 10)
+                 else:
+                     scores.append(item[d][key_name])
+                 ranks.append(nth_large(item[d][key_name], [x[d][key_name] for x in results.values() if d in x]))
+             else:
+                 res[d].append(None)
+                 scores.append(None)
+                 ranks.append(None)
+
+         res['Avg Score'].append(round(np.mean(scores), 1) if None not in scores else None)
+         res['Avg Rank'].append(round(np.mean(ranks), 2) if None not in ranks else None)
+
+     df = pd.DataFrame(res)
+     valid, missing = df[~pd.isna(df['Avg Score'])], df[pd.isna(df['Avg Score'])]
+     valid = valid.sort_values('Avg Score')
+     valid = valid.iloc[::-1]
+     if len(fields):
+         missing = missing.sort_values('MMBench_V11' if 'MMBench_V11' in fields else fields[0])
+         missing = missing.iloc[::-1]
+     df = pd.concat([valid, missing])
+     return df
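Editor's note: for reference, a minimal offline usage sketch of the table builders above (not part of the commit; run from the repo root, next to `ShoppingMMLU_overall.json`):

```python
# Offline usage sketch for gen_table.py: build the main leaderboard table
# and print the headline columns for the top-ranked models.
from gen_table import load_results_local, generate_table
from meta_data import DEFAULT_BENCH

results = load_results_local()['results']
table = generate_table(results, DEFAULT_BENCH)  # sorted by Avg Score, descending
print(table[['Method', 'Avg Score', 'Avg Rank'] + DEFAULT_BENCH].head(10))
```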
meta_data.py ADDED
@@ -0,0 +1,169 @@
+ # CONSTANTS-URL
+ URL = "http://opencompass.openxlab.space/assets/OpenVLM.json"
+ RESULTS = 'ShoppingMMLU_overall.json'
+ SHOPPINGMMLU_README = 'https://raw.githubusercontent.com/KL4805/ShoppingMMLU/refs/heads/main/README.md'
+ # CONSTANTS-CITATION
+ CITATION_BUTTON_TEXT = r"""@article{jin2024shopping,
+   title={Shopping MMLU: A Massive Multi-Task Online Shopping Benchmark for Large Language Models},
+   author={Jin, Yilun and Li, Zheng and Zhang, Chenwei and Cao, Tianyu and Gao, Yifan and Jayarao, Pratik and Li, Mao and Liu, Xin and Sarkhel, Ritesh and Tang, Xianfeng and others},
+   journal={arXiv preprint arXiv:2410.20745},
+   year={2024}
+ }"""
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+ # CONSTANTS-TEXT
+ LEADERBOARD_INTRODUCTION = """# Shopping MMLU Leaderboard
+ ### Welcome to the Shopping MMLU Leaderboard! Here we share the evaluation results of LLMs obtained with the open-source framework:
+ ### [Shopping MMLU: A Massive Multi-Task Online Shopping Benchmark for Large Language Models](https://github.com/KL4805/ShoppingMMLU) 🏆
+ ### Currently, the Shopping MMLU Leaderboard covers {} different LLMs and {} main online shopping skills.
+
+ This leaderboard was last updated: {}.
+
+ The Shopping MMLU Leaderboard only includes open-source LLMs or API models that are publicly available. To add your own model to the leaderboard, please create a PR in [Shopping MMLU](https://github.com/KL4805/ShoppingMMLU) to support your LLM; we will then help with the evaluation and update the leaderboard. For any questions or concerns, please feel free to contact us at [email protected] and [email protected].
+ """
+ # CONSTANTS-FIELDS
+ META_FIELDS = ['Method', 'Param (B)', 'OpenSource', 'Verified']
+ # Fields carried over from the OpenVLM leaderboard, kept for reference:
+ # MAIN_FIELDS = [
+ #     'MMBench_V11', 'MMStar', 'MME',
+ #     'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
+ #     'HallusionBench', 'SEEDBench_IMG', 'MMVet',
+ #     'LLaVABench', 'CCBench', 'RealWorldQA', 'POPE', 'ScienceQA_TEST',
+ #     'SEEDBench2_Plus', 'MMT-Bench_VAL', 'BLINK'
+ # ]
+ MAIN_FIELDS = [
+     'Shopping Concept Understanding', 'Shopping Knowledge Reasoning', 'User Behavior Alignment', 'Multi-lingual Abilities'
+ ]
+ # DEFAULT_BENCH = [
+ #     'MMBench_V11', 'MMStar', 'MMMU_VAL', 'MathVista', 'OCRBench', 'AI2D',
+ #     'HallusionBench', 'MMVet'
+ # ]
+ DEFAULT_BENCH = ['Shopping Concept Understanding', 'Shopping Knowledge Reasoning', 'User Behavior Alignment', 'Multi-lingual Abilities']
+ MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
+ MODEL_TYPE = ['API', 'OpenSource', 'Proprietary']
+
+ # The Markdown blurb shown on each leaderboard tab
+ LEADERBOARD_MD = {}
+
+ LEADERBOARD_MD['MAIN'] = f"""
+ ## Included Shopping Skills:
+
+ - Shopping Concept Understanding: Understanding domain-specific short texts in online shopping (e.g. brands, product models).
+ - Shopping Knowledge Reasoning: Reasoning over commonsense, numeric, and implicit product-product multi-hop knowledge.
+ - User Behavior Alignment: Modeling heterogeneous and implicit user behaviors (e.g. click, query, purchase).
+ - Multi-lingual Abilities: Online shopping across marketplaces around the globe.
+
+ ## Main Evaluation Results
+
+ - Metrics:
+     - Avg Score: The average score on all 4 online shopping skills (normalized to 0-100; higher is better).
+ - Detailed metrics and evaluation results for each skill are provided in the subsequent tabs.
+ """
+
+
+ LEADERBOARD_MD['Shopping Concept Understanding'] = """
+ ## Shopping Concept Understanding Evaluation Results
+
+ Online shopping concepts such as brands and product models are domain-specific and rarely seen in pre-training. Moreover, they often appear in short texts (e.g. queries, attribute-value pairs), so little context is available to help interpret them. Failing to understand these concepts therefore compromises the performance of LLMs on downstream tasks.
+
+ The sub-skills and tasks include:
+ - **Concept Normalization**:
+     - Product Category Synonym
+     - Attribute Value Synonym
+ - **Elaboration**:
+     - Attribute Explanation
+     - Product Category Explanation
+ - **Relational Inference**:
+     - Applicable Attribute to Product Category
+     - Applicable Product Category to Attribute
+     - Inapplicable Attributes
+     - Valid Attribute Value Given Attribute and Product Category
+     - Valid Attribute Given Attribute Value and Product Category
+     - Product Category Classification
+     - Product Category Generation
+ - **Sentiment Analysis**:
+     - Aspect-based Sentiment Classification
+     - Aspect-based Review Retrieval
+     - Aspect-based Review Selection
+     - Aspect-based Reviews Overall Sentiment Classification
+ - **Information Extraction**:
+     - Attribute Value Extraction
+     - Query Named Entity Recognition
+     - Aspect-based Review Keyphrase Selection
+     - Aspect-based Review Keyphrase Extraction
+ - **Summarization**:
+     - Attribute Naming from Description
+     - Product Category Naming from Description
+     - Review Aspect Retrieval
+     - Single Conversation Topic Selection
+     - Multi-Conversation Topic Retrieval
+     - Product Keyphrase Selection
+     - Product Keyphrase Retrieval
+     - Product Title Generation
+ """
+
+
+ LEADERBOARD_MD['Shopping Knowledge Reasoning'] = """
+ ## Shopping Knowledge Reasoning Evaluation Results
+
+ This skill focuses on understanding and applying various kinds of implicit knowledge to reason over products and their attributes. For example, calculations such as the total volume of a product pack require numeric reasoning, and finding compatible products requires multi-hop reasoning among various products over a product knowledge graph.
+
+ The sub-skills and tasks include:
+ - **Numeric Reasoning**:
+     - Unit Conversion
+     - Product Numeric Reasoning
+ - **Commonsense Reasoning**
+ - **Implicit Multi-Hop Reasoning**:
+     - Product Compatibility
+     - Complementary Product Categories
+     - Implicit Attribute Reasoning
+     - Related Brands Selection
+     - Related Brands Retrieval
+ """
+
+ LEADERBOARD_MD['User Behavior Alignment'] = """
+ ## User Behavior Alignment Evaluation Results
+
+ Accurately modeling user behaviors is a crucial skill in online shopping. A large variety of user behaviors exist in online shopping, including queries, clicks, add-to-carts, purchases, etc. Moreover, these behaviors are generally implicit and not expressed in text.
+
+ Consequently, LLMs trained on general text struggle to align with these heterogeneous and implicit user behaviors, since they rarely observe such inputs during pre-training.
+
+ The sub-skills and tasks include:
+ - **Query-Query Relations**:
+     - Query Re-Writing
+     - Query-Query Intention Selection
+     - Intention-Based Related Query Retrieval
+ - **Query-Product Relations**:
+     - Product Category Selection for Query
+     - Query-Product Relation Selection
+     - Query-Product Ranking
+ - **Sessions**:
+     - Session-based Query Recommendation
+     - Session-based Next Query Selection
+     - Session-based Next Product Selection
+ - **Purchases**:
+     - Product Co-Purchase Selection
+     - Product Co-Purchase Retrieval
+ - **Reviews and QA**:
+     - Review Rating Prediction
+     - Aspect-Sentiment-Based Review Generation
+     - Review Helpfulness Selection
+     - Product-Based Question Answering
+ """
+
+ LEADERBOARD_MD['Multi-lingual Abilities'] = """
+ ## Multi-lingual Abilities Evaluation Results
+
+ Multi-lingual models are desirable in online shopping, as they can be deployed in multiple marketplaces without re-training.
+
+ The sub-skills and tasks include:
+ - **Multi-lingual Shopping Concept Understanding**:
+     - Multi-lingual Product Title Generation
+     - Multi-lingual Product Keyphrase Selection
+     - Cross-lingual Product Title Translation
+     - Cross-lingual Product Entity Alignment
+ - **Multi-lingual User Behavior Alignment**:
+     - Multi-lingual Query-product Relation Selection
+     - Multi-lingual Query-product Ranking
+     - Multi-lingual Session-based Product Recommendation
+ """
+
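Editor's note: a small sanity check (illustrative only, not part of the commit) that the local results file contains every skill listed in `MAIN_FIELDS` for every model, and that each `LEADERBOARD_MD` key maps to a known tab:

```python
# Illustrative sanity check tying meta_data.py constants to the results file.
import json
from meta_data import RESULTS, MAIN_FIELDS, LEADERBOARD_MD

with open(RESULTS) as f:
    results = json.load(f)['results']
for model, entry in results.items():
    missing = [field for field in MAIN_FIELDS if field not in entry]
    assert not missing, f'{model} is missing skills: {missing}'
for key in LEADERBOARD_MD:
    assert key == 'MAIN' or key in MAIN_FIELDS, f'unknown tab: {key}'
print(f'OK: {len(results)} models x {len(MAIN_FIELDS)} skills')
```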
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ gradio==4.15.0
+ numpy>=1.23.4
+ pandas>=1.5.3