import copy as cp
import json
from collections import defaultdict
from urllib.request import urlopen

import gradio as gr
import numpy as np
import pandas as pd

from meta_data import DEFAULT_BENCH, META_FIELDS, RESULTS


def load_results_local():
    """Load the evaluation results from the local JSON file at RESULTS."""
    with open(RESULTS, 'r') as infile:
        data = json.load(infile)
    return data


def nth_large(val, vals):
    """Return the 1-based rank of `val` within `vals` (1 = largest)."""
    return sum(1 for v in vals if v > val) + 1


def model_size_flag(sz, FIELDS):
    """Return True if the parameter count `sz` (in billions) matches one of the
    size filters selected in FIELDS ('Unknown', '7B', '13B', or '70B')."""
    if pd.isna(sz) and 'Unknown' in FIELDS:
        return True
    if pd.isna(sz):
        return False
    if '7B' in FIELDS and sz == 7:
        return True
    if '13B' in FIELDS and sz == 13:
        return True
    if '70B' in FIELDS and sz == 70:
        return True
    return False


def model_type_flag(line, FIELDS):
    """Return True if the model described by `line` matches any of the
    model-type filters selected in FIELDS."""
    if 'OpenSource' in FIELDS and line['OpenSource'] == 'Yes':
        return True
    if 'API' in FIELDS and line['OpenSource'] == 'No' and line['Verified'] == 'Yes':
        return True
    # if 'Proprietary' in FIELDS and line['OpenSource'] == 'No' and line['Verified'] == 'No':
    #     return True
    if 'Commercial LLMs' in FIELDS and line['Commercial LLMs'] == 'Yes':
        return True
    if 'General LLMs' in FIELDS and line['General LLMs'] == 'Yes':
        return True
    if 'Medical LLMs' in FIELDS and line['Medical LLMs'] == 'Yes':
        return True
    if 'SOTA' in FIELDS and line['SOTA'] == 'Yes':
        return True
    return False


def BUILD_L1_DF(results, fields):
    """Build the main leaderboard DataFrame together with the check-box
    configuration that controls which columns are displayed."""
    check_box = {}
    check_box['essential'] = ['Method', 'Param (B)']
    # Revise here to change the set of benchmarks shown by default.
    check_box['required'] = ['Avg Score', 'Avg Rank'] + DEFAULT_BENCH
    check_box['avg'] = ['Avg Score', 'Avg Rank']
    check_box['all'] = check_box['avg'] + fields
    type_map = defaultdict(lambda: 'number')
    type_map['Method'] = 'html'
    type_map['Language Model'] = type_map['Vision Model'] = type_map['OpenSource'] = type_map['Verified'] = 'str'
    check_box['type_map'] = type_map

    df = generate_table(results, fields)
    return df, check_box


def generate_table(results, fields):
    """Assemble one leaderboard row per model: metadata, per-benchmark scores,
    the average (0-100) score, and the average rank."""

    def get_mmbench_v11(item):
        # The MMBench v1.1 score is the mean of the EN and CN test splits.
        assert 'MMBench_TEST_CN_V11' in item and 'MMBench_TEST_EN_V11' in item
        val = (item['MMBench_TEST_CN_V11']['Overall'] + item['MMBench_TEST_EN_V11']['Overall']) / 2
        val = float(f'{val:.1f}')
        return val

    res = defaultdict(list)
    for m in results:
        item = results[m]
        meta = item['META']
        # Copy the model metadata columns.
        for k in META_FIELDS:
            if k == 'Param (B)':
                param = meta['Parameters']
                res[k].append(float(param.replace('B', '')) if param != '' else None)
            elif k == 'Method':
                name = meta['Method'][0]
                res[k].append(name)
                res['name'].append(name)
            else:
                res[k].append(meta[k])
        scores, ranks = [], []
        for d in fields:
            key_name = 'Overall' if d != 'OCRBench' else 'Final Score'
            # Every model should have MMBench_V11 results.
            if d == 'MMBench_V11':
                val = get_mmbench_v11(item)
                res[d].append(val)
                scores.append(val)
                ranks.append(nth_large(val, [get_mmbench_v11(x) for x in results.values()]))
            elif d in item:
                res[d].append(item[d][key_name])
                if d == 'MME':
                    # MME reports a total out of 2800; divide by 28 to put it on a 0-100 scale.
                    scores.append(item[d][key_name] / 28)
                elif d == 'OCRBench':
                    # OCRBench reports a total out of 1000; divide by 10 to put it on a 0-100 scale.
                    scores.append(item[d][key_name] / 10)
                else:
                    scores.append(item[d][key_name])
                ranks.append(nth_large(item[d][key_name], [x[d][key_name] for x in results.values() if d in x]))
            else:
                # Benchmark missing for this model: leave the score and rank undefined.
                res[d].append(None)
                scores.append(None)
                ranks.append(None)

        # Averages are defined only when the model has results for every selected benchmark.
        res['Avg Score'].append(round(np.mean(scores), 1) if None not in scores else None)
        res['Avg Rank'].append(round(np.mean(ranks), 2) if None not in ranks else None)

    df = pd.DataFrame(res)
    # Models with a complete set of results come first, sorted by Avg Score (descending);
    # models with missing results follow, sorted by MMBench_V11 (or the first selected field).
    valid, missing = df[~pd.isna(df['Avg Score'])], df[pd.isna(df['Avg Score'])]
    valid = valid.sort_values('Avg Score')
    valid = valid.iloc[::-1]
    if len(fields):
        missing = missing.sort_values('MMBench_V11' if 'MMBench_V11' in fields else fields[0])
        missing = missing.iloc[::-1]
    df = pd.concat([valid, missing])
    return df
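

# --- Optional usage sketch (not part of the original module; an illustrative
# assumption about how the helpers above fit together). It loads the local
# results JSON pointed to by RESULTS, builds the leaderboard over the default
# benchmarks, and prints the first few rows; `check_box` is the column/filter
# configuration presumably consumed by the leaderboard UI.
if __name__ == '__main__':
    results = load_results_local()
    table, check_box = BUILD_L1_DF(results, DEFAULT_BENCH)
    print(table.head())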