fenglinliu commited on
Commit
62e2373
·
verified ·
1 Parent(s): d98297b

Update ShoppingMMLU_overall.json

Browse files
Files changed (1) hide show
  1. ShoppingMMLU_overall.json +737 -104
ShoppingMMLU_overall.json CHANGED
@@ -190,244 +190,877 @@
190
  "Overall": 67.76
191
  }
192
  },
193
- "LLaMA-3-70B": {
194
  "META": {
195
  "Method": [
196
- "LLaMA-3-70B",
197
- "https://huggingface.co/meta-llama/Meta-Llama-3-70B"
198
  ],
199
- "Parameters": "70B",
200
- "Org": "Meta",
201
- "OpenSource": "Yes",
202
  "Verified": "Yes"
203
  },
204
- "Shopping Concept Understanding": {
205
- "Overall": 69.59
206
  },
207
- "Shopping Knowledge Reasoning": {
208
- "Overall": 63.56
209
  },
210
- "User Behavior Alignment": {
211
- "Overall": 55.77
212
  },
213
- "Multi-lingual Abilities": {
214
- "Overall": 58.95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  }
216
  },
217
- "LLaMA-2-70B": {
218
  "META": {
219
  "Method": [
220
- "LLaMA-2-70B",
221
- "https://huggingface.co/meta-llama/Llama-2-70b-chat-hf"
222
  ],
223
- "Parameters": "70B",
224
- "Org": "Meta",
225
  "OpenSource": "Yes",
226
  "Verified": "Yes"
227
  },
228
  "Shopping Concept Understanding": {
229
- "Overall": 61.84
230
  },
231
  "Shopping Knowledge Reasoning": {
232
- "Overall": 40.73
233
  },
234
  "User Behavior Alignment": {
235
- "Overall": 44.2
236
  },
237
  "Multi-lingual Abilities": {
238
- "Overall": 47.04
239
  }
240
  },
241
- "Vicuna-13B-v1.5": {
242
  "META": {
243
  "Method": [
244
- "Vicuna-13B-v1.5",
245
- "https://huggingface.co/lmsys/vicuna-13b-v1.5"
246
  ],
247
- "Parameters": "13B",
248
- "Org": "LMSys",
249
  "OpenSource": "Yes",
250
  "Verified": "Yes"
251
  },
252
  "Shopping Concept Understanding": {
253
- "Overall": 59.64
254
  },
255
  "Shopping Knowledge Reasoning": {
256
- "Overall": 52.63
257
  },
258
  "User Behavior Alignment": {
259
- "Overall": 49.81
260
  },
261
  "Multi-lingual Abilities": {
262
- "Overall": 49.64
263
  }
264
  },
265
- "LLaMA2-13B-Chat": {
266
  "META": {
267
  "Method": [
268
- "LLaMA2-13B-Chat",
269
- "https://huggingface.co/meta-llama/Llama-2-13b-chat-hf"
270
  ],
271
- "Parameters": "13B",
272
- "Org": "Meta",
273
  "OpenSource": "Yes",
274
  "Verified": "Yes"
275
  },
276
  "Shopping Concept Understanding": {
277
- "Overall": 51.79
278
  },
279
  "Shopping Knowledge Reasoning": {
280
- "Overall": 45.01
281
  },
282
  "User Behavior Alignment": {
283
- "Overall": 39.95
284
  },
285
  "Multi-lingual Abilities": {
286
- "Overall": 42.99
287
  }
288
  },
289
- "LLaMA2-13B": {
290
  "META": {
291
  "Method": [
292
- "LLaMA2-13B",
293
- "https://huggingface.co/meta-llama/Llama-2-13b-hf"
294
  ],
295
  "Parameters": "13B",
296
- "Org": "Meta",
297
  "OpenSource": "Yes",
298
  "Verified": "Yes"
299
  },
300
  "Shopping Concept Understanding": {
301
- "Overall": 45.86
302
  },
303
  "Shopping Knowledge Reasoning": {
304
- "Overall": 39.47
305
  },
306
  "User Behavior Alignment": {
307
- "Overall": 39.43
308
  },
309
  "Multi-lingual Abilities": {
310
- "Overall": 44.23
311
  }
312
  },
313
- "Mistral-7B-Instruct": {
314
  "META": {
315
  "Method": [
316
- "Mistral-7B-Instruct",
317
- "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2"
318
  ],
319
- "Parameters": "7B",
320
- "Org": "MistralAI",
321
  "OpenSource": "Yes",
322
  "Verified": "Yes"
323
  },
324
  "Shopping Concept Understanding": {
325
- "Overall": 62.03
326
  },
327
  "Shopping Knowledge Reasoning": {
328
- "Overall": 46.36
329
  },
330
  "User Behavior Alignment": {
331
- "Overall": 42.21
332
  },
333
  "Multi-lingual Abilities": {
334
- "Overall": 43.32
335
  }
336
  },
337
- "Mistral-7B": {
338
  "META": {
339
  "Method": [
340
- "Mistral-7B",
341
- "https://huggingface.co/mistralai/Mistral-7B-v0.1"
342
  ],
343
- "Parameters": "7B",
344
- "Org": "MistralAI",
345
  "OpenSource": "Yes",
346
  "Verified": "Yes"
347
  },
348
  "Shopping Concept Understanding": {
349
- "Overall": 55.82
350
  },
351
  "Shopping Knowledge Reasoning": {
352
- "Overall": 46.69
353
  },
354
  "User Behavior Alignment": {
355
- "Overall": 46.27
356
  },
357
  "Multi-lingual Abilities": {
358
- "Overall": 41.47
359
  }
360
  },
361
- "Vicuna-7B-v1.5": {
362
  "META": {
363
  "Method": [
364
- "Vicuna-7B-v1.5",
365
- "https://huggingface.co/lmsys/vicuna-7b-v1.5"
366
  ],
367
- "Parameters": "7B",
368
- "Org": "LMSys",
369
  "OpenSource": "Yes",
370
  "Verified": "Yes"
371
  },
372
  "Shopping Concept Understanding": {
373
- "Overall": 53.46
374
  },
375
  "Shopping Knowledge Reasoning": {
376
- "Overall": 45.06
377
  },
378
  "User Behavior Alignment": {
379
- "Overall": 41.11
380
  },
381
  "Multi-lingual Abilities": {
382
- "Overall": 43.82
383
  }
384
  },
385
- "LLaMA2-7B-Chat": {
386
  "META": {
387
  "Method": [
388
- "LLaMA2-7B-Chat",
389
- "https://huggingface.co/meta-llama/Llama-2-7b-chat-hf"
390
  ],
391
- "Parameters": "7B",
392
- "Org": "Meta",
393
- "OpenSource": "Yes",
394
  "Verified": "Yes"
395
  },
396
- "Shopping Concept Understanding": {
397
- "Overall": 51.67
398
  },
399
- "Shopping Knowledge Reasoning": {
400
- "Overall": 43.48
401
  },
402
- "User Behavior Alignment": {
403
- "Overall": 41.42
404
  },
405
- "Multi-lingual Abilities": {
406
- "Overall": 40.43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
407
  }
408
  },
409
- "LLaMA2-7B": {
410
  "META": {
411
  "Method": [
412
- "LLaMA2-7B",
413
- "https://huggingface.co/meta-llama/Llama-2-7b-hf"
414
  ],
415
- "Parameters": "7B",
416
- "Org": "Meta",
417
- "OpenSource": "Yes",
418
  "Verified": "Yes"
419
  },
420
- "Shopping Concept Understanding": {
421
- "Overall": 38.22
422
  },
423
- "Shopping Knowledge Reasoning": {
424
- "Overall": 32.81
425
  },
426
- "User Behavior Alignment": {
427
- "Overall": 32.56
428
  },
429
- "Multi-lingual Abilities": {
430
- "Overall": 27.71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
431
  }
432
  },
433
  }
 
190
  "Overall": 67.76
191
  }
192
  },
193
+ "Alpaca": {
194
  "META": {
195
  "Method": [
196
+ "Alpaca",
 
197
  ],
198
+ "Parameters": "7B",
199
+ "Org": "OpenAI",
200
+ "OpenSource": "No",
201
  "Verified": "Yes"
202
  },
203
+ "MedQA": {
204
+ "Overall": 80.75
205
  },
206
+ "MedMCQA": {
207
+ "Overall": 71.63
208
  },
209
+ "MMLU-Medicine": {
210
+ "Overall": 70.17
211
  },
212
+ "PubMedQA": {
213
+ "Overall": 67.76
214
+ },
215
+ "Referral QA": {
216
+ "Overall": 67.76
217
+ },
218
+ "Treat Recom.": {
219
+ "Overall": 67.76
220
+ },
221
+ "MIMIC": {
222
+ "Overall": 67.76
223
+ },
224
+ "IU-Xray": {
225
+ "Overall": 67.76
226
+ },
227
+ "Hospitaliz. Summari.": {
228
+ "Overall": 67.76
229
+ },
230
+ "Patient Education": {
231
+ "Overall": 67.76
232
+ },
233
+ "BC5": {
234
+ "Overall": 67.76
235
+ },
236
+ "NCBI": {
237
+ "Overall": 67.76
238
+ },
239
+ "DDI": {
240
+ "Overall": 67.76
241
+ },
242
+ "GAD": {
243
+ "Overall": 67.76
244
+ },
245
+ "HoC": {
246
+ "Overall": 67.76
247
+ },
248
+ "Pharma. QA": {
249
+ "Overall": 67.76
250
+ },
251
+ "Drug Inter.": {
252
+ "Overall": 67.76
253
  }
254
  },
255
+ "Vicuna-7B-v1.5": {
256
  "META": {
257
  "Method": [
258
+ "Vicuna-7B-v1.5",
259
+ "https://huggingface.co/lmsys/vicuna-7b-v1.5"
260
  ],
261
+ "Parameters": "7B",
262
+ "Org": "LMSys",
263
  "OpenSource": "Yes",
264
  "Verified": "Yes"
265
  },
266
  "Shopping Concept Understanding": {
267
+ "Overall": 53.46
268
  },
269
  "Shopping Knowledge Reasoning": {
270
+ "Overall": 45.06
271
  },
272
  "User Behavior Alignment": {
273
+ "Overall": 41.11
274
  },
275
  "Multi-lingual Abilities": {
276
+ "Overall": 43.82
277
  }
278
  },
279
+ "LLaMA2-7B-Chat": {
280
  "META": {
281
  "Method": [
282
+ "LLaMA2-7B-Chat",
283
+ "https://huggingface.co/meta-llama/Llama-2-7b-chat-hf"
284
  ],
285
+ "Parameters": "7B",
286
+ "Org": "Meta",
287
  "OpenSource": "Yes",
288
  "Verified": "Yes"
289
  },
290
  "Shopping Concept Understanding": {
291
+ "Overall": 51.67
292
  },
293
  "Shopping Knowledge Reasoning": {
294
+ "Overall": 43.48
295
  },
296
  "User Behavior Alignment": {
297
+ "Overall": 41.42
298
  },
299
  "Multi-lingual Abilities": {
300
+ "Overall": 40.43
301
  }
302
  },
303
+ "Mistral-7B-Instruct": {
304
  "META": {
305
  "Method": [
306
+ "Mistral-7B-Instruct",
307
+ "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2"
308
  ],
309
+ "Parameters": "7B",
310
+ "Org": "MistralAI",
311
  "OpenSource": "Yes",
312
  "Verified": "Yes"
313
  },
314
  "Shopping Concept Understanding": {
315
+ "Overall": 62.03
316
  },
317
  "Shopping Knowledge Reasoning": {
318
+ "Overall": 46.36
319
  },
320
  "User Behavior Alignment": {
321
+ "Overall": 42.21
322
  },
323
  "Multi-lingual Abilities": {
324
+ "Overall": 43.32
325
  }
326
  },
327
+ "Vicuna-13B-v1.5": {
328
  "META": {
329
  "Method": [
330
+ "Vicuna-13B-v1.5",
331
+ "https://huggingface.co/lmsys/vicuna-13b-v1.5"
332
  ],
333
  "Parameters": "13B",
334
+ "Org": "LMSys",
335
  "OpenSource": "Yes",
336
  "Verified": "Yes"
337
  },
338
  "Shopping Concept Understanding": {
339
+ "Overall": 59.64
340
  },
341
  "Shopping Knowledge Reasoning": {
342
+ "Overall": 52.63
343
  },
344
  "User Behavior Alignment": {
345
+ "Overall": 49.81
346
  },
347
  "Multi-lingual Abilities": {
348
+ "Overall": 49.64
349
  }
350
  },
351
+ "LLaMA-2-13B-Chat": {
352
  "META": {
353
  "Method": [
354
+ "LLaMA-2-13B-Chat",
355
+ "https://huggingface.co/meta-llama/Llama-2-13b-chat-hf"
356
  ],
357
+ "Parameters": "13B",
358
+ "Org": "Meta",
359
  "OpenSource": "Yes",
360
  "Verified": "Yes"
361
  },
362
  "Shopping Concept Understanding": {
363
+ "Overall": 51.79
364
  },
365
  "Shopping Knowledge Reasoning": {
366
+ "Overall": 45.01
367
  },
368
  "User Behavior Alignment": {
369
+ "Overall": 39.95
370
  },
371
  "Multi-lingual Abilities": {
372
+ "Overall": 42.99
373
  }
374
  },
375
+ "LLaMA-2-70B": {
376
  "META": {
377
  "Method": [
378
+ "LLaMA-2-70B",
379
+ "https://huggingface.co/meta-llama/Llama-2-70b-chat-hf"
380
  ],
381
+ "Parameters": "70B",
382
+ "Org": "Meta",
383
  "OpenSource": "Yes",
384
  "Verified": "Yes"
385
  },
386
  "Shopping Concept Understanding": {
387
+ "Overall": 61.84
388
  },
389
  "Shopping Knowledge Reasoning": {
390
+ "Overall": 40.73
391
  },
392
  "User Behavior Alignment": {
393
+ "Overall": 44.2
394
  },
395
  "Multi-lingual Abilities": {
396
+ "Overall": 47.04
397
  }
398
  },
399
+ "LLaMA-3-70B": {
400
  "META": {
401
  "Method": [
402
+ "LLaMA-3-70B",
403
+ "https://huggingface.co/meta-llama/Meta-Llama-3-70B"
404
  ],
405
+ "Parameters": "70B",
406
+ "Org": "Meta",
407
  "OpenSource": "Yes",
408
  "Verified": "Yes"
409
  },
410
  "Shopping Concept Understanding": {
411
+ "Overall": 69.59
412
  },
413
  "Shopping Knowledge Reasoning": {
414
+ "Overall": 63.56
415
  },
416
  "User Behavior Alignment": {
417
+ "Overall": 55.77
418
  },
419
  "Multi-lingual Abilities": {
420
+ "Overall": 58.95
421
  }
422
  },
423
+ "Huatuo": {
424
  "META": {
425
  "Method": [
426
+ "Huatuo"
 
427
  ],
428
+ "Parameters": "",
429
+ "Org": "OpenAI",
430
+ "OpenSource": "No",
431
  "Verified": "Yes"
432
  },
433
+ "MedQA": {
434
+ "Overall": 80.75
435
  },
436
+ "MedMCQA": {
437
+ "Overall": 71.63
438
  },
439
+ "MMLU-Medicine": {
440
+ "Overall": 70.17
441
  },
442
+ "PubMedQA": {
443
+ "Overall": 67.76
444
+ },
445
+ "Referral QA": {
446
+ "Overall": 67.76
447
+ },
448
+ "Treat Recom.": {
449
+ "Overall": 67.76
450
+ },
451
+ "MIMIC": {
452
+ "Overall": 67.76
453
+ },
454
+ "IU-Xray": {
455
+ "Overall": 67.76
456
+ },
457
+ "Hospitaliz. Summari.": {
458
+ "Overall": 67.76
459
+ },
460
+ "Patient Education": {
461
+ "Overall": 67.76
462
+ },
463
+ "BC5": {
464
+ "Overall": 67.76
465
+ },
466
+ "NCBI": {
467
+ "Overall": 67.76
468
+ },
469
+ "DDI": {
470
+ "Overall": 67.76
471
+ },
472
+ "GAD": {
473
+ "Overall": 67.76
474
+ },
475
+ "HoC": {
476
+ "Overall": 67.76
477
+ },
478
+ "Pharma. QA": {
479
+ "Overall": 67.76
480
+ },
481
+ "Drug Inter.": {
482
+ "Overall": 67.76
483
  }
484
  },
485
+ "ChatDoctor": {
486
  "META": {
487
  "Method": [
488
+ "ChatDoctor"
 
489
  ],
490
+ "Parameters": "",
491
+ "Org": "OpenAI",
492
+ "OpenSource": "No",
493
  "Verified": "Yes"
494
  },
495
+ "MedQA": {
496
+ "Overall": 80.75
497
  },
498
+ "MedMCQA": {
499
+ "Overall": 71.63
500
  },
501
+ "MMLU-Medicine": {
502
+ "Overall": 70.17
503
  },
504
+ "PubMedQA": {
505
+ "Overall": 67.76
506
+ },
507
+ "Referral QA": {
508
+ "Overall": 67.76
509
+ },
510
+ "Treat Recom.": {
511
+ "Overall": 67.76
512
+ },
513
+ "MIMIC": {
514
+ "Overall": 67.76
515
+ },
516
+ "IU-Xray": {
517
+ "Overall": 67.76
518
+ },
519
+ "Hospitaliz. Summari.": {
520
+ "Overall": 67.76
521
+ },
522
+ "Patient Education": {
523
+ "Overall": 67.76
524
+ },
525
+ "BC5": {
526
+ "Overall": 67.76
527
+ },
528
+ "NCBI": {
529
+ "Overall": 67.76
530
+ },
531
+ "DDI": {
532
+ "Overall": 67.76
533
+ },
534
+ "GAD": {
535
+ "Overall": 67.76
536
+ },
537
+ "HoC": {
538
+ "Overall": 67.76
539
+ },
540
+ "Pharma. QA": {
541
+ "Overall": 67.76
542
+ },
543
+ "Drug Inter.": {
544
+ "Overall": 67.76
545
+ }
546
+ },
547
+ "PMC-LLaMA": {
548
+ "META": {
549
+ "Method": [
550
+ "PMC-LLaMA"
551
+ ],
552
+ "Parameters": "7B",
553
+ "Org": "OpenAI",
554
+ "OpenSource": "No",
555
+ "Verified": "Yes"
556
+ },
557
+ "MedQA": {
558
+ "Overall": 80.75
559
+ },
560
+ "MedMCQA": {
561
+ "Overall": 71.63
562
+ },
563
+ "MMLU-Medicine": {
564
+ "Overall": 70.17
565
+ },
566
+ "PubMedQA": {
567
+ "Overall": 67.76
568
+ },
569
+ "Referral QA": {
570
+ "Overall": 67.76
571
+ },
572
+ "Treat Recom.": {
573
+ "Overall": 67.76
574
+ },
575
+ "MIMIC": {
576
+ "Overall": 67.76
577
+ },
578
+ "IU-Xray": {
579
+ "Overall": 67.76
580
+ },
581
+ "Hospitaliz. Summari.": {
582
+ "Overall": 67.76
583
+ },
584
+ "Patient Education": {
585
+ "Overall": 67.76
586
+ },
587
+ "BC5": {
588
+ "Overall": 67.76
589
+ },
590
+ "NCBI": {
591
+ "Overall": 67.76
592
+ },
593
+ "DDI": {
594
+ "Overall": 67.76
595
+ },
596
+ "GAD": {
597
+ "Overall": 67.76
598
+ },
599
+ "HoC": {
600
+ "Overall": 67.76
601
+ },
602
+ "Pharma. QA": {
603
+ "Overall": 67.76
604
+ },
605
+ "Drug Inter.": {
606
+ "Overall": 67.76
607
+ }
608
+ },
609
+ "Baize-Healthcare": {
610
+ "META": {
611
+ "Method": [
612
+ "Baize-Healthcare"
613
+ ],
614
+ "Parameters": "7B",
615
+ "Org": "OpenAI",
616
+ "OpenSource": "No",
617
+ "Verified": "Yes"
618
+ },
619
+ "MedQA": {
620
+ "Overall": 80.75
621
+ },
622
+ "MedMCQA": {
623
+ "Overall": 71.63
624
+ },
625
+ "MMLU-Medicine": {
626
+ "Overall": 70.17
627
+ },
628
+ "PubMedQA": {
629
+ "Overall": 67.76
630
+ },
631
+ "Referral QA": {
632
+ "Overall": 67.76
633
+ },
634
+ "Treat Recom.": {
635
+ "Overall": 67.76
636
+ },
637
+ "MIMIC": {
638
+ "Overall": 67.76
639
+ },
640
+ "IU-Xray": {
641
+ "Overall": 67.76
642
+ },
643
+ "Hospitaliz. Summari.": {
644
+ "Overall": 67.76
645
+ },
646
+ "Patient Education": {
647
+ "Overall": 67.76
648
+ },
649
+ "BC5": {
650
+ "Overall": 67.76
651
+ },
652
+ "NCBI": {
653
+ "Overall": 67.76
654
+ },
655
+ "DDI": {
656
+ "Overall": 67.76
657
+ },
658
+ "GAD": {
659
+ "Overall": 67.76
660
+ },
661
+ "HoC": {
662
+ "Overall": 67.76
663
+ },
664
+ "Pharma. QA": {
665
+ "Overall": 67.76
666
+ },
667
+ "Drug Inter.": {
668
+ "Overall": 67.76
669
+ }
670
+ },
671
+ "MedAlpaca": {
672
+ "META": {
673
+ "Method": [
674
+ "MedAlpaca"
675
+ ],
676
+ "Parameters": "7B",
677
+ "Org": "OpenAI",
678
+ "OpenSource": "No",
679
+ "Verified": "Yes"
680
+ },
681
+ "MedQA": {
682
+ "Overall": 80.75
683
+ },
684
+ "MedMCQA": {
685
+ "Overall": 71.63
686
+ },
687
+ "MMLU-Medicine": {
688
+ "Overall": 70.17
689
+ },
690
+ "PubMedQA": {
691
+ "Overall": 67.76
692
+ },
693
+ "Referral QA": {
694
+ "Overall": 67.76
695
+ },
696
+ "Treat Recom.": {
697
+ "Overall": 67.76
698
+ },
699
+ "MIMIC": {
700
+ "Overall": 67.76
701
+ },
702
+ "IU-Xray": {
703
+ "Overall": 67.76
704
+ },
705
+ "Hospitaliz. Summari.": {
706
+ "Overall": 67.76
707
+ },
708
+ "Patient Education": {
709
+ "Overall": 67.76
710
+ },
711
+ "BC5": {
712
+ "Overall": 67.76
713
+ },
714
+ "NCBI": {
715
+ "Overall": 67.76
716
+ },
717
+ "DDI": {
718
+ "Overall": 67.76
719
+ },
720
+ "GAD": {
721
+ "Overall": 67.76
722
+ },
723
+ "HoC": {
724
+ "Overall": 67.76
725
+ },
726
+ "Pharma. QA": {
727
+ "Overall": 67.76
728
+ },
729
+ "Drug Inter.": {
730
+ "Overall": 67.76
731
+ }
732
+ },
733
+ "Meditron": {
734
+ "META": {
735
+ "Method": [
736
+ "Meditron"
737
+ ],
738
+ "Parameters": "",
739
+ "Org": "OpenAI",
740
+ "OpenSource": "No",
741
+ "Verified": "Yes"
742
+ },
743
+ "MedQA": {
744
+ "Overall": 80.75
745
+ },
746
+ "MedMCQA": {
747
+ "Overall": 71.63
748
+ },
749
+ "MMLU-Medicine": {
750
+ "Overall": 70.17
751
+ },
752
+ "PubMedQA": {
753
+ "Overall": 67.76
754
+ },
755
+ "Referral QA": {
756
+ "Overall": 67.76
757
+ },
758
+ "Treat Recom.": {
759
+ "Overall": 67.76
760
+ },
761
+ "MIMIC": {
762
+ "Overall": 67.76
763
+ },
764
+ "IU-Xray": {
765
+ "Overall": 67.76
766
+ },
767
+ "Hospitaliz. Summari.": {
768
+ "Overall": 67.76
769
+ },
770
+ "Patient Education": {
771
+ "Overall": 67.76
772
+ },
773
+ "BC5": {
774
+ "Overall": 67.76
775
+ },
776
+ "NCBI": {
777
+ "Overall": 67.76
778
+ },
779
+ "DDI": {
780
+ "Overall": 67.76
781
+ },
782
+ "GAD": {
783
+ "Overall": 67.76
784
+ },
785
+ "HoC": {
786
+ "Overall": 67.76
787
+ },
788
+ "Pharma. QA": {
789
+ "Overall": 67.76
790
+ },
791
+ "Drug Inter.": {
792
+ "Overall": 67.76
793
+ }
794
+ },
795
+ "BioMistral": {
796
+ "META": {
797
+ "Method": [
798
+ "BioMistral"
799
+ ],
800
+ "Parameters": "",
801
+ "Org": "OpenAI",
802
+ "OpenSource": "No",
803
+ "Verified": "Yes"
804
+ },
805
+ "MedQA": {
806
+ "Overall": 80.75
807
+ },
808
+ "MedMCQA": {
809
+ "Overall": 71.63
810
+ },
811
+ "MMLU-Medicine": {
812
+ "Overall": 70.17
813
+ },
814
+ "PubMedQA": {
815
+ "Overall": 67.76
816
+ },
817
+ "Referral QA": {
818
+ "Overall": 67.76
819
+ },
820
+ "Treat Recom.": {
821
+ "Overall": 67.76
822
+ },
823
+ "MIMIC": {
824
+ "Overall": 67.76
825
+ },
826
+ "IU-Xray": {
827
+ "Overall": 67.76
828
+ },
829
+ "Hospitaliz. Summari.": {
830
+ "Overall": 67.76
831
+ },
832
+ "Patient Education": {
833
+ "Overall": 67.76
834
+ },
835
+ "BC5": {
836
+ "Overall": 67.76
837
+ },
838
+ "NCBI": {
839
+ "Overall": 67.76
840
+ },
841
+ "DDI": {
842
+ "Overall": 67.76
843
+ },
844
+ "GAD": {
845
+ "Overall": 67.76
846
+ },
847
+ "HoC": {
848
+ "Overall": 67.76
849
+ },
850
+ "Pharma. QA": {
851
+ "Overall": 67.76
852
+ },
853
+ "Drug Inter.": {
854
+ "Overall": 67.76
855
+ }
856
+ },
857
+ "PMC-LLaMA": {
858
+ "META": {
859
+ "Method": [
860
+ "PMC-LLaMA"
861
+ ],
862
+ "Parameters": "",
863
+ "Org": "OpenAI",
864
+ "OpenSource": "No",
865
+ "Verified": "Yes"
866
+ },
867
+ "MedQA": {
868
+ "Overall": 80.75
869
+ },
870
+ "MedMCQA": {
871
+ "Overall": 71.63
872
+ },
873
+ "MMLU-Medicine": {
874
+ "Overall": 70.17
875
+ },
876
+ "PubMedQA": {
877
+ "Overall": 67.76
878
+ },
879
+ "Referral QA": {
880
+ "Overall": 67.76
881
+ },
882
+ "Treat Recom.": {
883
+ "Overall": 67.76
884
+ },
885
+ "MIMIC": {
886
+ "Overall": 67.76
887
+ },
888
+ "IU-Xray": {
889
+ "Overall": 67.76
890
+ },
891
+ "Hospitaliz. Summari.": {
892
+ "Overall": 67.76
893
+ },
894
+ "Patient Education": {
895
+ "Overall": 67.76
896
+ },
897
+ "BC5": {
898
+ "Overall": 67.76
899
+ },
900
+ "NCBI": {
901
+ "Overall": 67.76
902
+ },
903
+ "DDI": {
904
+ "Overall": 67.76
905
+ },
906
+ "GAD": {
907
+ "Overall": 67.76
908
+ },
909
+ "HoC": {
910
+ "Overall": 67.76
911
+ },
912
+ "Pharma. QA": {
913
+ "Overall": 67.76
914
+ },
915
+ "Drug Inter.": {
916
+ "Overall": 67.76
917
+ }
918
+ },
919
+ "MedAlpaca": {
920
+ "META": {
921
+ "Method": [
922
+ "MedAlpaca"
923
+ ],
924
+ "Parameters": "7B",
925
+ "Org": "OpenAI",
926
+ "OpenSource": "No",
927
+ "Verified": "Yes"
928
+ },
929
+ "MedQA": {
930
+ "Overall": 80.75
931
+ },
932
+ "MedMCQA": {
933
+ "Overall": 71.63
934
+ },
935
+ "MMLU-Medicine": {
936
+ "Overall": 70.17
937
+ },
938
+ "PubMedQA": {
939
+ "Overall": 67.76
940
+ },
941
+ "Referral QA": {
942
+ "Overall": 67.76
943
+ },
944
+ "Treat Recom.": {
945
+ "Overall": 67.76
946
+ },
947
+ "MIMIC": {
948
+ "Overall": 67.76
949
+ },
950
+ "IU-Xray": {
951
+ "Overall": 67.76
952
+ },
953
+ "Hospitaliz. Summari.": {
954
+ "Overall": 67.76
955
+ },
956
+ "Patient Education": {
957
+ "Overall": 67.76
958
+ },
959
+ "BC5": {
960
+ "Overall": 67.76
961
+ },
962
+ "NCBI": {
963
+ "Overall": 67.76
964
+ },
965
+ "DDI": {
966
+ "Overall": 67.76
967
+ },
968
+ "GAD": {
969
+ "Overall": 67.76
970
+ },
971
+ "HoC": {
972
+ "Overall": 67.76
973
+ },
974
+ "Pharma. QA": {
975
+ "Overall": 67.76
976
+ },
977
+ "Drug Inter.": {
978
+ "Overall": 67.76
979
+ }
980
+ },
981
+ "ClinicalCamel": {
982
+ "META": {
983
+ "Method": [
984
+ "ClinicalCamel"
985
+ ],
986
+ "Parameters": "",
987
+ "Org": "OpenAI",
988
+ "OpenSource": "No",
989
+ "Verified": "Yes"
990
+ },
991
+ "MedQA": {
992
+ "Overall": 80.75
993
+ },
994
+ "MedMCQA": {
995
+ "Overall": 71.63
996
+ },
997
+ "MMLU-Medicine": {
998
+ "Overall": 70.17
999
+ },
1000
+ "PubMedQA": {
1001
+ "Overall": 67.76
1002
+ },
1003
+ "Referral QA": {
1004
+ "Overall": 67.76
1005
+ },
1006
+ "Treat Recom.": {
1007
+ "Overall": 67.76
1008
+ },
1009
+ "MIMIC": {
1010
+ "Overall": 67.76
1011
+ },
1012
+ "IU-Xray": {
1013
+ "Overall": 67.76
1014
+ },
1015
+ "Hospitaliz. Summari.": {
1016
+ "Overall": 67.76
1017
+ },
1018
+ "Patient Education": {
1019
+ "Overall": 67.76
1020
+ },
1021
+ "BC5": {
1022
+ "Overall": 67.76
1023
+ },
1024
+ "NCBI": {
1025
+ "Overall": 67.76
1026
+ },
1027
+ "DDI": {
1028
+ "Overall": 67.76
1029
+ },
1030
+ "GAD": {
1031
+ "Overall": 67.76
1032
+ },
1033
+ "HoC": {
1034
+ "Overall": 67.76
1035
+ },
1036
+ "Pharma. QA": {
1037
+ "Overall": 67.76
1038
+ },
1039
+ "Drug Inter.": {
1040
+ "Overall": 67.76
1041
+ }
1042
+ },
1043
+ "Meditron-70B": {
1044
+ "META": {
1045
+ "Method": [
1046
+ "Meditron-70B"
1047
+ ],
1048
+ "Parameters": "13B",
1049
+ "Org": "Meta",
1050
+ "OpenSource": "Yes",
1051
+ "Verified": "Yes"
1052
+ },
1053
+ "Shopping Concept Understanding": {
1054
+ "Overall": 45.86
1055
+ },
1056
+ "Shopping Knowledge Reasoning": {
1057
+ "Overall": 39.47
1058
+ },
1059
+ "User Behavior Alignment": {
1060
+ "Overall": 39.43
1061
+ },
1062
+ "Multi-lingual Abilities": {
1063
+ "Overall": 44.23
1064
  }
1065
  },
1066
  }