==Benchmarks==
{| class="wikitable"
! Benchmark
! GPT-4
! Setting
! GPT-3.5
! Setting
! LM SOTA (best external LM, evaluated few-shot)
! Model / setting
! SOTA (best external model; may include benchmark-specific training)
! Model / setting
|-
| MMLU
| 86.4%
| 5-shot
| 70.0%
| 5-shot
| 70.7%
| 5-shot U-PaLM
| 75.2%
| 5-shot Flan-PaLM
|-
| HellaSwag
| 95.3%
| 10-shot
| 85.5%
| 10-shot
| 84.2%
| LLaMA (validation set)
| 85.6%
| ALUM
|-
| AI2 Reasoning Challenge (ARC)
| 96.3%
| 25-shot
| 85.2%
| 25-shot
| 84.2%
| 8-shot PaLM
| 85.6%
| ST-MoE
|-
| WinoGrande
| 87.5%
| 5-shot
| 81.6%
| 5-shot
| 84.2%
| 5-shot PaLM
| 85.6%
| 5-shot PaLM
|-
| HumanEval
| 67.0%
| 0-shot
| 48.1%
| 0-shot
| 26.2%
| 0-shot PaLM
| 65.8%
| CodeT + GPT-3.5
|-
| DROP (F1 score)
| 80.9
| 3-shot
| 64.1
| 3-shot
| 70.8
| 1-shot PaLM
| 88.4
|
|}
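Most of the benchmarks above (MMLU, HellaSwag, ARC, WinoGrande, HumanEval) are reported as percentages of items answered or solved correctly, while DROP is reported as a token-level F1 score between the model's answer and the reference answer. The sketch below illustrates how such a token-level F1 is typically computed; the tokenization and normalization here are simplified assumptions and do not reproduce the official DROP evaluation script.

<syntaxhighlight lang="python">
import string
from collections import Counter


def normalize(text: str) -> list[str]:
    """Lowercase, strip punctuation and articles, split into tokens.

    Simplified assumption: the official DROP scorer additionally handles
    numbers and multi-span answers.
    """
    text = "".join(ch for ch in text.lower() if ch not in string.punctuation)
    return [t for t in text.split() if t not in {"a", "an", "the"}]


def token_f1(prediction: str, reference: str) -> float:
    """Bag-of-tokens F1 between a predicted and a reference answer string."""
    pred_tokens = normalize(prediction)
    ref_tokens = normalize(reference)
    if not pred_tokens or not ref_tokens:
        # Both empty counts as a match; otherwise no overlap is possible.
        return float(pred_tokens == ref_tokens)
    common = Counter(pred_tokens) & Counter(ref_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)


# Example: partial overlap between prediction and reference.
print(round(token_f1("the 1992 election", "1992 presidential election"), 2))  # 0.8
</syntaxhighlight>

The DROP figures in the table are this per-question F1 averaged over the evaluation set and scaled to 0–100.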