1,151
edits
No edit summary |
Startledcat (talk | contribs) No edit summary |
||
(2 intermediate revisions by one other user not shown) | |||
Line 1: | Line 1: | ||
{{see also|GPT-4 Plugins}} | |||
==Exams== | ==Exams== | ||
{| class="wikitable" | {| class="wikitable" | ||
Line 121: | Line 122: | ||
| 0th–7th | | 0th–7th | ||
|- | |- | ||
|} | |||
==Benchmarks== | |||
{| class="wikitable" | |||
! Benchmark | |||
! GPT-4 | |||
! Evaluated few-shot | |||
! GPT-3.5 | |||
! Evaluated few-shot | |||
! LM SOTA | |||
! Best external LM evaluated few-shot | |||
! SOTA | |||
! Best external model (includes benchmark-specific training) | |||
|- | |||
| MMLU | |||
| 86.4% | |||
| 5-shot | |||
| 70.0% | |||
| 5-shot | |||
| 70.7% | |||
| 5-shot U-PaLM | |||
| 75.2% | |||
| 5-shot Flan-PaLM | |||
|- | |||
| HellaSwag | |||
| 95.3% | |||
| 10-shot | |||
| 85.5% | |||
| 10-shot | |||
| 84.2% | |||
| LLaMA (validation set) | |||
| 85.6% | |||
| ALUM | |||
|- | |||
| AI2 Reasoning Challenge (ARC) | |||
| 96.3% | |||
| 25-shot | |||
| 85.2% | |||
| 25-shot | |||
| 85.2% | |||
| 8-shot PaLM | |||
| 86.5% | |||
| ST-MOE | |||
|- | |||
| WinoGrande | |||
| 87.5% | |||
| 5-shot | |||
| 81.6% | |||
| 5-shot | |||
| 85.1% | |||
| 5-shot PaLM | |||
| 85.1% | |||
| 5-shot PaLM | |||
|- | |||
| HumanEval | |||
| 67.0% | |||
| 0-shot | |||
| 48.1% | |||
| 0-shot | |||
| 26.2% | |||
| 0-shot PaLM | |||
| 65.8% | |||
| CodeT + GPT-3.5 | |||
|- | |||
| DROP (F1 score) | |||
| 80.9 | |||
| 3-shot | |||
| 64.1 | |||
| 3-shot | |||
| 70.8 | |||
| 1-shot PaLM | |||
| 88.4 | |||
| QDGAT | |||
|} | |||
==Benchmarks (Visual)== | |||
{| class="wikitable" | |||
! Benchmark | |||
! GPT-4 | |||
! Evaluated few-shot | |||
! Few-shot SOTA | |||
! SOTA | |||
! Best external model (includes benchmark-specific training) | |||
|- | |||
| VQAv2 | |||
| 77.2% | |||
| 0-shot | |||
| 67.6% | |||
| Flamingo 32-shot | |||
| 84.3% | |||
| PaLI-17B | |||
|- | |||
| TextVQA | |||
| 78.0% | |||
| 0-shot | |||
| 37.9% | |||
| Flamingo 32-shot | |||
| 71.8% | |||
| PaLI-17B | |||
|- | |||
| ChartQA | |||
| 78.5% | |||
| - | |||
| 58.6% | |||
| Pix2Struct Large | |||
| - | |||
|- | |||
| AI2 Diagram (AI2D) | |||
| 78.2% | |||
| 0-shot | |||
| - | |||
| 42.1% | |||
| Pix2Struct Large | |||
| - | |||
|- | |||
| DocVQA | |||
| 88.4% | |||
| 0-shot (pixel-only) | |||
| - | |||
| 88.4% | |||
| ERNIE-Layout 2.0 | |||
| - | |||
|- | |||
| Infographic VQA | |||
| 75.1% | |||
| 0-shot (pixel-only) | |||
| - | |||
| 61.2% | |||
| Applica.ai TILT | |||
| - | |||
|- | |||
| TVQA | |||
| 87.3% | |||
| 0-shot | |||
| - | |||
| 86.5% | |||
| MERLOT Reserve Large | |||
| - | |||
|- | |||
| LSMDC | |||
| 45.7% | |||
| 0-shot | |||
| 31.0% | |||
| MERLOT Reserve 0-shot | |||
| 52.9% | |||
| MERLOT | |||
|} | |} |
edits