1,151
edits
(Created page with "==Exams== {| class="wikitable" ! Exam ! GPT-4 Estimated Percentile ! GPT-4 (no vision) Estimated Percentile ! GPT-3.5 Estimated Percentile |- | Uniform Bar Exam (MBE+MEE+MPT)1 | 298 / 400 (~90th) | 298 / 400 (~90th) | 213 / 400 (~10th) |- | LSAT | 163 (~88th) | 161 (~83rd) | 149 (~40th) |- | SAT Evidence-Based Reading & Writing | 710 / 800 (~93rd) | 710 / 800 (~93rd) | 670 / 800 (~87th) |- | SAT Math | 700 / 800 (~89th) | 690 / 800 (~89th)...") |
Startledcat (talk | contribs) No edit summary |
||
(3 intermediate revisions by one other user not shown) | |||
Line 1: | Line 1: | ||
{{see also|GPT-4 Plugins}} | |||
==Exams== | ==Exams== | ||
{| class="wikitable" | {| class="wikitable" | ||
! Exam | ! Exam | ||
! GPT-4 | ! GPT-4 Points | ||
! GPT-4 (no vision) | ! GPT-4 Percentile | ||
! GPT-3.5 | ! GPT-4 (no vision) Points | ||
! GPT-4 (no vision) Percentile | |||
! GPT-3.5 Points | |||
! GPT-3.5 Percentile | |||
|- | |- | ||
| Uniform Bar Exam (MBE+MEE+MPT)1 | | Uniform Bar Exam (MBE+MEE+MPT)1 | ||
| 298 / 400 | | 298 / 400 | ||
| 298 / 400 | | ~90th | ||
| 213 / 400 | | 298 / 400 | ||
| ~90th | |||
| 213 / 400 | |||
| ~10th | |||
|- | |- | ||
| LSAT | | LSAT | ||
| 163 | | 163 | ||
| 161 | | ~88th | ||
| 149 | | 161 | ||
| ~83rd | |||
| 149 | |||
| ~40th | |||
|- | |- | ||
| SAT Evidence-Based Reading & Writing | | SAT Evidence-Based Reading & Writing | ||
| 710 / 800 | | 710 / 800 | ||
| 710 / 800 | | ~93rd | ||
| 670 / 800 | | 710 / 800 | ||
| ~93rd | |||
| 670 / 800 | |||
| ~87th | |||
|- | |- | ||
| SAT Math | | SAT Math | ||
| 700 / 800 | | 700 / 800 | ||
| 690 / 800 | | ~89th | ||
| 590 / 800 | | 690 / 800 | ||
| ~89th | |||
| 590 / 800 | |||
| ~70th | |||
|- | |- | ||
| Graduate Record Examination (GRE) Quantitative | | Graduate Record Examination (GRE) Quantitative | ||
| 163 / 170 | | 163 / 170 | ||
| 157 / 170 | | ~80th | ||
| 147 / 170 | | 157 / 170 | ||
| ~62nd | |||
| 147 / 170 | |||
| ~25th | |||
|- | |- | ||
| Graduate Record Examination (GRE) Verbal | | Graduate Record Examination (GRE) Verbal | ||
| 169 / 170 | | 169 / 170 | ||
| 165 / 170 | | ~99th | ||
| 154 / 170 | | 165 / 170 | ||
| ~96th | |||
| 154 / 170 | |||
| ~63rd | |||
|- | |- | ||
| Graduate Record Examination (GRE) Writing | | Graduate Record Examination (GRE) Writing | ||
| 4 / 6 | | 4 / 6 | ||
| 4 / 6 | | ~54th | ||
| 4 / 6 | | 4 / 6 | ||
| ~54th | |||
| 4 / 6 | |||
| ~54th | |||
|- | |- | ||
| USABO Semifinal Exam 2020 | | USABO Semifinal Exam 2020 | ||
| 87 / 150 | | 87 / 150 | ||
| 87 / 150 | | 99th–100th | ||
| 43 / 150 | | 87 / 150 | ||
| 99th–100th | |||
| 43 / 150 | |||
| 31st–33rd | |||
|- | |- | ||
| USNCO Local Section Exam 2022 | | USNCO Local Section Exam 2022 | ||
| 36 / 60 | | 36 / 60 | ||
| | |||
| 38 / 60 | | 38 / 60 | ||
| | |||
| 24 / 60 | | 24 / 60 | ||
| | |||
|- | |- | ||
| Medical Knowledge Self-Assessment Program | | Medical Knowledge Self-Assessment Program | ||
| 75% | | 75% | ||
| | |||
| 75% | | 75% | ||
| | |||
| 53% | | 53% | ||
| | |||
|- | |- | ||
| Codeforces Rating | | Codeforces Rating | ||
| 392 | | 392 | ||
| 392 | | below 5th | ||
| 260 | | 392 | ||
| below 5th | |||
| 260 | |||
| below 5th | |||
|- | |- | ||
| AP Art History | | AP Art History | ||
| 5 | | 5 | ||
| 5 | | 86th–100th | ||
| 5 | | 5 | ||
| 86th–100th | |||
| 5 | |||
| 86th–100th | |||
|- | |- | ||
| AP Biology | | AP Biology | ||
| 5 | | 5 | ||
| 5 | | 85th–100th | ||
| 4 | | 5 | ||
| 85th–100th | |||
| 4 | |||
| 62nd–85th | |||
|- | |- | ||
| AP Calculus BC | | AP Calculus BC | ||
| 4 | | 4 | ||
| 4 | | 43rd–59th | ||
| 1 | | 4 | ||
| 43rd–59th | |||
| 1 | |||
| 0th–7th | |||
|- | |- | ||
| | |} | ||
| 4 ( | |||
| 4 ( | ==Benchmarks== | ||
| 2 ( | {| class="wikitable" | ||
! Benchmark | |||
! GPT-4 | |||
! Evaluated few-shot | |||
! GPT-3.5 | |||
! Evaluated few-shot | |||
! LM SOTA | |||
! Best external LM evaluated few-shot | |||
! SOTA | |||
! Best external model (includes benchmark-specific training) | |||
|- | |||
| MMLU | |||
| 86.4% | |||
| 5-shot | |||
| 70.0% | |||
| 5-shot | |||
| 70.7% | |||
| 5-shot U-PaLM | |||
| 75.2% | |||
| 5-shot Flan-PaLM | |||
|- | |||
| HellaSwag | |||
| 95.3% | |||
| 10-shot | |||
| 85.5% | |||
| 10-shot | |||
| 84.2% | |||
| LLAMA (validation set) | |||
| 85.6% | |||
| ALUM | |||
|- | |||
| AI2 Reasoning Challenge (ARC) | |||
| 96.3% | |||
| 25-shot | |||
| 85.2% | |||
| 25-shot | |||
| 85.2% | |||
| 8-shot PaLM | |||
| 86.5% | |||
| ST-MOE | |||
|- | |||
| WinoGrande | |||
| 87.5% | |||
| 5-shot | |||
| 81.6% | |||
| 5-shot | |||
| 85.1% | |||
| 5-shot PALM | |||
| 85.1% | |||
| 5-shot PALM | |||
|- | |||
| HumanEval | |||
| 67.0% | |||
| 0-shot | |||
| 48.1% | |||
| 0-shot | |||
| 26.2% | |||
| 0-shot PaLM | |||
| 65.8% | |||
| CodeT + GPT-3.5 | |||
|- | |||
| DROP (f1 score) | |||
| 80.9 | |||
| 3-shot | |||
| 64.1 | |||
| 3-shot | |||
| 70.8 | |||
| 1-shot PaLM | |||
| 88.4 | |||
| QDGAT | |||
|} | |||
==Benchmarks (Visual)== | |||
{| class="wikitable" | |||
! Benchmark | |||
! GPT-4 | |||
! Evaluated few-shot | |||
! Few-shot SOTA | |||
! SOTA | |||
! Best external model (includes benchmark-specific training) | |||
|- | |||
| VQAv2 | |||
| 77.2% | |||
| 0-shot | |||
| 67.6% | |||
| Flamingo 32-shot | |||
| 84.3% | |||
| PaLI-17B | |||
|- | |||
| TextVQA | |||
| 78.0% | |||
| 0-shot | |||
| 37.9% | |||
| Flamingo 32-shot | |||
| 71.8% | |||
| PaLI-17B | |||
|- | |||
| ChartQA | |||
| 78.5%<sup>A</sup> | |||
| - | |||
| 58.6% | |||
| Pix2Struct Large | |||
| - | |||
|- | |||
| AI2 Diagram (AI2D) | |||
| 78.2% | |||
| 0-shot | |||
| - | |||
| 42.1% | |||
| Pix2Struct Large | |||
| - | |||
|- | |||
| DocVQA | |||
| 88.4% | |||
| 0-shot (pixel-only) | |||
| - | |||
| 88.4% | |||
| ERNIE-Layout 2.0 | |||
| - | |||
|- | |- | ||
| | | Infographic VQA | ||
| | | 75.1% | ||
| 2 | | 0-shot (pixel-only) | ||
| | | - | ||
| 61.2% | |||
| Applica.ai TILT | |||
| - | |||
|- | |- | ||
| | | TVQA | ||
| | | 87.3% | ||
| | | 0-shot | ||
| | | - | ||
| 86.5% | |||
| MERLOT Reserve Large | |||
| - | |||
|- | |- | ||
| LSMDC | |||
| 45.7% | |||
| 0-shot | |||
| 31.0% | |||
| MERLOT Reserve 0-shot | |||
| 52.9% | |||
| MERLOT | |||
|} | |} |
edits