ABCDEFGHIJKLMNOPQRSTUVWXY
1
TaskTypeModel statusRandomAvg humanBest humanGPT 3-shotPaLM 3-shotGPT >= avg human?PaLM >= avg human?PaLM >= 90% of avg human?GPT >= best human?PaLM >= best human?PaLM > GPT?Best/avg humanPaLM/GPTPaLM/avg humanPaLM/best humanGPT/avg humanGPT/best humanGPT/randomPaLM/randomAvg human / randomBest human / randomCommentary
2
abstract_narrative_understandingmultiple choiceModels at avg human2030604055TRUETRUETRUEFALSEFALSETRUE200%138%183%92%133%67%200%275%150%300%
3
anachronismstrue / falsePaLM at avg human5060905365FALSETRUETRUEFALSEFALSETRUE150%123%108%72%88%59%106%130%120%180%
4
analogical_similaritymultiple choicePaLM at avg human15401002040FALSETRUETRUEFALSEFALSETRUE250%200%100%40%50%20%133%267%267%667%
5
analytic_entailmenttrue / falsePaLM almost at avg human50801006375FALSEFALSETRUEFALSEFALSETRUE125%119%94%75%79%63%126%150%160%200%
6
ascii_word_recognitionexact str matchModels subhuman085100515FALSEFALSEFALSEFALSEFALSETRUE118%300%18%15%6%5%
7
authorship_verificationtrue / falseModels at avg human5050905050TRUETRUETRUEFALSEFALSEFALSE180%100%100%56%100%56%100%100%100%180%Seems like a very hard task for both humans and machines
8
auto_categorizationbleuModels superhuman01.58109TRUETRUETRUETRUETRUEFALSE533%90%600%113%667%125%
9
auto_debuggingexact str matchPaLM at avg human01550045FALSETRUETRUEFALSEFALSETRUE333%300%90%0%0%
10
bridging_anaphora_resolution_barqaexact str matchPaLM superhuman01530033FALSETRUETRUEFALSETRUETRUE200%220%110%0%0%
11
causal_judgmenttrue / falsePaLM almost at avg human50701005565FALSEFALSETRUEFALSEFALSETRUE143%118%93%65%79%55%110%130%140%200%
12
cause_and_effecttrue / falsePaLM almost at avg human50881007085FALSEFALSETRUEFALSEFALSETRUE114%121%97%85%80%70%140%170%176%200%
13
checkmate_in_oneexact str matchModels subhuman087002FALSEFALSEFALSEFALSEFALSETRUE875%25%3%0%0%
14
chess_state_trackingexact str matchModels at avg human020884345TRUETRUETRUEFALSEFALSETRUE440%105%225%51%215%49%
15
chinese_remainder_theoremexact str matchModels cannot do task02510000FALSEFALSEFALSEFALSEFALSEFALSE400%0%0%0%0%
16
cifar10_classificationmultiple choiceModels cannot do task10258520FALSEFALSEFALSEFALSEFALSEFALSE340%0%0%0%8%2%20%0%250%850%Seems like a huge stretch to expect language models to parse base64 images
17
code_line_descriptionmultiple choicePaLM at avg human25601003080FALSETRUETRUEFALSEFALSETRUE167%267%133%80%50%30%120%320%240%400%
18
codenamesbleuPaLM at avg human018851530FALSETRUETRUEFALSEFALSETRUE472%200%167%35%83%18%
19
colormultiple choiceModels subhuman10601001030FALSEFALSEFALSEFALSEFALSETRUE167%300%50%30%17%10%100%300%600%1000%Kinda surprised models are this bad at this task
20
common_morphememultiple choicePaLM superhuman2535705580TRUETRUETRUEFALSETRUETRUE200%145%229%114%157%79%220%320%140%280%
21
conceptual_combinationsmultiple choicePaLM almost at avg human25851003580FALSEFALSETRUEFALSEFALSETRUE118%229%94%80%41%35%140%320%340%400%
22
conlang_translationrougeLsumPaLM superhuman022555265TRUETRUETRUEFALSETRUETRUE250%125%295%118%236%95%
23
crash_blossommultiple choiceModels at avg human2542684555TRUETRUETRUEFALSEFALSETRUE162%122%131%81%107%66%180%220%168%272%
24
crass_aimultiple choicePaLM at avg human25851003590FALSETRUETRUEFALSEFALSETRUE118%257%106%90%41%35%140%360%340%400%
25
cryobiology_spanishtrue / falsePaLM at avg human50701006885FALSETRUETRUEFALSEFALSETRUE143%125%121%85%97%68%136%170%140%200%
26
cryptonitemultiple choiceModels subhuman02585010FALSEFALSEFALSEFALSEFALSETRUE340%40%12%0%0%
27
cs_algorithmstrue / falsePaLM almost at avg human1048903545FALSEFALSETRUEFALSEFALSETRUE188%129%94%50%73%39%350%450%480%900%
28
cycled_lettersexact str matchModels cannot do task02510000FALSEFALSEFALSEFALSEFALSEFALSE400%0%0%0%0%Tokenization may be an issue here? Surprised average human is so low
29
dark_humor_detectiontrue / falseModels subhuman50821005565FALSEFALSEFALSEFALSEFALSETRUE122%118%79%65%67%55%110%130%164%200%
30
date_understandingmultiple choicePaLM almost at avg human15751005572FALSEFALSETRUEFALSEFALSETRUE133%131%96%72%73%55%367%480%500%667%
31
disambiguation_qamultiple choiceModels subhuman3365954352FALSEFALSEFALSEFALSEFALSETRUE146%121%80%55%66%45%130%158%197%288%
32
discourse_marker_predictionmultiple choiceModels subhuman1033801315FALSEFALSEFALSEFALSEFALSETRUE242%115%45%19%39%16%130%150%330%800%
33
disfl_qaexact str matchPaLM at avg human02250525FALSETRUETRUEFALSEFALSETRUE227%500%114%50%23%10%
34
dyck_languagesmultiple choiceModels subhuman1451001530FALSEFALSEFALSEFALSEFALSETRUE222%200%67%30%33%15%1500%3000%4500%10000%
35
elementary_math_qamultiple choiceModels subhuman20601002538FALSEFALSEFALSEFALSEFALSETRUE167%152%63%38%42%25%125%190%300%500%
36
emoji_moviemultiple choicePaLM at avg human20951002095FALSETRUETRUEFALSEFALSETRUE105%475%100%95%21%20%100%475%475%500%
37
emojis_emotion_predictionmultiple choiceModels at avg human2048654855TRUETRUETRUEFALSEFALSETRUE135%115%115%85%100%74%240%275%240%325%
38
empirical_judgmentsmultiple choiceModels at avg human3348804855TRUETRUETRUEFALSEFALSETRUE167%115%115%69%100%60%145%167%145%242%
39
english_proverbsmultiple choicePaLM at avg human25651004590FALSETRUETRUEFALSEFALSETRUE154%200%138%90%69%45%180%360%260%400%
40
english_russian_proverbsmultiple choicePaLM at avg human2542903570FALSETRUETRUEFALSEFALSETRUE214%200%167%78%83%39%140%280%168%360%
41
entailed_polaritytrue / falsePaLM at avg human50851007595FALSETRUETRUEFALSEFALSETRUE118%127%112%95%88%75%150%190%170%200%
42
entailed_polarity_hinditrue / falsePaLM at avg human50711006075FALSETRUETRUEFALSEFALSETRUE141%125%106%75%85%60%120%150%142%200%
43
epistemic_reasoningtrue / falseModels at avg human50531006358TRUETRUETRUEFALSEFALSEFALSE189%92%109%58%119%63%126%116%106%200%The average human is surprisingly bad at this
44
evaluating_information_essentialitymultiple choiceModels subhuman2038702522FALSEFALSEFALSEFALSEFALSEFALSE184%88%58%31%66%36%125%110%190%350%
45
fact_checkertrue / falsePaLM at avg human5073886583FALSETRUETRUEFALSEFALSETRUE121%128%114%94%89%74%130%166%146%176%
46
fantasy_reasoningtrue / falsePaLM at avg human50681006572FALSETRUETRUEFALSEFALSETRUE147%111%106%72%96%65%130%144%136%200%The average human is surprisingly bad at this
47
figure_of_speech_detectionmultiple choicePaLM at avg human1040852565FALSETRUETRUEFALSEFALSETRUE213%260%163%76%63%29%250%650%400%850%
48
formal_fallacies_syllogisms_negationtrue / falsePaLM almost at avg human5054805252FALSEFALSETRUEFALSEFALSEFALSE148%100%96%65%96%65%104%104%108%160%
49
gemrougeLsumModels superhuman022303538TRUETRUETRUETRUETRUETRUE136%109%173%127%159%117%
50
general_knowledgemultiple choicePaLM at avg human15851007090FALSETRUETRUEFALSEFALSETRUE118%129%106%90%82%70%467%600%567%667%
51
geometric_shapesmultiple choiceModels subhuman10551001035FALSEFALSEFALSEFALSEFALSETRUE182%350%64%35%18%10%100%350%550%1000%
Kinda surprised average humans are so bad at this (though some parts do seem hard) and surprised PaLM can do this at all!
52
goal_step_wikihowmultiple choicePaLM at avg human2060954280FALSETRUETRUEFALSEFALSETRUE158%190%133%84%70%44%210%400%300%475%
53
gre_reading_comprehensionmultiple choiceModels at avg human2738804265TRUETRUETRUEFALSEFALSETRUE211%155%171%81%111%53%156%241%141%296%
54
hhh_alignmenttrue / falseModels subhuman5075?4550FALSEFALSEFALSEFALSEFALSETRUE111%67%60%Seems kinda alarming models aren't better at this?
55
hindu_knowledgemultiple choicePaLM at avg human25631005595FALSETRUETRUEFALSEFALSETRUE159%173%151%95%87%55%220%380%252%400%
56
human_organs_sensesmultiple choicePaLM almost at avg human25901006282FALSEFALSETRUEFALSEFALSETRUE111%132%91%82%69%62%248%328%360%400%
57
hyperbatontrue / falseModels subhuman60751005265FALSEFALSEFALSEFALSEFALSETRUE133%125%87%65%69%52%87%108%125%167%
58
identify_math_theoremsmultiple choiceModels at avg human2535603555TRUETRUETRUEFALSEFALSETRUE171%157%157%92%100%58%140%220%140%240%
59
identify_odd_metaphormultiple choicePaLM at avg human25701002580FALSETRUETRUEFALSEFALSETRUE143%320%114%80%36%25%100%320%280%400%
60
implicaturestrue / falsePaLM at avg human50821006092FALSETRUETRUEFALSEFALSETRUE122%153%112%92%73%60%120%184%164%200%
61
implicit_relationsmultiple choicePaLM at avg human535852545FALSETRUETRUEFALSEFALSETRUE243%180%129%53%71%29%500%900%700%1700%
62
intent_recognitionmultiple choicePaLM at avg human10851008390FALSETRUETRUEFALSEFALSETRUE118%108%106%90%98%83%830%900%850%1000%
63
international_phonetic_alphabet_nlimultiple choicePaLM at avg human33421004060FALSETRUETRUEFALSEFALSETRUE238%150%143%60%95%40%121%182%127%303%
64
international_phonetic_alphabet_transliteratebleuPaLM at avg human030652555FALSETRUETRUEFALSEFALSETRUE217%220%183%85%83%38%
65
intersect_geometrymultiple choiceModels at avg human2.56.52013.515TRUETRUETRUEFALSEFALSETRUE308%111%231%75%208%68%540%600%260%800%I'm surprised humans are so bad and that models can do this at all
66
irony_identificationtrue / falsePaLM almost at avg human5072885565FALSEFALSETRUEFALSEFALSETRUE122%118%90%74%76%63%110%130%144%176%
67
kanji_asciiexact str matchModels cannot do task012000FALSEFALSEFALSEFALSEFALSEFALSE2000%0%0%0%0%
68
kannadamultiple choicePaLM at avg human2542802550FALSETRUETRUEFALSEFALSETRUE190%200%119%63%60%31%100%200%168%320%
69
key_value_mapsmultiple choicePaLM at avg human5055885065FALSETRUETRUEFALSEFALSETRUE160%130%118%74%91%57%100%130%110%176%
70
language_gamesbleuPaLM at avg human01530425FALSETRUETRUEFALSEFALSETRUE200%625%167%83%27%13%
71
language_identificationmultiple choicePaLM at avg human818551238FALSETRUETRUEFALSEFALSETRUE306%317%211%69%67%22%150%475%225%688%
72
linguistic_mappingsexact str matchPaLM superhuman04268070FALSETRUETRUEFALSETRUETRUE162%167%103%0%0%
73
logic_grid_puzzlemultiple choicePaLM at avg human33401003545FALSETRUETRUEFALSEFALSETRUE250%129%113%45%88%35%106%136%121%303%
74
logical_argsmultiple choicePaLM at avg human2052883285FALSETRUETRUEFALSEFALSETRUE169%266%163%97%62%36%160%425%260%440%
75
logical_deductionmultiple choiceModels subhuman3340883335FALSEFALSEFALSEFALSEFALSETRUE220%106%88%40%83%38%100%106%121%267%
76
logical_fallacy_detectiontrue / falsePaLM at avg human5063905875FALSETRUETRUEFALSEFALSETRUE143%129%119%83%92%64%116%150%126%180%
77
logical_sequencemultiple choicePaLM at avg human25851004090FALSETRUETRUEFALSEFALSETRUE118%225%106%90%47%40%160%360%340%400%
78
mathematical_inductiontrue / falsePaLM almost at avg human5061905558FALSEFALSETRUEFALSEFALSETRUE148%105%95%64%90%61%110%116%122%180%
79
matrixshapesexact str matchPaLM at avg human0360035FALSETRUETRUEFALSEFALSETRUE2000%1167%58%0%0%PaLM does surprisingly well at this
80
metaphor_booleantrue / falsePaLM at avg human50881006292FALSETRUETRUEFALSEFALSETRUE114%148%105%92%70%62%124%184%176%200%
81
metaphor_understandingmultiple choicePaLM at avg human25631005080FALSETRUETRUEFALSEFALSETRUE159%160%127%80%79%50%200%320%252%400%
82
minute_mysteries_qarougeLsumPaLM at avg human0215.508.1FALSETRUETRUEFALSEFALSETRUE775%405%52%0%0%
83
misconceptionstrue / falsePaLM at avg human5063906081FALSETRUETRUEFALSEFALSETRUE143%135%129%90%95%67%120%162%126%180%
84
misconceptions_russiantrue / falseModels subhuman50651004052FALSEFALSEFALSEFALSEFALSETRUE154%130%80%52%62%40%80%104%130%200%
85
mnist_asciimultiple choiceModels cannot do task108510057FALSEFALSEFALSEFALSEFALSETRUE118%140%8%7%6%5%50%70%850%1000%LLMs still can't see images
86
modified_arithmeticexact str matchPaLM at avg human0581003070FALSETRUETRUEFALSEFALSETRUE172%233%121%70%52%30%Humans are surprisingly bad at this
87
moral_permissibilitytrue / falsePaLM almost at avg human5065905062FALSEFALSETRUEFALSEFALSETRUE138%124%95%69%77%56%100%124%130%180%Should I be worried humans are so bad at this?
88
movie_dialog_same_or_differenttrue / falsePaLM almost at avg human50681005262FALSEFALSETRUEFALSEFALSETRUE147%119%91%62%76%52%104%124%136%200%
89
movie_recommendationmultiple choiceModels subhuman2561903538FALSEFALSEFALSEFALSEFALSETRUE148%109%62%42%57%39%140%152%244%360%
90
natural_instructionsrougeLsumModels superhuman020324555TRUETRUETRUETRUETRUETRUE160%122%275%172%225%141%
91
navigatetrue / falseModels subhuman50851005055FALSEFALSEFALSEFALSEFALSETRUE118%110%65%55%59%50%100%110%170%200%Surprised models are so bad at this. Also surprised some humans are not good at this.
92
nonsense_words_grammarmultiple choicePaLM almost at avg human20701006568FALSEFALSETRUEFALSEFALSETRUE143%105%97%68%93%65%325%340%350%500%
93
novel_conceptsmultiple choicePaLM almost at avg human20651005563FALSEFALSETRUEFALSEFALSETRUE154%115%97%63%85%55%275%315%325%500%
94
object_countingexact str matchModels subhuman085100041FALSEFALSEFALSEFALSEFALSETRUE118%48%41%0%0%
95
odd_one_outmultiple choicePaLM almost at avg human20801003075FALSEFALSETRUEFALSEFALSETRUE125%250%94%75%38%30%150%375%400%500%
96
operatorsexact str matchPaLM at avg human045853060FALSETRUETRUEFALSEFALSETRUE189%200%133%71%67%35%
97
parsinlu_reading_comprehensionexact str matchPaLM superhuman0330045FALSETRUETRUEFALSETRUETRUE1000%1500%150%0%0%
98
penguins_in_a_tableexact str matchModels subhuman070853050FALSEFALSEFALSEFALSEFALSETRUE121%167%71%59%43%35%
99
periodic_elementsexact str matchModels at avg human08751863TRUETRUETRUEFALSEFALSETRUE938%350%788%84%225%24%
100
phrase_relatednessmultiple choicePaLM at avg human25751006090FALSETRUETRUEFALSEFALSETRUE133%150%120%90%80%60%240%360%300%400%