ABCDEFGHIJKLMNOPQRSTUVWXYZ
1
Human Annotation (100 utterances x 3 annotators/utterance)Automatic Evaluation
2
Model NameRankOverall Human RatingInterestingEngagingSpecificRelevantCorrect
Semantically Appropriate
Understandable
FluentUSR USR-MLMUSR-DRcUSR-DRfMETEORBERTScore
3
diverse-cap-model_v014.2813559322.8745762712.8542372882.5627118642.8813559322.667796612.8779661020.9932203392.9118644074.257584587-0.8198696290.9563142020.5875117150.0905534250.849129942
4
more-diverse-response_v314.2804054052.8141891892.8479729732.5439189192.8682432432.6520270272.8682432430.9966101692.8885135144.586287669-0.8484471490.9761609150.8308325880.1311232670.861703992
5
final-results-test-plato-best-1004_v014.2798634812.8395904442.8703071672.5631399322.8976109222.6860068262.9146757680.9965870312.9146757683.863344795-0.9433333070.9455581270.5831199880.0683225970.84358041
6
nofact-data-finetune_v044.2602739732.8184931512.8082191782.6061643842.8493150682.6849315072.80479452112.9075342474.25778592-0.7947828520.9444829470.5792681610.0896379630.851451581
7
dialogpt-fact-dialogrpt-v0_v054.2525597272.8600682592.8771331062.5767918092.8703071672.7167235492.904436860.9965870312.9078498294.832009598-0.6884073930.9521514330.7733437110.1237065510.862131507
8
gpt2-ed-baseline_v44.2310344832.8206896552.8103448282.6206896552.8517241382.7103448282.8482758620.9862068972.8620689664.730429174-0.7569146450.9493816510.8506834850.1231277420.863243632
9
finetune-with-large-model_v44.2294520552.8082191782.7979452052.6301369862.8082191782.6404109592.89726027412.8698630144.477763749-0.83512790.9392944910.8581291780.1395667750.868364188
10
filter-data-finetune_v04.2286689422.8464163822.8259385672.6109215022.8703071672.6996587032.8737201370.9897610922.8805460754.451502643-0.8874755960.9741011140.8163394920.1251308850.85913778
11
diverse-ensemble-model_v04.2157534252.8219178082.8424657532.6198630142.8253424662.6917808222.8767123290.9897260272.8869863014.464064719-0.8308504070.9637877640.7445479390.1213604670.859138791
12
cruzcontrol-baseline-bert-knowledge_v14.2096219932.8487972512.8694158082.5945017182.8213058422.6563573882.8659793810.9931271482.8900343644.534744193-0.8141195220.9502664730.8159801430.1087158730.856524662
13
final-results-test-1004-m-w_v04.2055749132.811846692.8571428572.6132404182.8327526132.6794425092.8919860630.9930313592.8989547044.509827172-0.8023721320.9480608070.7797268910.1599734910.873781577
14
power-dialog-chat-k_v04.1789473682.8105263162.8666666672.4842105262.8350877192.5964912282.8666666670.9929824562.9157894742.595068664-0.9047907480.698970310.4478889210.0739844140.843421203
15
pretrained-diverse-v2_v104.1774744032.8293515362.8054607512.5392491472.8259385672.6245733792.8156996590.9965870312.8805460754.977801334-0.7679588960.990274040.9121525960.1349862560.86494546
16
tx-zp-nofact-ensemble-rerank_v04.1774744032.7781569972.7849829352.5767918092.8464163822.6712328772.8464163820.9931740612.8805460754.618743837-0.6976893260.9454559080.6513377980.1095421890.856891338
17
final-results-test-online-w_v04.1718213062.8109965642.8419243992.5931034482.7972508592.6862068972.8625429550.9965635742.9243986253.867634857-0.9667730730.9386873820.6628951830.0718524370.844197832
18
final-results-test-rpt-all-w_v04.1672354952.8156996592.8054607512.5945017182.7918088742.621160412.8259385670.9863481232.9283276453.958319994-0.9475334960.9442819990.6715612130.0826697270.847489246
19
cruzcontrol-swbd-pd-nrg-topline_v14.1569965872.7781569972.7645051192.5563139932.7679180892.573378842.8327645050.9726962462.9078498293.404301374-0.8615522870.7444786660.8168069220.1130860540.857690959
20
tx-zp-fact-ensemble-rerank-rerank_v04.1522491352.7785467132.7716262982.550173012.8200692042.6989619382.840830450.9930795852.9031141874.863434695-0.6862071110.95972050.7643658110.1228394380.860801831
21
transformer-gpt2-like_v14.1517241382.7896551722.7862068972.5275862072.7827586212.6172413792.858620690.9930795852.9172413792.467925677-0.9104086930.6757691680.4488007860.0732116010.842835034
22
tx-zp-fact-ensemble-rerank_v04.1477663232.8109965642.8178694162.5085910652.8213058422.6872852232.8659793810.9931271482.8865979384.833800817-0.6925528030.9571033380.7646767860.1215138940.861417566
23
dialogpt-large-finetune-fact-head-V1_v14.1399317412.7440273042.7235494882.5665529012.8395904442.6267123292.85273972612.8805460754.551654628-0.7280219120.926308940.7367558690.1106900840.859520007
24
dialogpt-small-fintune-fact-head_v24.1301369862.7979452052.8013698632.541095892.791095892.6575342472.8356164380.9931506852.8972602744.125210576-0.7379074560.8816319520.5939084420.0899122170.85323514
25
final-results-test-1004-sum-w_v04.1296928332.8054607512.7952218432.5358361772.8122866892.6791808872.8532423210.9897610922.9078498294.34279219-0.8305499660.9308176710.7747743740.1471374040.875802402
26
finetune-with-mid-model_v04.1296928332.8225255972.8259385672.5938566552.7883959042.621160412.8532423210.9795221842.8532423214.909336877-0.7249554540.9588531710.8865951130.1561750970.868694613
27
middle-diverse-response_v04.1280276822.8373702422.8166089972.6193771632.8200692042.68166092.8442906570.9896193772.8961937724.409917531-0.9029451310.9747972370.814106480.1199968320.858372706
28
cruzcontrol-bert-sentence-knowledge_v24.1172413792.758620692.82.4931034482.8275862072.5931034482.8241379310.9827586212.93.96149749-0.8439458130.8765825840.711442720.119018960.859797703
29
final-results-test-rpt-w_v04.1118881122.7832167832.7657342662.510489512.7937062942.6083916082.849650350.989510492.8776223784.46641587-0.8118501370.9445619670.7792902140.1540208140.875045863
30
sk-attn-300-test_v04.0893470792.7835051552.7938144332.4845360822.7663230242.6068965522.8625429550.9896907222.8900343641.713615542-0.9448069020.5605479140.370726120.05492130.83887378
31
sk-attn-100-test_v04.0862068972.7517241382.7793103452.4862068972.7758620692.5655172412.841379310.9862068972.9034482761.400817344-0.96659040.5117436870.3583476920.0487782950.836409242
32
GPT-medium-fact-plus-history_v04.0787671232.7876712332.8013698632.5342465752.7945205482.6164383562.8253424660.9760273972.8321917814.686322492-0.7752058650.948132520.8599447350.127716030.859424687
33
cruzcontrol-kd-pd-nrg-topline_v14.0313588852.7700348432.8013937282.5400696862.7804878052.5714285712.7700348430.9895470382.9233449483.318520033-0.9360776760.7796922230.7741531570.0895453290.848289946
34
fine-tune-on-DialoGPT-M_v33.9254237292.667796612.7423728812.4813559322.7016949152.5423728812.7457627120.9625850342.8372881361.674030843-1.1013089760.6305962110.4064368070.0463211220.831202384
35
mdb-ret-20M_v03.8831615122.6391752582.7044673542.4740484432.7388316152.5635738832.7663230240.9518900342.814432991.351498884-1.3448839910.6950624690.4275321560.0327111370.822683892
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100