ABCDEF
1
StepTraining lossRewardReward stdCompletion lengthKullback-Leibler divergence
2
100.0210.051439105.1666720
3
200.0210.051439232.8333440
4
3000263.6666870.000519
5
400.0688330.07647996.50.000627
6
5000210.50.000493
7
60-0.0378330.129257278.8333440.000479
8
700.0833330.204124205.3333440.000566
9
80-0.0060.08023237.1666720.00051
10
90-0.0608330.149011162.3333440.00083
11
100-0.05050.131693225.1666720.000593
12
110-0.0368330.192641337.3333440.000408
13
1200.02050.050215343.3333440.000544
14
1300.0210.0514392230.000401
15
14000163.6666720.000446
16
1500.0208330.051031363.3333440.000637
17
16000329.3333440.000436
18
1700.0413330.064046134.1666720.000595
19
1800.0330.080833312.1666870.000618
20
1900.0210.051439399.1666870.000533
21
2000.38250.93693248.50.000697
22
2100.4751670.996825212.50.000358
23
220-0.3316670.515408354.3333440.000592
24
230-0.0078330.019188234.3333440.000477
25
2400.0416670.06455430.50.000394
26
2500.3741671.077317207.3333440.000429
27
260-0.00250.006124294.6666870.000562
28
270-0.0236670.116644236.3333440.00046
29
2800.0061670.072747213.6666720.000565
30
2900.4783330.988421246.8333440.000371
31
3000.2891.05074351.50.000407
32
3100.1166670.195212136.3333440.00056
33
3200.0076670.120974173.3333440.000527
34
330-0.1371670.469806270.6666870.000823
35
3400.0210.051439152.8333440.000613
36
35000151.8333440.000556
37
3600.1461670.18395320.3333440.00054
38
370-0.0391670.0959381990.000431
39
380-0.0053330.221805227.8333440.000539
40
3900.0156670.131284185.1666720.000642
41
4000.0456670.07138209.6666720.000643
42
4100.4108331.039501137.6666720.000704
43
420-0.0826670.193798262.3333440.000672
44
4300.1178330.233614183.8333440.000602
45
4400.4181671.024295323.50.000599
46
45000313.50.000499
47
4600.0228330.07715891.1666720.000604
48
4700.09350.0865231380.000518
49
4800.0210.051439297.6666870.000383
50
4900.030.051753270.50.000343
51
5000.37550.798718271.1666870.000345
52
5100.0210.0514391910.000554
53
5200.04050.103173251.1666720.000664
54
53000347.6666870.000634
55
5400.0840.065066224.3333440.000437
56
550-0.0063330.111744190.1666720.00035
57
5600.0210.051439328.6666870.000449
58
5700.4471671.008639159.3333440.000502
59
5800.84651.263569226.50.000905
60
5900.43250.938456215.1666720.000731
61
600-0.2241670.5490942880.000711
62
61000242.8333440.000851
63
6200.0096670.023678288.8333440.000557
64
63000308.6666870.000558
65
6400.31250.765466329.8333440.000435
66
6500.5593330.981628206.8333440.000877
67
6600.4356671.067161156.6666720.000593
68
6700.0208330.051031422.3333440.000591
69
6800.0210.051439341.1666870.000506
70
6900.4381671.0128121620.000504
71
7000.0836670.102471352.1666870.000773
72
710-0.0076670.1550512810.000676
73
7200.32550.797309276.6666870.000752
74
7300.0920.262743189.1666720.001047
75
7400.0210.051439245.50.00054
76
7500.07050.172689267.1666870.000633
77
7600.0503330.082376300.8333440.000701
78
7700.01250.030619291.1666870.000751
79
7800.0351670.059674130.6666720.000928
80
790.00010.9078331.357852107.6666720.001437
81
8000.4543330.929755288.1666870.000606
82
8100.4581670.94791295.1666870.000795
83
8200.9503331.274332161.1666720.00087
84
830-0.0163330.113535257.6666870.000922
85
8400.7176671.351253173.3333440.001187
86
85000267.50.00086
87
8600.10450.188538307.1666870.000746
88
870.0001-0.0086670.113706230.3333440.001573
89
8800.0210.0514394520.001017
90
890.00011.3531.393332132.6666720.002484
91
900.00010.0213330.125264321.8333440.001681
92
910.00010.0506670.116754247.6666720.00164
93
9200.0210.051439378.1666870.000786
94
930.00010.0913330.094825156.1666720.001658
95
940.00030.1260100.3333360.004215
96
950.00010.4198331.02174121.3333360.003343
97
960.00010.3366670.948699317.1666870.002308
98
970.00010.06750.125424356.50.002415
99
980.00010.04850.121284153.6666720.00371
100
990.00020.0871670.109183196.1666720.00398