diff --git "a/adapter/checkpoint-610/trainer_state.json" "b/adapter/checkpoint-610/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/adapter/checkpoint-610/trainer_state.json"
@@ -0,0 +1,4291 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9987720016373312,
+  "eval_steps": 500,
+  "global_step": 610,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.3300960063934326,
+      "learning_rate": 2.9999999999999997e-05,
+      "loss": 0.9966,
+      "step": 1
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.4113194942474365,
+      "learning_rate": 5.9999999999999995e-05,
+      "loss": 1.1253,
+      "step": 2
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 0.2486647665500641,
+      "learning_rate": 8.999999999999999e-05,
+      "loss": 1.0721,
+      "step": 3
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.2249160259962082,
+      "learning_rate": 0.00011999999999999999,
+      "loss": 0.9033,
+      "step": 4
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.3706735074520111,
+      "learning_rate": 0.00015,
+      "loss": 1.0498,
+      "step": 5
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.28104931116104126,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 0.9108,
+      "step": 6
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.27497801184654236,
+      "learning_rate": 0.00020999999999999998,
+      "loss": 0.9038,
+      "step": 7
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.30283215641975403,
+      "learning_rate": 0.00023999999999999998,
+      "loss": 0.8605,
+      "step": 8
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.33457252383232117,
+      "learning_rate": 0.00027,
+      "loss": 0.9049,
+      "step": 9
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.37725692987442017,
+      "learning_rate": 0.0003,
+      "loss": 0.772,
+      "step": 10
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.27986466884613037,
+      "learning_rate": 0.00029975206611570246,
+      "loss": 0.7666,
+      "step": 11
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.30687034130096436,
+      "learning_rate": 0.00029950413223140494,
+      "loss": 0.8312,
+      "step": 12
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.3321741819381714,
+      "learning_rate": 0.0002992561983471074,
+      "loss": 0.8308,
+      "step": 13
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.29080134630203247,
+      "learning_rate": 0.0002990082644628099,
+      "loss": 0.7597,
+      "step": 14
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.33823856711387634,
+      "learning_rate": 0.0002987603305785124,
+      "loss": 0.8693,
+      "step": 15
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.3461182117462158,
+      "learning_rate": 0.0002985123966942149,
+      "loss": 1.0571,
+      "step": 16
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.22306275367736816,
+      "learning_rate": 0.0002982644628099173,
+      "loss": 0.7706,
+      "step": 17
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 154.4940643310547,
+      "learning_rate": 0.0002980165289256198,
+      "loss": 2.6519,
+      "step": 18
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.22956405580043793,
+      "learning_rate": 0.00029776859504132227,
+      "loss": 0.6897,
+      "step": 19
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.25711989402770996,
+      "learning_rate": 0.00029752066115702476,
+      "loss": 0.7338,
+      "step": 20
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.2565441131591797,
+      "learning_rate": 0.00029727272727272724,
+      "loss": 0.8211,
+      "step": 21
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.2437434047460556,
+      "learning_rate": 0.0002970247933884297,
+      "loss": 0.8027,
+      "step": 22
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.21284469962120056,
+      "learning_rate": 0.0002967768595041322,
+      "loss": 0.7944,
+      "step": 23
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.23338356614112854,
+      "learning_rate": 0.0002965289256198347,
+      "loss": 0.7696,
+      "step": 24
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.25512659549713135,
+      "learning_rate": 0.0002962809917355372,
+      "loss": 0.7693,
+      "step": 25
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.19500921666622162,
+      "learning_rate": 0.0002960330578512396,
+      "loss": 0.7599,
+      "step": 26
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.2554054260253906,
+      "learning_rate": 0.00029578512396694214,
+      "loss": 0.966,
+      "step": 27
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.17682747542858124,
+      "learning_rate": 0.0002955371900826446,
+      "loss": 0.676,
+      "step": 28
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.20516635477542877,
+      "learning_rate": 0.0002952892561983471,
+      "loss": 0.8144,
+      "step": 29
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.3275119662284851,
+      "learning_rate": 0.0002950413223140496,
+      "loss": 0.7704,
+      "step": 30
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.22231778502464294,
+      "learning_rate": 0.000294793388429752,
+      "loss": 0.7614,
+      "step": 31
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.17065812647342682,
+      "learning_rate": 0.0002945454545454545,
+      "loss": 0.5634,
+      "step": 32
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.1771956831216812,
+      "learning_rate": 0.000294297520661157,
+      "loss": 0.7607,
+      "step": 33
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.26693442463874817,
+      "learning_rate": 0.00029404958677685947,
+      "loss": 0.8171,
+      "step": 34
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 1.409070611000061,
+      "learning_rate": 0.00029380165289256196,
+      "loss": 0.7791,
+      "step": 35
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.20727217197418213,
+      "learning_rate": 0.00029355371900826444,
+      "loss": 0.7357,
+      "step": 36
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.2145707905292511,
+      "learning_rate": 0.0002933057851239669,
+      "loss": 0.8458,
+      "step": 37
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.2068527340888977,
+      "learning_rate": 0.0002930578512396694,
+      "loss": 0.78,
+      "step": 38
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.22432388365268707,
+      "learning_rate": 0.00029280991735537184,
+      "loss": 0.8523,
+      "step": 39
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.19982610642910004,
+      "learning_rate": 0.0002925619834710743,
+      "loss": 0.7372,
+      "step": 40
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 6.248472213745117,
+      "learning_rate": 0.00029231404958677686,
+      "loss": 0.7399,
+      "step": 41
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.2269737422466278,
+      "learning_rate": 0.00029206611570247934,
+      "loss": 0.7842,
+      "step": 42
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.23117898404598236,
+      "learning_rate": 0.0002918181818181818,
+      "loss": 0.7111,
+      "step": 43
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.22466522455215454,
+      "learning_rate": 0.00029157024793388425,
+      "loss": 0.8979,
+      "step": 44
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.20770332217216492,
+      "learning_rate": 0.00029132231404958674,
+      "loss": 0.774,
+      "step": 45
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.2376495748758316,
+      "learning_rate": 0.0002910743801652892,
+      "loss": 0.7216,
+      "step": 46
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.2470778226852417,
+      "learning_rate": 0.0002908264462809917,
+      "loss": 0.7369,
+      "step": 47
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.23465900123119354,
+      "learning_rate": 0.0002905785123966942,
+      "loss": 0.7528,
+      "step": 48
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5718627572059631,
+      "learning_rate": 0.00029033057851239667,
+      "loss": 0.7535,
+      "step": 49
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.21493370831012726,
+      "learning_rate": 0.00029008264462809916,
+      "loss": 0.8593,
+      "step": 50
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.21197210252285004,
+      "learning_rate": 0.00028983471074380164,
+      "loss": 0.8013,
+      "step": 51
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.20836398005485535,
+      "learning_rate": 0.0002895867768595041,
+      "loss": 0.7905,
+      "step": 52
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.2096678912639618,
+      "learning_rate": 0.00028933884297520655,
+      "loss": 0.6754,
+      "step": 53
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.25898435711860657,
+      "learning_rate": 0.00028909090909090904,
+      "loss": 0.7725,
+      "step": 54
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.23370735347270966,
+      "learning_rate": 0.0002888429752066116,
+      "loss": 0.7007,
+      "step": 55
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.23006942868232727,
+      "learning_rate": 0.00028859504132231406,
+      "loss": 0.7534,
+      "step": 56
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.20855402946472168,
+      "learning_rate": 0.0002883471074380165,
+      "loss": 0.9491,
+      "step": 57
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.24340493977069855,
+      "learning_rate": 0.00028809917355371897,
+      "loss": 0.8089,
+      "step": 58
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.20169466733932495,
+      "learning_rate": 0.00028785123966942145,
+      "loss": 0.64,
+      "step": 59
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.23272906243801117,
+      "learning_rate": 0.00028760330578512394,
+      "loss": 0.8456,
+      "step": 60
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.1767100691795349,
+      "learning_rate": 0.0002873553719008264,
+      "loss": 0.6686,
+      "step": 61
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.24511106312274933,
+      "learning_rate": 0.0002871074380165289,
+      "loss": 0.6998,
+      "step": 62
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.22284479439258575,
+      "learning_rate": 0.0002868595041322314,
+      "loss": 0.6699,
+      "step": 63
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.21842750906944275,
+      "learning_rate": 0.00028661157024793387,
+      "loss": 0.7413,
+      "step": 64
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.2669163644313812,
+      "learning_rate": 0.00028636363636363636,
+      "loss": 0.931,
+      "step": 65
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.1864808052778244,
+      "learning_rate": 0.0002861157024793388,
+      "loss": 0.5652,
+      "step": 66
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.18369853496551514,
+      "learning_rate": 0.00028586776859504127,
+      "loss": 0.6847,
+      "step": 67
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.22353056073188782,
+      "learning_rate": 0.00028561983471074375,
+      "loss": 0.598,
+      "step": 68
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.20269523561000824,
+      "learning_rate": 0.0002853719008264463,
+      "loss": 0.8688,
+      "step": 69
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.2291198968887329,
+      "learning_rate": 0.0002851239669421488,
+      "loss": 0.7535,
+      "step": 70
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.22033120691776276,
+      "learning_rate": 0.0002848760330578512,
+      "loss": 0.8377,
+      "step": 71
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.2687983214855194,
+      "learning_rate": 0.0002846280991735537,
+      "loss": 0.6926,
+      "step": 72
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.1933681070804596,
+      "learning_rate": 0.00028438016528925617,
+      "loss": 0.6276,
+      "step": 73
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.2820705473423004,
+      "learning_rate": 0.00028413223140495865,
+      "loss": 0.848,
+      "step": 74
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.19532324373722076,
+      "learning_rate": 0.00028388429752066114,
+      "loss": 0.6198,
+      "step": 75
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.25057846307754517,
+      "learning_rate": 0.0002836363636363636,
+      "loss": 0.6838,
+      "step": 76
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.2168462574481964,
+      "learning_rate": 0.0002833884297520661,
+      "loss": 0.7885,
+      "step": 77
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.2106674313545227,
+      "learning_rate": 0.0002831404958677686,
+      "loss": 0.6757,
+      "step": 78
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.24460363388061523,
+      "learning_rate": 0.000282892561983471,
+      "loss": 0.7414,
+      "step": 79
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.3706071078777313,
+      "learning_rate": 0.0002826446280991735,
+      "loss": 0.621,
+      "step": 80
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.2251998782157898,
+      "learning_rate": 0.000282396694214876,
+      "loss": 0.7453,
+      "step": 81
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.24521738290786743,
+      "learning_rate": 0.00028214876033057847,
+      "loss": 0.6985,
+      "step": 82
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.2262742966413498,
+      "learning_rate": 0.000281900826446281,
+      "loss": 0.6316,
+      "step": 83
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.19723354279994965,
+      "learning_rate": 0.00028165289256198344,
+      "loss": 0.4798,
+      "step": 84
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.20684833824634552,
+      "learning_rate": 0.0002814049586776859,
+      "loss": 0.7993,
+      "step": 85
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.19534814357757568,
+      "learning_rate": 0.0002811570247933884,
+      "loss": 0.7735,
+      "step": 86
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.2585545480251312,
+      "learning_rate": 0.0002809090909090909,
+      "loss": 0.8126,
+      "step": 87
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.2510583996772766,
+      "learning_rate": 0.00028066115702479337,
+      "loss": 0.6973,
+      "step": 88
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.1884051263332367,
+      "learning_rate": 0.00028041322314049585,
+      "loss": 0.701,
+      "step": 89
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.2526257038116455,
+      "learning_rate": 0.00028016528925619834,
+      "loss": 0.7132,
+      "step": 90
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.200734481215477,
+      "learning_rate": 0.0002799173553719008,
+      "loss": 0.7024,
+      "step": 91
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.2404022514820099,
+      "learning_rate": 0.0002796694214876033,
+      "loss": 0.704,
+      "step": 92
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.23063871264457703,
+      "learning_rate": 0.00027942148760330573,
+      "loss": 0.6312,
+      "step": 93
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.1759747564792633,
+      "learning_rate": 0.0002791735537190082,
+      "loss": 0.6577,
+      "step": 94
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.2009582370519638,
+      "learning_rate": 0.0002789256198347107,
+      "loss": 0.8036,
+      "step": 95
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.2200164943933487,
+      "learning_rate": 0.0002786776859504132,
+      "loss": 0.7101,
+      "step": 96
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.19693537056446075,
+      "learning_rate": 0.00027842975206611567,
+      "loss": 0.6221,
+      "step": 97
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.23269779980182648,
+      "learning_rate": 0.00027818181818181815,
+      "loss": 0.8264,
+      "step": 98
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.2440226823091507,
+      "learning_rate": 0.00027793388429752064,
+      "loss": 0.8051,
+      "step": 99
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.2307034134864807,
+      "learning_rate": 0.0002776859504132231,
+      "loss": 0.631,
+      "step": 100
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.2530567944049835,
+      "learning_rate": 0.0002774380165289256,
+      "loss": 0.8616,
+      "step": 101
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.2808806300163269,
+      "learning_rate": 0.0002771900826446281,
+      "loss": 0.8333,
+      "step": 102
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.20667941868305206,
+      "learning_rate": 0.00027694214876033057,
+      "loss": 0.7212,
+      "step": 103
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.17540781199932098,
+      "learning_rate": 0.00027669421487603305,
+      "loss": 0.5964,
+      "step": 104
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.2526637613773346,
+      "learning_rate": 0.00027644628099173554,
+      "loss": 0.6868,
+      "step": 105
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.2137339860200882,
+      "learning_rate": 0.00027619834710743797,
+      "loss": 0.6155,
+      "step": 106
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.21061092615127563,
+      "learning_rate": 0.00027595041322314045,
+      "loss": 0.813,
+      "step": 107
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.21619191765785217,
+      "learning_rate": 0.00027570247933884293,
+      "loss": 0.8046,
+      "step": 108
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.2212170660495758,
+      "learning_rate": 0.0002754545454545454,
+      "loss": 0.6706,
+      "step": 109
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.23427413403987885,
+      "learning_rate": 0.0002752066115702479,
+      "loss": 0.7152,
+      "step": 110
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.20566123723983765,
+      "learning_rate": 0.0002749586776859504,
+      "loss": 0.6568,
+      "step": 111
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.22977930307388306,
+      "learning_rate": 0.00027471074380165287,
+      "loss": 0.7832,
+      "step": 112
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.28307485580444336,
+      "learning_rate": 0.00027446280991735535,
+      "loss": 0.7446,
+      "step": 113
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.19567596912384033,
+      "learning_rate": 0.00027421487603305784,
+      "loss": 0.6394,
+      "step": 114
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.24577689170837402,
+      "learning_rate": 0.0002739669421487603,
+      "loss": 0.6389,
+      "step": 115
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.2180463820695877,
+      "learning_rate": 0.0002737190082644628,
+      "loss": 0.7814,
+      "step": 116
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.19546380639076233,
+      "learning_rate": 0.0002734710743801653,
+      "loss": 0.8312,
+      "step": 117
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.22698360681533813,
+      "learning_rate": 0.00027322314049586777,
+      "loss": 0.7443,
+      "step": 118
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.22987066209316254,
+      "learning_rate": 0.0002729752066115702,
+      "loss": 0.7839,
+      "step": 119
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.20548178255558014,
+      "learning_rate": 0.0002727272727272727,
+      "loss": 0.7805,
+      "step": 120
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.2477702796459198,
+      "learning_rate": 0.00027247933884297517,
+      "loss": 0.5694,
+      "step": 121
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.20593340694904327,
+      "learning_rate": 0.00027223140495867765,
+      "loss": 0.6479,
+      "step": 122
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.23635917901992798,
+      "learning_rate": 0.00027198347107438013,
+      "loss": 0.8107,
+      "step": 123
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.25808119773864746,
+      "learning_rate": 0.0002717355371900826,
+      "loss": 0.7876,
+      "step": 124
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.22156469523906708,
+      "learning_rate": 0.0002714876033057851,
+      "loss": 0.7261,
+      "step": 125
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.19892215728759766,
+      "learning_rate": 0.0002712396694214876,
+      "loss": 0.6874,
+      "step": 126
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.24936752021312714,
+      "learning_rate": 0.00027099173553719007,
+      "loss": 0.6155,
+      "step": 127
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.23287539184093475,
+      "learning_rate": 0.0002707438016528925,
+      "loss": 0.602,
+      "step": 128
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.2086639404296875,
+      "learning_rate": 0.00027049586776859504,
+      "loss": 0.7198,
+      "step": 129
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.24974922835826874,
+      "learning_rate": 0.0002702479338842975,
+      "loss": 0.6873,
+      "step": 130
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.2066827118396759,
+      "learning_rate": 0.00027,
+      "loss": 0.5821,
+      "step": 131
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.28004395961761475,
+      "learning_rate": 0.0002697520661157025,
+      "loss": 0.7864,
+      "step": 132
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.22391608357429504,
+      "learning_rate": 0.0002695041322314049,
+      "loss": 0.6773,
+      "step": 133
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.2821199297904968,
+      "learning_rate": 0.0002692561983471074,
+      "loss": 0.6806,
+      "step": 134
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.21736428141593933,
+      "learning_rate": 0.0002690082644628099,
+      "loss": 0.6662,
+      "step": 135
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.23889939486980438,
+      "learning_rate": 0.00026876033057851237,
+      "loss": 0.6356,
+      "step": 136
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.21096719801425934,
+      "learning_rate": 0.00026851239669421485,
+      "loss": 0.6762,
+      "step": 137
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.22622421383857727,
+      "learning_rate": 0.00026826446280991733,
+      "loss": 0.8085,
+      "step": 138
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.19824957847595215,
+      "learning_rate": 0.0002680165289256198,
+      "loss": 0.6031,
+      "step": 139
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.24482691287994385,
+      "learning_rate": 0.0002677685950413223,
+      "loss": 0.6649,
+      "step": 140
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.21291929483413696,
+      "learning_rate": 0.0002675206611570248,
+      "loss": 0.6671,
+      "step": 141
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.2202674299478531,
+      "learning_rate": 0.0002672727272727272,
+      "loss": 0.6469,
+      "step": 142
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.23572632670402527,
+      "learning_rate": 0.0002670247933884297,
+      "loss": 0.7377,
+      "step": 143
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.2051907777786255,
+      "learning_rate": 0.00026677685950413224,
+      "loss": 0.6217,
+      "step": 144
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.23270072042942047,
+      "learning_rate": 0.0002665289256198347,
+      "loss": 0.7933,
+      "step": 145
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.20652809739112854,
+      "learning_rate": 0.00026628099173553715,
+      "loss": 0.6007,
+      "step": 146
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.23084674775600433,
+      "learning_rate": 0.00026603305785123963,
+      "loss": 0.701,
+      "step": 147
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.25663891434669495,
+      "learning_rate": 0.0002657851239669421,
+      "loss": 0.7271,
+      "step": 148
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.25880497694015503,
+      "learning_rate": 0.0002655371900826446,
+      "loss": 0.6562,
+      "step": 149
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.19349205493927002,
+      "learning_rate": 0.0002652892561983471,
+      "loss": 0.5016,
+      "step": 150
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.2401740401983261,
+      "learning_rate": 0.00026504132231404957,
+      "loss": 0.6978,
+      "step": 151
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.19495394825935364,
+      "learning_rate": 0.00026479338842975205,
+      "loss": 0.5562,
+      "step": 152
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.21485286951065063,
+      "learning_rate": 0.00026454545454545453,
+      "loss": 0.7847,
+      "step": 153
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.241348534822464,
+      "learning_rate": 0.000264297520661157,
+      "loss": 0.7513,
+      "step": 154
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.3316986858844757,
+      "learning_rate": 0.00026404958677685945,
+      "loss": 0.664,
+      "step": 155
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.2419958859682083,
+      "learning_rate": 0.00026380165289256193,
+      "loss": 0.7322,
+      "step": 156
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.2868640124797821,
+      "learning_rate": 0.0002635537190082644,
+      "loss": 0.7004,
+      "step": 157
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.24806949496269226,
+      "learning_rate": 0.00026330578512396695,
+      "loss": 0.6497,
+      "step": 158
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.23873400688171387,
+      "learning_rate": 0.00026305785123966944,
+      "loss": 0.7543,
+      "step": 159
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.2480355203151703,
+      "learning_rate": 0.00026280991735537187,
+      "loss": 0.6048,
+      "step": 160
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.2619112730026245,
+      "learning_rate": 0.00026256198347107435,
+      "loss": 0.762,
+      "step": 161
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.22763262689113617,
+      "learning_rate": 0.00026231404958677683,
+      "loss": 0.6557,
+      "step": 162
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.3291528522968292,
+      "learning_rate": 0.0002620661157024793,
+      "loss": 0.7059,
+      "step": 163
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.2959338426589966,
+      "learning_rate": 0.0002618181818181818,
+      "loss": 0.6622,
+      "step": 164
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.23001112043857574,
+      "learning_rate": 0.0002615702479338843,
+      "loss": 0.6465,
+      "step": 165
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.1998877376317978,
+      "learning_rate": 0.00026132231404958677,
+      "loss": 0.666,
+      "step": 166
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.23009613156318665,
+      "learning_rate": 0.00026107438016528925,
+      "loss": 0.8793,
+      "step": 167
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.24525685608386993,
+      "learning_rate": 0.0002608264462809917,
+      "loss": 0.8009,
+      "step": 168
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.21605077385902405,
+      "learning_rate": 0.00026057851239669416,
+      "loss": 0.5459,
+      "step": 169
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.2576725482940674,
+      "learning_rate": 0.00026033057851239665,
+      "loss": 0.6818,
+      "step": 170
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.23385170102119446,
+      "learning_rate": 0.00026008264462809913,
+      "loss": 0.7559,
+      "step": 171
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.1973017454147339,
+      "learning_rate": 0.00025983471074380167,
+      "loss": 0.6798,
+      "step": 172
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.22262559831142426,
+      "learning_rate": 0.0002595867768595041,
+      "loss": 0.5566,
+      "step": 173
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.23010462522506714,
+      "learning_rate": 0.0002593388429752066,
+      "loss": 0.7101,
+      "step": 174
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.21676452457904816,
+      "learning_rate": 0.00025909090909090907,
+      "loss": 0.7038,
+      "step": 175
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.22475261986255646,
+      "learning_rate": 0.00025884297520661155,
+      "loss": 0.7812,
+      "step": 176
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.28893202543258667,
+      "learning_rate": 0.00025859504132231403,
+      "loss": 0.5925,
+      "step": 177
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.22777552902698517,
+      "learning_rate": 0.0002583471074380165,
+      "loss": 0.7319,
+      "step": 178
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.2287953644990921,
+      "learning_rate": 0.000258099173553719,
+      "loss": 0.7775,
+      "step": 179
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.2049843668937683,
+      "learning_rate": 0.0002578512396694215,
+      "loss": 0.7448,
+      "step": 180
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.22585280239582062,
+      "learning_rate": 0.00025760330578512397,
+      "loss": 0.59,
+      "step": 181
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.23159150779247284,
+      "learning_rate": 0.0002573553719008264,
+      "loss": 0.737,
+      "step": 182
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.3393082320690155,
+      "learning_rate": 0.0002571074380165289,
+      "loss": 0.6948,
+      "step": 183
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.2345617413520813,
+      "learning_rate": 0.00025685950413223136,
+      "loss": 0.6351,
+      "step": 184
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.23474591970443726,
+      "learning_rate": 0.00025661157024793385,
+      "loss": 0.6643,
+      "step": 185
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.2473030984401703,
+      "learning_rate": 0.00025636363636363633,
+      "loss": 0.7663,
+      "step": 186
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.2971685230731964,
+      "learning_rate": 0.0002561157024793388,
+      "loss": 0.7449,
+      "step": 187
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.2745087742805481,
+      "learning_rate": 0.0002558677685950413,
+      "loss": 0.6125,
+      "step": 188
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.23520545661449432,
+      "learning_rate": 0.0002556198347107438,
+      "loss": 0.573,
+      "step": 189
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.2955464720726013,
+      "learning_rate": 0.00025537190082644627,
+      "loss": 0.5315,
+      "step": 190
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.23987281322479248,
+      "learning_rate": 0.00025512396694214875,
+      "loss": 0.5636,
+      "step": 191
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.24263744056224823,
+      "learning_rate": 0.00025487603305785123,
+      "loss": 0.6047,
+      "step": 192
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.26061922311782837,
+      "learning_rate": 0.0002546280991735537,
+      "loss": 0.7812,
+      "step": 193
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.2458687126636505,
+      "learning_rate": 0.0002543801652892562,
+      "loss": 0.58,
+      "step": 194
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.24598994851112366,
+      "learning_rate": 0.00025413223140495863,
+      "loss": 0.7432,
+      "step": 195
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.248992919921875,
+      "learning_rate": 0.0002538842975206611,
+      "loss": 0.6953,
+      "step": 196
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.2518531382083893,
+      "learning_rate": 0.0002536363636363636,
+      "loss": 0.6707,
+      "step": 197
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.23844210803508759,
+      "learning_rate": 0.0002533884297520661,
+      "loss": 0.6285,
+      "step": 198
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.21948237717151642,
+      "learning_rate": 0.00025314049586776856,
+      "loss": 0.6859,
+      "step": 199
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.2003835141658783,
+      "learning_rate": 0.00025289256198347105,
+      "loss": 0.6305,
+      "step": 200
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.23421582579612732,
+      "learning_rate": 0.00025264462809917353,
+      "loss": 0.7164,
+      "step": 201
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.22344104945659637,
+      "learning_rate": 0.000252396694214876,
+      "loss": 0.6498,
+      "step": 202
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.17792212963104248,
+      "learning_rate": 0.0002521487603305785,
+      "loss": 0.614,
+      "step": 203
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.217886820435524,
+      "learning_rate": 0.000251900826446281,
+      "loss": 0.7033,
+      "step": 204
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.209726020693779,
+      "learning_rate": 0.00025165289256198347,
+      "loss": 0.5913,
+      "step": 205
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.2401910424232483,
+      "learning_rate": 0.00025140495867768595,
+      "loss": 0.6405,
+      "step": 206
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.21315626800060272,
+      "learning_rate": 0.00025115702479338843,
+      "loss": 0.7369,
+      "step": 207
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.20102320611476898,
+      "learning_rate": 0.00025090909090909086,
+      "loss": 0.6245,
+      "step": 208
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.20447981357574463,
+      "learning_rate": 0.00025066115702479335,
+      "loss": 0.5423,
+      "step": 209
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.24979281425476074,
+      "learning_rate": 0.00025041322314049583,
+      "loss": 0.8078,
+      "step": 210
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.20141547918319702,
+      "learning_rate": 0.0002501652892561983,
+      "loss": 0.7386,
+      "step": 211
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.2538990378379822,
+      "learning_rate": 0.0002499173553719008,
+      "loss": 0.7219,
+      "step": 212
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.2613961100578308,
+      "learning_rate": 0.0002496694214876033,
+      "loss": 0.7903,
+      "step": 213
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.24777857959270477,
+      "learning_rate": 0.00024942148760330576,
+      "loss": 0.664,
+      "step": 214
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.21958425641059875,
+      "learning_rate": 0.00024917355371900825,
+      "loss": 0.6755,
+      "step": 215
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.2218528538942337,
+      "learning_rate": 0.00024892561983471073,
+      "loss": 0.5568,
+      "step": 216
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.23632755875587463,
+      "learning_rate": 0.00024867768595041316,
+      "loss": 0.6858,
+      "step": 217
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.2641279697418213,
+      "learning_rate": 0.0002484297520661157,
+      "loss": 0.7783,
+      "step": 218
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.3147680163383484,
+      "learning_rate": 0.0002481818181818182,
+      "loss": 0.662,
+      "step": 219
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.27947697043418884,
+      "learning_rate": 0.00024793388429752067,
+      "loss": 0.6477,
+      "step": 220
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.2297278195619583,
+      "learning_rate": 0.00024768595041322315,
+      "loss": 0.5895,
+      "step": 221
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.23085851967334747,
+      "learning_rate": 0.0002474380165289256,
+      "loss": 0.5806,
+      "step": 222
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.19654251635074615,
+      "learning_rate": 0.00024719008264462806,
+      "loss": 0.5942,
+      "step": 223
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.2467166632413864,
+      "learning_rate": 0.00024694214876033055,
+      "loss": 0.5059,
+      "step": 224
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.22614917159080505,
+      "learning_rate": 0.00024669421487603303,
+      "loss": 0.643,
+      "step": 225
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.2622920274734497,
+      "learning_rate": 0.0002464462809917355,
+      "loss": 0.6257,
+      "step": 226
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.21843163669109344,
+      "learning_rate": 0.000246198347107438,
+      "loss": 0.6057,
+      "step": 227
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.2294640988111496,
+      "learning_rate": 0.0002459504132231405,
+      "loss": 0.6876,
+      "step": 228
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.1791463941335678,
+      "learning_rate": 0.00024570247933884296,
+      "loss": 0.5348,
+      "step": 229
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.17243699729442596,
+      "learning_rate": 0.00024545454545454545,
+      "loss": 0.5966,
+      "step": 230
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.22769273817539215,
+      "learning_rate": 0.0002452066115702479,
+      "loss": 0.7912,
+      "step": 231
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.2325255423784256,
+      "learning_rate": 0.0002449586776859504,
+      "loss": 0.7441,
+      "step": 232
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.24277740716934204,
+      "learning_rate": 0.0002447107438016529,
+      "loss": 0.6653,
+      "step": 233
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.21596141159534454,
+      "learning_rate": 0.0002444628099173554,
+      "loss": 0.6668,
+      "step": 234
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.20814135670661926,
+      "learning_rate": 0.0002442148760330578,
+      "loss": 0.6306,
+      "step": 235
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.25570017099380493,
+      "learning_rate": 0.0002439669421487603,
+      "loss": 0.6524,
+      "step": 236
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.2502390146255493,
+      "learning_rate": 0.00024371900826446278,
+      "loss": 0.6048,
+      "step": 237
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.23688243329524994,
+      "learning_rate": 0.0002434710743801653,
+      "loss": 0.568,
+      "step": 238
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.21041709184646606,
+      "learning_rate": 0.00024322314049586777,
+      "loss": 0.6908,
+      "step": 239
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.21656759083271027,
+      "learning_rate": 0.00024297520661157023,
+      "loss": 0.4993,
+      "step": 240
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.25133028626441956,
+      "learning_rate": 0.0002427272727272727,
+      "loss": 0.718,
+      "step": 241
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.22228790819644928,
+      "learning_rate": 0.0002424793388429752,
+      "loss": 0.6146,
+      "step": 242
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.26273205876350403,
+      "learning_rate": 0.00024223140495867768,
+      "loss": 0.7459,
+      "step": 243
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.2156606763601303,
+      "learning_rate": 0.00024198347107438014,
+      "loss": 0.6692,
+      "step": 244
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.2075020670890808,
+      "learning_rate": 0.00024173553719008262,
+      "loss": 0.6427,
+      "step": 245
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.25821176171302795,
+      "learning_rate": 0.0002414876033057851,
+      "loss": 0.7964,
+      "step": 246
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.23016126453876495,
+      "learning_rate": 0.0002412396694214876,
+      "loss": 0.536,
+      "step": 247
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.23115016520023346,
+      "learning_rate": 0.00024099173553719004,
+      "loss": 0.6053,
+      "step": 248
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.18249157071113586,
+      "learning_rate": 0.00024074380165289253,
+      "loss": 0.6574,
+      "step": 249
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.28391778469085693,
+      "learning_rate": 0.000240495867768595,
+      "loss": 0.7152,
+      "step": 250
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.2581539452075958,
+      "learning_rate": 0.0002402479338842975,
+      "loss": 0.8476,
+      "step": 251
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.2304867058992386,
+      "learning_rate": 0.00023999999999999998,
+      "loss": 0.5781,
+      "step": 252
+    },
+    {
+      "epoch": 0.41,
+      "grad_norm": 0.239717036485672,
+      "learning_rate": 0.00023975206611570244,
+      "loss": 0.6543,
+      "step": 253
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.22493794560432434,
+      "learning_rate": 0.00023950413223140495,
+      "loss": 0.7048,
+      "step": 254
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.22085991501808167,
+      "learning_rate": 0.00023925619834710743,
+      "loss": 0.5572,
+      "step": 255
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.35917988419532776,
+      "learning_rate": 0.0002390082644628099,
+      "loss": 0.8485,
+      "step": 256
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.28269943594932556,
+      "learning_rate": 0.00023876033057851237,
+      "loss": 0.5732,
+      "step": 257
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.26313093304634094,
+      "learning_rate": 0.00023851239669421485,
+      "loss": 0.8212,
+      "step": 258
+    },
+    {
+      "epoch": 0.42,
+      "grad_norm": 0.30286532640457153,
+      "learning_rate": 0.00023826446280991734,
+      "loss": 0.5878,
+      "step": 259
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.22270837426185608,
+      "learning_rate": 0.00023801652892561982,
+      "loss": 0.6933,
+      "step": 260
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.29011014103889465,
+      "learning_rate": 0.0002377685950413223,
+      "loss": 0.6188,
+      "step": 261
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.2390982061624527,
+      "learning_rate": 0.00023752066115702476,
+      "loss": 0.6426,
+      "step": 262
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.3416346609592438,
+      "learning_rate": 0.00023727272727272724,
+      "loss": 0.8845,
+      "step": 263
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.25051388144493103,
+      "learning_rate": 0.00023702479338842973,
+      "loss": 0.7286,
+      "step": 264
+    },
+    {
+      "epoch": 0.43,
+      "grad_norm": 0.2497546523809433,
+      "learning_rate": 0.0002367768595041322,
+      "loss": 0.6027,
+      "step": 265
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.23835037648677826,
+      "learning_rate": 0.00023652892561983467,
+      "loss": 0.7052,
+      "step": 266
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.22467398643493652,
+      "learning_rate": 0.00023628099173553715,
+      "loss": 0.5806,
+      "step": 267
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.2663390338420868,
+      "learning_rate": 0.00023603305785123964,
+      "loss": 0.6943,
+      "step": 268
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.22997191548347473,
+      "learning_rate": 0.00023578512396694215,
+      "loss": 0.6411,
+      "step": 269
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.23266558349132538,
+      "learning_rate": 0.00023553719008264463,
+      "loss": 0.6068,
+      "step": 270
+    },
+    {
+      "epoch": 0.44,
+      "grad_norm": 0.2304474264383316,
+      "learning_rate": 0.00023528925619834709,
+      "loss": 0.6427,
+      "step": 271
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.28231826424598694,
+      "learning_rate": 0.00023504132231404957,
+      "loss": 0.8011,
+      "step": 272
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.28013259172439575,
+      "learning_rate": 0.00023479338842975205,
+      "loss": 0.5988,
+      "step": 273
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.22702372074127197,
+      "learning_rate": 0.00023454545454545454,
+      "loss": 0.6737,
+      "step": 274
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.27958643436431885,
+      "learning_rate": 0.000234297520661157,
+      "loss": 0.6621,
+      "step": 275
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.23902451992034912,
+      "learning_rate": 0.00023404958677685948,
+      "loss": 0.6525,
+      "step": 276
+    },
+    {
+      "epoch": 0.45,
+      "grad_norm": 0.2778523564338684,
+      "learning_rate": 0.00023380165289256196,
+      "loss": 0.6697,
+      "step": 277
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.2382276952266693,
+      "learning_rate": 0.00023355371900826444,
+      "loss": 0.6281,
+      "step": 278
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.24487091600894928,
+      "learning_rate": 0.00023330578512396693,
+      "loss": 0.6842,
+      "step": 279
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.2063397765159607,
+      "learning_rate": 0.00023305785123966938,
+      "loss": 0.6554,
+      "step": 280
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.21523278951644897,
+      "learning_rate": 0.00023280991735537187,
+      "loss": 0.632,
+      "step": 281
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.2420080006122589,
+      "learning_rate": 0.00023256198347107435,
+      "loss": 0.6001,
+      "step": 282
+    },
+    {
+      "epoch": 0.46,
+      "grad_norm": 0.2390110194683075,
+      "learning_rate": 0.00023231404958677686,
+      "loss": 0.5648,
+      "step": 283
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.24080687761306763,
+      "learning_rate": 0.0002320661157024793,
+      "loss": 0.86,
+      "step": 284
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.29456445574760437,
+      "learning_rate": 0.0002318181818181818,
+      "loss": 0.7418,
+      "step": 285
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.23326683044433594,
+      "learning_rate": 0.00023157024793388429,
+      "loss": 0.6967,
+      "step": 286
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.20866093039512634,
+      "learning_rate": 0.00023132231404958677,
+      "loss": 0.5205,
+      "step": 287
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.3158474266529083,
+      "learning_rate": 0.00023107438016528925,
+      "loss": 0.7879,
+      "step": 288
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.2730140686035156,
+      "learning_rate": 0.0002308264462809917,
+      "loss": 0.7292,
+      "step": 289
+    },
+    {
+      "epoch": 0.47,
+      "grad_norm": 0.25384965538978577,
+      "learning_rate": 0.0002305785123966942,
+      "loss": 0.7258,
+      "step": 290
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.20765069127082825,
+      "learning_rate": 0.00023033057851239668,
+      "loss": 0.7108,
+      "step": 291
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.25662195682525635,
+      "learning_rate": 0.00023008264462809916,
+      "loss": 0.7473,
+      "step": 292
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.300243616104126,
+      "learning_rate": 0.00022983471074380162,
+      "loss": 0.6902,
+      "step": 293
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.23513919115066528,
+      "learning_rate": 0.0002295867768595041,
+      "loss": 0.5888,
+      "step": 294
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.2077571451663971,
+      "learning_rate": 0.00022933884297520658,
+      "loss": 0.6256,
+      "step": 295
+    },
+    {
+      "epoch": 0.48,
+      "grad_norm": 0.266201376914978,
+      "learning_rate": 0.00022909090909090907,
+      "loss": 0.6913,
+      "step": 296
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.2239614725112915,
+      "learning_rate": 0.00022884297520661152,
+      "loss": 0.7369,
+      "step": 297
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.21509824693202972,
+      "learning_rate": 0.000228595041322314,
+      "loss": 0.4445,
+      "step": 298
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.21956239640712738,
+      "learning_rate": 0.00022834710743801652,
+      "loss": 0.6732,
+      "step": 299
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.18832357227802277,
+      "learning_rate": 0.000228099173553719,
+      "loss": 0.6808,
+      "step": 300
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.21115505695343018,
+      "learning_rate": 0.0002278512396694215,
+      "loss": 0.5323,
+      "step": 301
+    },
+    {
+      "epoch": 0.49,
+      "grad_norm": 0.23715418577194214,
+      "learning_rate": 0.00022760330578512394,
+      "loss": 0.8333,
+      "step": 302
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.29385048151016235,
+      "learning_rate": 0.00022735537190082643,
+      "loss": 0.6,
+      "step": 303
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.26947689056396484,
+      "learning_rate": 0.0002271074380165289,
+      "loss": 0.8788,
+      "step": 304
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.2778269946575165,
+      "learning_rate": 0.0002268595041322314,
+      "loss": 0.7073,
+      "step": 305
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.20938479900360107,
+      "learning_rate": 0.00022661157024793385,
+      "loss": 0.6422,
+      "step": 306
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.2777106761932373,
+      "learning_rate": 0.00022636363636363633,
+      "loss": 0.7495,
+      "step": 307
+    },
+    {
+      "epoch": 0.5,
+      "grad_norm": 0.20872819423675537,
+      "learning_rate": 0.00022611570247933882,
+      "loss": 0.6492,
+      "step": 308
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.2752722501754761,
+      "learning_rate": 0.0002258677685950413,
+      "loss": 0.6014,
+      "step": 309
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.24615786969661713,
+      "learning_rate": 0.00022561983471074378,
+      "loss": 0.6287,
+      "step": 310
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.24146385490894318,
+      "learning_rate": 0.00022537190082644624,
+      "loss": 0.6151,
+      "step": 311
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.24762235581874847,
+      "learning_rate": 0.00022512396694214872,
+      "loss": 0.6377,
+      "step": 312
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.24630331993103027,
+      "learning_rate": 0.00022487603305785124,
+      "loss": 0.7255,
+      "step": 313
+    },
+    {
+      "epoch": 0.51,
+      "grad_norm": 0.2922554612159729,
+      "learning_rate": 0.00022462809917355372,
+      "loss": 0.6645,
+      "step": 314
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.21686063706874847,
+      "learning_rate": 0.00022438016528925618,
+      "loss": 0.5606,
+      "step": 315
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.2216208428144455,
+      "learning_rate": 0.00022413223140495866,
+      "loss": 0.5126,
+      "step": 316
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.25635436177253723,
+      "learning_rate": 0.00022388429752066114,
+      "loss": 0.7387,
+      "step": 317
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.2786000669002533,
+      "learning_rate": 0.00022363636363636363,
+      "loss": 0.5941,
+      "step": 318
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.26092806458473206,
+      "learning_rate": 0.0002233884297520661,
+      "loss": 0.7851,
+      "step": 319
+    },
+    {
+      "epoch": 0.52,
+      "grad_norm": 0.23881889879703522,
+      "learning_rate": 0.00022314049586776857,
+      "loss": 0.598,
+      "step": 320
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.23304526507854462,
+      "learning_rate": 0.00022289256198347105,
+      "loss": 0.7165,
+      "step": 321
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.2340225875377655,
+      "learning_rate": 0.00022264462809917353,
+      "loss": 0.6608,
+      "step": 322
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.31176140904426575,
+      "learning_rate": 0.00022239669421487602,
+      "loss": 0.6711,
+      "step": 323
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.23832640051841736,
+      "learning_rate": 0.00022214876033057847,
+      "loss": 0.732,
+      "step": 324
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.28845977783203125,
+      "learning_rate": 0.00022190082644628096,
+      "loss": 0.7968,
+      "step": 325
+    },
+    {
+      "epoch": 0.53,
+      "grad_norm": 0.1978536993265152,
+      "learning_rate": 0.00022165289256198344,
+      "loss": 0.6592,
+      "step": 326
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.26940053701400757,
+      "learning_rate": 0.00022140495867768595,
+      "loss": 0.7953,
+      "step": 327
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.20393389463424683,
+      "learning_rate": 0.00022115702479338844,
+      "loss": 0.4871,
+      "step": 328
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.27152347564697266,
+      "learning_rate": 0.0002209090909090909,
+      "loss": 0.5583,
+      "step": 329
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.2883144021034241,
+      "learning_rate": 0.00022066115702479338,
+      "loss": 0.6156,
+      "step": 330
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.1987351030111313,
+      "learning_rate": 0.00022041322314049586,
+      "loss": 0.5196,
+      "step": 331
+    },
+    {
+      "epoch": 0.54,
+      "grad_norm": 0.2651583254337311,
+      "learning_rate": 0.00022016528925619834,
+      "loss": 0.6099,
+      "step": 332
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.2574511468410492,
+      "learning_rate": 0.0002199173553719008,
+      "loss": 0.6925,
+      "step": 333
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.27730292081832886,
+      "learning_rate": 0.00021966942148760328,
+      "loss": 0.6752,
+      "step": 334
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.2001207172870636,
+      "learning_rate": 0.00021942148760330577,
+      "loss": 0.75,
+      "step": 335
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.24222363531589508,
+      "learning_rate": 0.00021917355371900825,
+      "loss": 0.6364,
+      "step": 336
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.26326724886894226,
+      "learning_rate": 0.0002189256198347107,
+      "loss": 0.673,
+      "step": 337
+    },
+    {
+      "epoch": 0.55,
+      "grad_norm": 0.2272881418466568,
+      "learning_rate": 0.0002186776859504132,
+      "loss": 0.561,
+      "step": 338
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.24880024790763855,
+      "learning_rate": 0.00021842975206611567,
+      "loss": 0.5552,
+      "step": 339
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.2593706548213959,
+      "learning_rate": 0.00021818181818181816,
+      "loss": 0.5417,
+      "step": 340
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.19063642621040344,
+      "learning_rate": 0.00021793388429752067,
+      "loss": 0.5694,
+      "step": 341
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.2146475464105606,
+      "learning_rate": 0.0002176859504132231,
+      "loss": 0.4314,
+      "step": 342
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.25150927901268005,
+      "learning_rate": 0.0002174380165289256,
+      "loss": 0.631,
+      "step": 343
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.2753889858722687,
+      "learning_rate": 0.0002171900826446281,
+      "loss": 0.6859,
+      "step": 344
+    },
+    {
+      "epoch": 0.56,
+      "grad_norm": 0.20773079991340637,
+      "learning_rate": 0.00021694214876033058,
+      "loss": 0.7515,
+      "step": 345
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.2547062635421753,
+      "learning_rate": 0.00021669421487603303,
+      "loss": 0.7582,
+      "step": 346
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.24687208235263824,
+      "learning_rate": 0.00021644628099173552,
+      "loss": 0.5865,
+      "step": 347
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.24116279184818268,
+      "learning_rate": 0.000216198347107438,
+      "loss": 0.4841,
+      "step": 348
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.2270282804965973,
+      "learning_rate": 0.00021595041322314048,
+      "loss": 0.5933,
+      "step": 349
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.21436922252178192,
+      "learning_rate": 0.00021570247933884297,
+      "loss": 0.6959,
+      "step": 350
+    },
+    {
+      "epoch": 0.57,
+      "grad_norm": 0.25802701711654663,
+      "learning_rate": 0.00021545454545454542,
+      "loss": 0.729,
+      "step": 351
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.23808260262012482,
+      "learning_rate": 0.0002152066115702479,
+      "loss": 0.6346,
+      "step": 352
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.23161651194095612,
+      "learning_rate": 0.0002149586776859504,
+      "loss": 0.6459,
+      "step": 353
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.2442287802696228,
+      "learning_rate": 0.00021471074380165287,
+      "loss": 0.6803,
+      "step": 354
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.19150683283805847,
+      "learning_rate": 0.00021446280991735533,
+      "loss": 0.4375,
+      "step": 355
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.23142127692699432,
+      "learning_rate": 0.00021421487603305781,
+      "loss": 0.5505,
+      "step": 356
+    },
+    {
+      "epoch": 0.58,
+      "grad_norm": 0.22447548806667328,
+      "learning_rate": 0.00021396694214876033,
+      "loss": 0.6368,
+      "step": 357
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.25168758630752563,
+      "learning_rate": 0.0002137190082644628,
+      "loss": 0.6322,
+      "step": 358
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.25538235902786255,
+      "learning_rate": 0.0002134710743801653,
+      "loss": 0.5317,
+      "step": 359
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.2565425634384155,
+      "learning_rate": 0.00021322314049586775,
+      "loss": 0.6261,
+      "step": 360
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.25399863719940186,
+      "learning_rate": 0.00021297520661157023,
+      "loss": 0.596,
+      "step": 361
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.27143988013267517,
+      "learning_rate": 0.00021272727272727272,
+      "loss": 0.6691,
+      "step": 362
+    },
+    {
+      "epoch": 0.59,
+      "grad_norm": 0.2387736439704895,
+      "learning_rate": 0.0002124793388429752,
+      "loss": 0.5288,
+      "step": 363
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.2549780607223511,
+      "learning_rate": 0.00021223140495867766,
+      "loss": 0.7455,
+      "step": 364
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.2740858793258667,
+      "learning_rate": 0.00021198347107438014,
+      "loss": 0.4921,
+      "step": 365
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.25273847579956055,
+      "learning_rate": 0.00021173553719008262,
+      "loss": 0.7965,
+      "step": 366
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.25858959555625916,
+      "learning_rate": 0.0002114876033057851,
+      "loss": 0.7303,
+      "step": 367
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.2599296271800995,
+      "learning_rate": 0.0002112396694214876,
+      "loss": 0.6342,
+      "step": 368
+    },
+    {
+      "epoch": 0.6,
+      "grad_norm": 0.21084599196910858,
+      "learning_rate": 0.00021099173553719005,
+      "loss": 0.633,
+      "step": 369
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.24272632598876953,
+      "learning_rate": 0.00021074380165289253,
+      "loss": 0.6213,
+      "step": 370
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.26323699951171875,
+      "learning_rate": 0.00021049586776859501,
+      "loss": 0.563,
+      "step": 371
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.20646587014198303,
+      "learning_rate": 0.00021024793388429753,
+      "loss": 0.6248,
+      "step": 372
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.21778297424316406,
+      "learning_rate": 0.00020999999999999998,
+      "loss": 0.7186,
+      "step": 373
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.21315112709999084,
+      "learning_rate": 0.00020975206611570247,
+      "loss": 0.5961,
+      "step": 374
+    },
+    {
+      "epoch": 0.61,
+      "grad_norm": 0.20787106454372406,
+      "learning_rate": 0.00020950413223140495,
+      "loss": 0.5917,
+      "step": 375
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.23541009426116943,
+      "learning_rate": 0.00020925619834710743,
+      "loss": 0.7803,
+      "step": 376
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.22649626433849335,
+      "learning_rate": 0.00020900826446280992,
+      "loss": 0.5895,
+      "step": 377
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.23644742369651794,
+      "learning_rate": 0.00020876033057851237,
+      "loss": 0.6656,
+      "step": 378
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.22934262454509735,
+      "learning_rate": 0.00020851239669421486,
+      "loss": 0.5933,
+      "step": 379
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.289989709854126,
+      "learning_rate": 0.00020826446280991734,
+      "loss": 0.6852,
+      "step": 380
+    },
+    {
+      "epoch": 0.62,
+      "grad_norm": 0.24489325284957886,
+      "learning_rate": 0.00020801652892561982,
+      "loss": 0.5546,
+      "step": 381
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.27165278792381287,
+      "learning_rate": 0.00020776859504132228,
+      "loss": 0.6845,
+      "step": 382
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.19467370212078094,
+      "learning_rate": 0.00020752066115702476,
+      "loss": 0.5587,
+      "step": 383
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.27320200204849243,
+      "learning_rate": 0.00020727272727272725,
+      "loss": 0.7144,
+      "step": 384
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.28100526332855225,
+      "learning_rate": 0.00020702479338842973,
+      "loss": 0.6914,
+      "step": 385
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.3059975504875183,
+      "learning_rate": 0.0002067768595041322,
+      "loss": 0.6075,
+      "step": 386
+    },
+    {
+      "epoch": 0.63,
+      "grad_norm": 0.24904222786426544,
+      "learning_rate": 0.00020652892561983467,
+      "loss": 0.5543,
+      "step": 387
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.24768255650997162,
+      "learning_rate": 0.00020628099173553718,
+      "loss": 0.607,
+      "step": 388
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.25083738565444946,
+      "learning_rate": 0.00020603305785123967,
+      "loss": 0.7961,
+      "step": 389
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.26338303089141846,
+      "learning_rate": 0.00020578512396694215,
+      "loss": 0.6467,
+      "step": 390
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.25761598348617554,
+      "learning_rate": 0.0002055371900826446,
+      "loss": 0.5891,
+      "step": 391
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.2616937756538391,
+      "learning_rate": 0.0002052892561983471,
+      "loss": 0.5706,
+      "step": 392
+    },
+    {
+      "epoch": 0.64,
+      "grad_norm": 0.18980839848518372,
+      "learning_rate": 0.00020504132231404957,
+      "loss": 0.4479,
+      "step": 393
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.250431627035141,
+      "learning_rate": 0.00020479338842975206,
+      "loss": 0.6006,
+      "step": 394
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.2146655172109604,
+      "learning_rate": 0.0002045454545454545,
+      "loss": 0.7113,
+      "step": 395
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.2195209115743637,
+      "learning_rate": 0.000204297520661157,
+      "loss": 0.5354,
+      "step": 396
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.24879257380962372,
+      "learning_rate": 0.00020404958677685948,
+      "loss": 0.5478,
+      "step": 397
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.27159082889556885,
+      "learning_rate": 0.00020380165289256196,
+      "loss": 0.7681,
+      "step": 398
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.20614947378635406,
+      "learning_rate": 0.00020355371900826445,
+      "loss": 0.6357,
+      "step": 399
+    },
+    {
+      "epoch": 0.65,
+      "grad_norm": 0.25690051913261414,
+      "learning_rate": 0.0002033057851239669,
+      "loss": 0.5731,
+      "step": 400
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.24473583698272705,
+      "learning_rate": 0.0002030578512396694,
+      "loss": 0.6784,
+      "step": 401
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.32395297288894653,
+      "learning_rate": 0.0002028099173553719,
+      "loss": 0.7118,
+      "step": 402
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.2975274324417114,
+      "learning_rate": 0.00020256198347107438,
+      "loss": 0.6504,
+      "step": 403
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.2652553915977478,
+      "learning_rate": 0.00020231404958677684,
+      "loss": 0.6986,
+      "step": 404
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.29475778341293335,
+      "learning_rate": 0.00020206611570247932,
+      "loss": 0.6525,
+      "step": 405
+    },
+    {
+      "epoch": 0.66,
+      "grad_norm": 0.24549973011016846,
+      "learning_rate": 0.0002018181818181818,
+      "loss": 0.5408,
+      "step": 406
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.2181435376405716,
+      "learning_rate": 0.0002015702479338843,
+      "loss": 0.6146,
+      "step": 407
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.2682584226131439,
+      "learning_rate": 0.00020132231404958677,
+      "loss": 0.6368,
+      "step": 408
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.2641114592552185,
+      "learning_rate": 0.00020107438016528923,
+      "loss": 0.51,
+      "step": 409
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.27871838212013245,
+      "learning_rate": 0.0002008264462809917,
+      "loss": 0.7269,
+      "step": 410
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.23890569806098938,
+      "learning_rate": 0.0002005785123966942,
+      "loss": 0.6444,
+      "step": 411
+    },
+    {
+      "epoch": 0.67,
+      "grad_norm": 0.2451583445072174,
+      "learning_rate": 0.00020033057851239668,
+      "loss": 0.5806,
+      "step": 412
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.2743864953517914,
+      "learning_rate": 0.00020008264462809914,
+      "loss": 0.6305,
+      "step": 413
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.2626914978027344,
+      "learning_rate": 0.00019983471074380162,
+      "loss": 0.5765,
+      "step": 414
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.2874875068664551,
+      "learning_rate": 0.0001995867768595041,
+      "loss": 0.5928,
+      "step": 415
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.30499163269996643,
+      "learning_rate": 0.00019933884297520661,
+      "loss": 0.6271,
+      "step": 416
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.30474454164505005,
+      "learning_rate": 0.0001990909090909091,
+      "loss": 0.6755,
+      "step": 417
+    },
+    {
+      "epoch": 0.68,
+      "grad_norm": 0.1819755882024765,
+      "learning_rate": 0.00019884297520661155,
+      "loss": 0.394,
+      "step": 418
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.25470343232154846,
+      "learning_rate": 0.00019859504132231404,
+      "loss": 0.7121,
+      "step": 419
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.26749151945114136,
+      "learning_rate": 0.00019834710743801652,
+      "loss": 0.6487,
+      "step": 420
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.20643912255764008,
+      "learning_rate": 0.000198099173553719,
+      "loss": 0.4585,
+      "step": 421
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.2576930522918701,
+      "learning_rate": 0.00019785123966942146,
+      "loss": 0.5235,
+      "step": 422
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.2899012863636017,
+      "learning_rate": 0.00019760330578512395,
+      "loss": 0.6292,
+      "step": 423
+    },
+    {
+      "epoch": 0.69,
+      "grad_norm": 0.2541065216064453,
+      "learning_rate": 0.00019735537190082643,
+      "loss": 0.648,
+      "step": 424
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.24382047355175018,
+      "learning_rate": 0.0001971074380165289,
+      "loss": 0.5939,
+      "step": 425
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.22931940853595734,
+      "learning_rate": 0.00019685950413223137,
+      "loss": 0.6812,
+      "step": 426
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.2592567205429077,
+      "learning_rate": 0.00019661157024793385,
+      "loss": 0.69,
+      "step": 427
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.2516980767250061,
+      "learning_rate": 0.00019636363636363634,
+      "loss": 0.5707,
+      "step": 428
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.23515059053897858,
+      "learning_rate": 0.00019611570247933882,
+      "loss": 0.6739,
+      "step": 429
+    },
+    {
+      "epoch": 0.7,
+      "grad_norm": 0.24742184579372406,
+      "learning_rate": 0.00019586776859504133,
+      "loss": 0.6761,
+      "step": 430
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.26232922077178955,
+      "learning_rate": 0.00019561983471074376,
+      "loss": 0.7071,
+      "step": 431
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.2853042781352997,
+      "learning_rate": 0.00019537190082644627,
+      "loss": 0.7667,
+      "step": 432
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.251169353723526,
+      "learning_rate": 0.00019512396694214875,
+      "loss": 0.6518,
+      "step": 433
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.2321665734052658,
+      "learning_rate": 0.00019487603305785124,
+      "loss": 0.4377,
+      "step": 434
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.25216928124427795,
+      "learning_rate": 0.0001946280991735537,
+      "loss": 0.7173,
+      "step": 435
+    },
+    {
+      "epoch": 0.71,
+      "grad_norm": 0.19498330354690552,
+      "learning_rate": 0.00019438016528925618,
+      "loss": 0.5584,
+      "step": 436
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.32786309719085693,
+      "learning_rate": 0.00019413223140495866,
+      "loss": 0.6583,
+      "step": 437
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.25834760069847107,
+      "learning_rate": 0.00019388429752066115,
+      "loss": 0.4957,
+      "step": 438
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.3462083041667938,
+      "learning_rate": 0.00019363636363636363,
+      "loss": 0.5205,
+      "step": 439
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.27106693387031555,
+      "learning_rate": 0.00019338842975206609,
+      "loss": 0.6803,
+      "step": 440
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.28165388107299805,
+      "learning_rate": 0.00019314049586776857,
+      "loss": 0.7049,
+      "step": 441
+    },
+    {
+      "epoch": 0.72,
+      "grad_norm": 0.20732273161411285,
+      "learning_rate": 0.00019289256198347105,
+      "loss": 0.6407,
+      "step": 442
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.2609116733074188,
+      "learning_rate": 0.00019264462809917354,
+      "loss": 0.5377,
+      "step": 443
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.2561998963356018,
+      "learning_rate": 0.000192396694214876,
+      "loss": 0.6212,
+      "step": 444
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.27699044346809387,
+      "learning_rate": 0.00019214876033057848,
+      "loss": 0.5482,
+      "step": 445
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.2426328808069229,
+      "learning_rate": 0.000191900826446281,
+      "loss": 0.6444,
+      "step": 446
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.26187026500701904,
+      "learning_rate": 0.00019165289256198347,
+      "loss": 0.5443,
+      "step": 447
+    },
+    {
+      "epoch": 0.73,
+      "grad_norm": 0.2719630002975464,
+      "learning_rate": 0.00019140495867768595,
+      "loss": 0.6886,
+      "step": 448
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.18477971851825714,
+      "learning_rate": 0.0001911570247933884,
+      "loss": 0.5292,
+      "step": 449
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.2144313007593155,
+      "learning_rate": 0.0001909090909090909,
+      "loss": 0.4613,
+      "step": 450
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.2580784857273102,
+      "learning_rate": 0.00019066115702479338,
+      "loss": 0.5606,
+      "step": 451
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.3073588013648987,
+      "learning_rate": 0.00019041322314049586,
+      "loss": 0.6123,
+      "step": 452
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.21787844598293304,
+      "learning_rate": 0.00019016528925619832,
+      "loss": 0.5939,
+      "step": 453
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.255750447511673,
+      "learning_rate": 0.0001899173553719008,
+      "loss": 0.5739,
+      "step": 454
+    },
+    {
+      "epoch": 0.74,
+      "grad_norm": 0.24147820472717285,
+      "learning_rate": 0.00018966942148760329,
+      "loss": 0.6026,
+      "step": 455
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.26172590255737305,
+      "learning_rate": 0.00018942148760330577,
+      "loss": 0.5166,
+      "step": 456
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.2710455358028412,
+      "learning_rate": 0.00018917355371900825,
+      "loss": 0.6429,
+      "step": 457
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.1971074640750885,
+      "learning_rate": 0.0001889256198347107,
+      "loss": 0.4799,
+      "step": 458
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.23394368588924408,
+      "learning_rate": 0.0001886776859504132,
+      "loss": 0.5491,
+      "step": 459
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.22820048034191132,
+      "learning_rate": 0.0001884297520661157,
+      "loss": 0.5343,
+      "step": 460
+    },
+    {
+      "epoch": 0.75,
+      "grad_norm": 0.23169974982738495,
+      "learning_rate": 0.0001881818181818182,
+      "loss": 0.5852,
+      "step": 461
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.24015003442764282,
+      "learning_rate": 0.00018793388429752064,
+      "loss": 0.6209,
+      "step": 462
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.2230776697397232,
+      "learning_rate": 0.00018768595041322313,
+      "loss": 0.6296,
+      "step": 463
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.2518354654312134,
+      "learning_rate": 0.0001874380165289256,
+      "loss": 0.6167,
+      "step": 464
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.338256299495697,
+      "learning_rate": 0.0001871900826446281,
+      "loss": 0.6512,
+      "step": 465
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.23796728253364563,
+      "learning_rate": 0.00018694214876033055,
+      "loss": 0.8155,
+      "step": 466
+    },
+    {
+      "epoch": 0.76,
+      "grad_norm": 0.31516361236572266,
+      "learning_rate": 0.00018669421487603303,
+      "loss": 0.8023,
+      "step": 467
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.2371574491262436,
+      "learning_rate": 0.00018644628099173552,
+      "loss": 0.5613,
+      "step": 468
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.2822033762931824,
+      "learning_rate": 0.000186198347107438,
+      "loss": 0.5549,
+      "step": 469
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.25953295826911926,
+      "learning_rate": 0.00018595041322314049,
+      "loss": 0.6199,
+      "step": 470
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.2478639930486679,
+      "learning_rate": 0.00018570247933884294,
+      "loss": 0.5806,
+      "step": 471
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.2439350187778473,
+      "learning_rate": 0.00018545454545454543,
+      "loss": 0.6222,
+      "step": 472
+    },
+    {
+      "epoch": 0.77,
+      "grad_norm": 0.24993474781513214,
+      "learning_rate": 0.0001852066115702479,
+      "loss": 0.6048,
+      "step": 473
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.24781496822834015,
+      "learning_rate": 0.00018495867768595042,
+      "loss": 0.5941,
+      "step": 474
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.1847202032804489,
+      "learning_rate": 0.00018471074380165285,
+      "loss": 0.609,
+      "step": 475
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.21596528589725494,
+      "learning_rate": 0.00018446280991735536,
+      "loss": 0.4457,
+      "step": 476
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.240879625082016,
+      "learning_rate": 0.00018421487603305784,
+      "loss": 0.6118,
+      "step": 477
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.2898111641407013,
+      "learning_rate": 0.00018396694214876033,
+      "loss": 0.7725,
+      "step": 478
+    },
+    {
+      "epoch": 0.78,
+      "grad_norm": 0.27428382635116577,
+      "learning_rate": 0.0001837190082644628,
+      "loss": 0.5366,
+      "step": 479
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.23467296361923218,
+      "learning_rate": 0.00018347107438016527,
+      "loss": 0.6018,
+      "step": 480
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.2190561592578888,
+      "learning_rate": 0.00018322314049586775,
+      "loss": 0.5249,
+      "step": 481
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.2240625023841858,
+      "learning_rate": 0.00018297520661157024,
+      "loss": 0.6891,
+      "step": 482
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.24726848304271698,
+      "learning_rate": 0.00018272727272727272,
+      "loss": 0.5545,
+      "step": 483
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.3318251371383667,
+      "learning_rate": 0.00018247933884297518,
+      "loss": 0.4809,
+      "step": 484
+    },
+    {
+      "epoch": 0.79,
+      "grad_norm": 0.2396695613861084,
+      "learning_rate": 0.00018223140495867766,
+      "loss": 0.4942,
+      "step": 485
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.25009942054748535,
+      "learning_rate": 0.00018198347107438014,
+      "loss": 0.7381,
+      "step": 486
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.22655311226844788,
+      "learning_rate": 0.00018173553719008263,
+      "loss": 0.4729,
+      "step": 487
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.23187695443630219,
+      "learning_rate": 0.0001814876033057851,
+      "loss": 0.5719,
+      "step": 488
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.2703653573989868,
+      "learning_rate": 0.00018123966942148757,
+      "loss": 0.6031,
+      "step": 489
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.2207796424627304,
+      "learning_rate": 0.00018099173553719008,
+      "loss": 0.5361,
+      "step": 490
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.24914169311523438,
+      "learning_rate": 0.00018074380165289256,
+      "loss": 0.6547,
+      "step": 491
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.2714746594429016,
+      "learning_rate": 0.00018049586776859504,
+      "loss": 0.5702,
+      "step": 492
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.3201580047607422,
+      "learning_rate": 0.0001802479338842975,
+      "loss": 0.6119,
+      "step": 493
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.2548397183418274,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 0.5251,
+      "step": 494
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.28669115900993347,
+      "learning_rate": 0.00017975206611570247,
+      "loss": 0.5773,
+      "step": 495
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.26253971457481384,
+      "learning_rate": 0.00017950413223140495,
+      "loss": 0.6504,
+      "step": 496
+    },
+    {
+      "epoch": 0.81,
+      "grad_norm": 0.22113384306430817,
+      "learning_rate": 0.00017925619834710744,
+      "loss": 0.4741,
+      "step": 497
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.261636346578598,
+      "learning_rate": 0.0001790082644628099,
+      "loss": 0.6241,
+      "step": 498
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.1780402809381485,
+      "learning_rate": 0.00017876033057851238,
+      "loss": 0.5207,
+      "step": 499
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.26149195432662964,
+      "learning_rate": 0.00017851239669421486,
+      "loss": 0.5872,
+      "step": 500
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.26113009452819824,
+      "learning_rate": 0.00017826446280991734,
+      "loss": 0.6163,
+      "step": 501
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.21397502720355988,
+      "learning_rate": 0.0001780165289256198,
+      "loss": 0.479,
+      "step": 502
+    },
+    {
+      "epoch": 0.82,
+      "grad_norm": 0.21250088512897491,
+      "learning_rate": 0.00017776859504132228,
+      "loss": 0.6978,
+      "step": 503
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.2556426525115967,
+      "learning_rate": 0.00017752066115702477,
+      "loss": 0.6128,
+      "step": 504
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.24139715731143951,
+      "learning_rate": 0.00017727272727272728,
+      "loss": 0.5066,
+      "step": 505
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.23671215772628784,
+      "learning_rate": 0.00017702479338842976,
+      "loss": 0.5183,
+      "step": 506
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.23494285345077515,
+      "learning_rate": 0.00017677685950413222,
+      "loss": 0.5181,
+      "step": 507
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.2547609806060791,
+      "learning_rate": 0.0001765289256198347,
+      "loss": 0.5406,
+      "step": 508
+    },
+    {
+      "epoch": 0.83,
+      "grad_norm": 0.3042651414871216,
+      "learning_rate": 0.00017628099173553718,
+      "loss": 0.5551,
+      "step": 509
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.22910748422145844,
+      "learning_rate": 0.00017603305785123967,
+      "loss": 0.6373,
+      "step": 510
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.19777967035770416,
+      "learning_rate": 0.00017578512396694212,
+      "loss": 0.5471,
+      "step": 511
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.31034502387046814,
+      "learning_rate": 0.0001755371900826446,
+      "loss": 0.7017,
+      "step": 512
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.3504410684108734,
+      "learning_rate": 0.0001752892561983471,
+      "loss": 0.7208,
+      "step": 513
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.24271292984485626,
+      "learning_rate": 0.00017504132231404958,
+      "loss": 0.5563,
+      "step": 514
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.27147865295410156,
+      "learning_rate": 0.00017479338842975203,
+      "loss": 0.5869,
+      "step": 515
+    },
+    {
+      "epoch": 0.84,
+      "grad_norm": 0.2976628839969635,
+      "learning_rate": 0.00017454545454545452,
+      "loss": 0.5471,
+      "step": 516
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.28489646315574646,
+      "learning_rate": 0.000174297520661157,
+      "loss": 0.6053,
+      "step": 517
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.30020108819007874,
+      "learning_rate": 0.00017404958677685948,
+      "loss": 0.6178,
+      "step": 518
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.23986253142356873,
+      "learning_rate": 0.000173801652892562,
+      "loss": 0.5896,
+      "step": 519
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.2667832374572754,
+      "learning_rate": 0.00017355371900826442,
+      "loss": 0.5375,
+      "step": 520
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.22176356613636017,
+      "learning_rate": 0.00017330578512396693,
+      "loss": 0.5723,
+      "step": 521
+    },
+    {
+      "epoch": 0.85,
+      "grad_norm": 0.263257771730423,
+      "learning_rate": 0.00017305785123966942,
+      "loss": 0.7317,
+      "step": 522
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.24838753044605255,
+      "learning_rate": 0.0001728099173553719,
+      "loss": 0.5849,
+      "step": 523
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.24839664995670319,
+      "learning_rate": 0.00017256198347107436,
+      "loss": 0.6678,
+      "step": 524
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.2849573493003845,
+      "learning_rate": 0.00017231404958677684,
+      "loss": 0.7144,
+      "step": 525
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.26900768280029297,
+      "learning_rate": 0.00017206611570247932,
+      "loss": 0.5156,
+      "step": 526
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.2212425172328949,
+      "learning_rate": 0.0001718181818181818,
+      "loss": 0.4551,
+      "step": 527
+    },
+    {
+      "epoch": 0.86,
+      "grad_norm": 0.2066129595041275,
+      "learning_rate": 0.0001715702479338843,
+      "loss": 0.4193,
+      "step": 528
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.2838365137577057,
+      "learning_rate": 0.00017132231404958675,
+      "loss": 0.6078,
+      "step": 529
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.239679753780365,
+      "learning_rate": 0.00017107438016528923,
+      "loss": 0.616,
+      "step": 530
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.23269398510456085,
+      "learning_rate": 0.00017082644628099172,
+      "loss": 0.542,
+      "step": 531
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.23838558793067932,
+      "learning_rate": 0.0001705785123966942,
+      "loss": 0.5147,
+      "step": 532
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.2819415330886841,
+      "learning_rate": 0.00017033057851239666,
+      "loss": 0.6437,
+      "step": 533
+    },
+    {
+      "epoch": 0.87,
+      "grad_norm": 0.243398055434227,
+      "learning_rate": 0.00017008264462809914,
+      "loss": 0.6611,
+      "step": 534
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.22569122910499573,
+      "learning_rate": 0.00016983471074380165,
+      "loss": 0.3979,
+      "step": 535
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.33265820145606995,
+      "learning_rate": 0.00016958677685950413,
+      "loss": 0.6005,
+      "step": 536
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.26828673481941223,
+      "learning_rate": 0.00016933884297520662,
+      "loss": 0.608,
+      "step": 537
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.24439513683319092,
+      "learning_rate": 0.00016909090909090907,
+      "loss": 0.5572,
+      "step": 538
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.22491876780986786,
+      "learning_rate": 0.00016884297520661156,
+      "loss": 0.7226,
+      "step": 539
+    },
+    {
+      "epoch": 0.88,
+      "grad_norm": 0.24468480050563812,
+      "learning_rate": 0.00016859504132231404,
+      "loss": 0.4582,
+      "step": 540
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.23392945528030396,
+      "learning_rate": 0.00016834710743801652,
+      "loss": 0.6477,
+      "step": 541
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.27548858523368835,
+      "learning_rate": 0.00016809917355371898,
+      "loss": 0.5846,
+      "step": 542
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.2861180603504181,
+      "learning_rate": 0.00016785123966942146,
+      "loss": 0.6412,
+      "step": 543
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.24700766801834106,
+      "learning_rate": 0.00016760330578512395,
+      "loss": 0.6947,
+      "step": 544
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.2600953280925751,
+      "learning_rate": 0.00016735537190082643,
+      "loss": 0.6165,
+      "step": 545
+    },
+    {
+      "epoch": 0.89,
+      "grad_norm": 0.26876646280288696,
+      "learning_rate": 0.00016710743801652892,
+      "loss": 0.6855,
+      "step": 546
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.26161080598831177,
+      "learning_rate": 0.00016685950413223137,
+      "loss": 0.5066,
+      "step": 547
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.25190046429634094,
+      "learning_rate": 0.00016661157024793386,
+      "loss": 0.5902,
+      "step": 548
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.25269225239753723,
+      "learning_rate": 0.00016636363636363637,
+      "loss": 0.7017,
+      "step": 549
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.28042706847190857,
+      "learning_rate": 0.00016611570247933885,
+      "loss": 0.6264,
+      "step": 550
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.2767360508441925,
+      "learning_rate": 0.0001658677685950413,
+      "loss": 0.7562,
+      "step": 551
+    },
+    {
+      "epoch": 0.9,
+      "grad_norm": 0.2771216034889221,
+      "learning_rate": 0.0001656198347107438,
+      "loss": 0.5333,
+      "step": 552
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.189210906624794,
+      "learning_rate": 0.00016537190082644627,
+      "loss": 0.5378,
+      "step": 553
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.22517065703868866,
+      "learning_rate": 0.00016512396694214876,
+      "loss": 0.5292,
+      "step": 554
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.2390165776014328,
+      "learning_rate": 0.00016487603305785121,
+      "loss": 0.4407,
+      "step": 555
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.21548262238502502,
+      "learning_rate": 0.0001646280991735537,
+      "loss": 0.4504,
+      "step": 556
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.20831167697906494,
+      "learning_rate": 0.00016438016528925618,
+      "loss": 0.6848,
+      "step": 557
+    },
+    {
+      "epoch": 0.91,
+      "grad_norm": 0.271257609128952,
+      "learning_rate": 0.00016413223140495866,
+      "loss": 0.535,
+      "step": 558
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.32008254528045654,
+      "learning_rate": 0.00016388429752066115,
+      "loss": 0.5107,
+      "step": 559
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.34058302640914917,
+      "learning_rate": 0.0001636363636363636,
+      "loss": 0.5708,
+      "step": 560
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.28070059418678284,
+      "learning_rate": 0.0001633884297520661,
+      "loss": 0.5086,
+      "step": 561
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.25487688183784485,
+      "learning_rate": 0.00016314049586776857,
+      "loss": 0.5184,
+      "step": 562
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.3240332007408142,
+      "learning_rate": 0.00016289256198347108,
+      "loss": 0.6774,
+      "step": 563
+    },
+    {
+      "epoch": 0.92,
+      "grad_norm": 0.30744409561157227,
+      "learning_rate": 0.0001626446280991735,
+      "loss": 0.5314,
+      "step": 564
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.25220754742622375,
+      "learning_rate": 0.00016239669421487602,
+      "loss": 0.6308,
+      "step": 565
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.29116958379745483,
+      "learning_rate": 0.0001621487603305785,
+      "loss": 0.5685,
+      "step": 566
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.23250073194503784,
+      "learning_rate": 0.000161900826446281,
+      "loss": 0.4318,
+      "step": 567
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.2808091640472412,
+      "learning_rate": 0.00016165289256198347,
+      "loss": 0.6313,
+      "step": 568
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.2711193561553955,
+      "learning_rate": 0.00016140495867768593,
+      "loss": 0.4651,
+      "step": 569
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.29540935158729553,
+      "learning_rate": 0.00016115702479338841,
+      "loss": 0.6663,
+      "step": 570
+    },
+    {
+      "epoch": 0.93,
+      "grad_norm": 0.23418714106082916,
+      "learning_rate": 0.0001609090909090909,
+      "loss": 0.448,
+      "step": 571
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.21675793826580048,
+      "learning_rate": 0.00016066115702479338,
+      "loss": 0.5034,
+      "step": 572
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.22451865673065186,
+      "learning_rate": 0.00016041322314049584,
+      "loss": 0.4476,
+      "step": 573
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.26300856471061707,
+      "learning_rate": 0.00016016528925619832,
+      "loss": 0.6646,
+      "step": 574
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.3377116918563843,
+      "learning_rate": 0.0001599173553719008,
+      "loss": 0.6029,
+      "step": 575
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.23391880095005035,
+      "learning_rate": 0.0001596694214876033,
+      "loss": 0.6277,
+      "step": 576
+    },
+    {
+      "epoch": 0.94,
+      "grad_norm": 0.19620922207832336,
+      "learning_rate": 0.0001594214876033058,
+      "loss": 0.4638,
+      "step": 577
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.22981096804141998,
+      "learning_rate": 0.00015917355371900823,
+      "loss": 0.5826,
+      "step": 578
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.34321555495262146,
+      "learning_rate": 0.00015892561983471074,
+      "loss": 0.5618,
+      "step": 579
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.28461968898773193,
+      "learning_rate": 0.00015867768595041322,
+      "loss": 0.5129,
+      "step": 580
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.24368269741535187,
+      "learning_rate": 0.0001584297520661157,
+      "loss": 0.5866,
+      "step": 581
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.282255083322525,
+      "learning_rate": 0.00015818181818181816,
+      "loss": 0.6274,
+      "step": 582
+    },
+    {
+      "epoch": 0.95,
+      "grad_norm": 0.26298072934150696,
+      "learning_rate": 0.00015793388429752065,
+      "loss": 0.5187,
+      "step": 583
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.2671455144882202,
+      "learning_rate": 0.00015768595041322313,
+      "loss": 0.6878,
+      "step": 584
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.2681390643119812,
+      "learning_rate": 0.00015743801652892561,
+      "loss": 0.5469,
+      "step": 585
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.38484248518943787,
+      "learning_rate": 0.0001571900826446281,
+      "loss": 0.6364,
+      "step": 586
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.23353587090969086,
+      "learning_rate": 0.00015694214876033055,
+      "loss": 0.4844,
+      "step": 587
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.29452502727508545,
+      "learning_rate": 0.00015669421487603304,
+      "loss": 0.5059,
+      "step": 588
+    },
+    {
+      "epoch": 0.96,
+      "grad_norm": 0.2460879236459732,
+      "learning_rate": 0.00015644628099173552,
+      "loss": 0.6495,
+      "step": 589
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.30693721771240234,
+      "learning_rate": 0.000156198347107438,
+      "loss": 0.5165,
+      "step": 590
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.2171495109796524,
+      "learning_rate": 0.00015595041322314046,
+      "loss": 0.6172,
+      "step": 591
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.24301984906196594,
+      "learning_rate": 0.00015570247933884294,
+      "loss": 0.6786,
+      "step": 592
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.2288222461938858,
+      "learning_rate": 0.00015545454545454546,
+      "loss": 0.5669,
+      "step": 593
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.2407921552658081,
+      "learning_rate": 0.00015520661157024794,
+      "loss": 0.5968,
+      "step": 594
+    },
+    {
+      "epoch": 0.97,
+      "grad_norm": 0.2591527998447418,
+      "learning_rate": 0.0001549586776859504,
+      "loss": 0.544,
+      "step": 595
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.25770679116249084,
+      "learning_rate": 0.00015471074380165288,
+      "loss": 0.7177,
+      "step": 596
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.2528848648071289,
+      "learning_rate": 0.00015446280991735536,
+      "loss": 0.4703,
+      "step": 597
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.24993537366390228,
+      "learning_rate": 0.00015421487603305785,
+      "loss": 0.6003,
+      "step": 598
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.25807908177375793,
+      "learning_rate": 0.00015396694214876033,
+      "loss": 0.465,
+      "step": 599
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.3142452836036682,
+      "learning_rate": 0.0001537190082644628,
+      "loss": 0.6122,
+      "step": 600
+    },
+    {
+      "epoch": 0.98,
+      "grad_norm": 0.27111849188804626,
+      "learning_rate": 0.00015347107438016527,
+      "loss": 0.5962,
+      "step": 601
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.28503674268722534,
+      "learning_rate": 0.00015322314049586775,
+      "loss": 0.6667,
+      "step": 602
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.27074381709098816,
+      "learning_rate": 0.00015297520661157024,
+      "loss": 0.6115,
+      "step": 603
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.25918465852737427,
+      "learning_rate": 0.0001527272727272727,
+      "loss": 0.4483,
+      "step": 604
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.24476633965969086,
+      "learning_rate": 0.00015247933884297518,
+      "loss": 0.6501,
+      "step": 605
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.21205200254917145,
+      "learning_rate": 0.00015223140495867766,
+      "loss": 0.3914,
+      "step": 606
+    },
+    {
+      "epoch": 0.99,
+      "grad_norm": 0.25496751070022583,
+      "learning_rate": 0.00015198347107438017,
+      "loss": 0.5335,
+      "step": 607
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.27991780638694763,
+      "learning_rate": 0.00015173553719008266,
+      "loss": 0.6083,
+      "step": 608
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.23995639383792877,
+      "learning_rate": 0.0001514876033057851,
+      "loss": 0.55,
+      "step": 609
+    },
+    {
+      "epoch": 1.0,
+      "grad_norm": 0.2349666953086853,
+      "learning_rate": 0.0001512396694214876,
+      "loss": 0.7054,
+      "step": 610
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 1220,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 610,
+  "total_flos": 1.313101299619971e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}