diff --git "a/adapter/checkpoint-610/trainer_state.json" "b/adapter/checkpoint-610/trainer_state.json" new file mode 100644--- /dev/null +++ "b/adapter/checkpoint-610/trainer_state.json" @@ -0,0 +1,4291 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9987720016373312, + "eval_steps": 500, + "global_step": 610, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.3300960063934326, + "learning_rate": 2.9999999999999997e-05, + "loss": 0.9966, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.4113194942474365, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.1253, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 0.2486647665500641, + "learning_rate": 8.999999999999999e-05, + "loss": 1.0721, + "step": 3 + }, + { + "epoch": 0.01, + "grad_norm": 0.2249160259962082, + "learning_rate": 0.00011999999999999999, + "loss": 0.9033, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 0.3706735074520111, + "learning_rate": 0.00015, + "loss": 1.0498, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 0.28104931116104126, + "learning_rate": 0.00017999999999999998, + "loss": 0.9108, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 0.27497801184654236, + "learning_rate": 0.00020999999999999998, + "loss": 0.9038, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 0.30283215641975403, + "learning_rate": 0.00023999999999999998, + "loss": 0.8605, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 0.33457252383232117, + "learning_rate": 0.00027, + "loss": 0.9049, + "step": 9 + }, + { + "epoch": 0.02, + "grad_norm": 0.37725692987442017, + "learning_rate": 0.0003, + "loss": 0.772, + "step": 10 + }, + { + "epoch": 0.02, + "grad_norm": 0.27986466884613037, + "learning_rate": 0.00029975206611570246, + "loss": 0.7666, + "step": 11 + }, + { + "epoch": 0.02, + "grad_norm": 0.30687034130096436, + "learning_rate": 0.00029950413223140494, + "loss": 0.8312, + "step": 12 + }, + { + "epoch": 0.02, + "grad_norm": 0.3321741819381714, + "learning_rate": 0.0002992561983471074, + "loss": 0.8308, + "step": 13 + }, + { + "epoch": 0.02, + "grad_norm": 0.29080134630203247, + "learning_rate": 0.0002990082644628099, + "loss": 0.7597, + "step": 14 + }, + { + "epoch": 0.02, + "grad_norm": 0.33823856711387634, + "learning_rate": 0.0002987603305785124, + "loss": 0.8693, + "step": 15 + }, + { + "epoch": 0.03, + "grad_norm": 0.3461182117462158, + "learning_rate": 0.0002985123966942149, + "loss": 1.0571, + "step": 16 + }, + { + "epoch": 0.03, + "grad_norm": 0.22306275367736816, + "learning_rate": 0.0002982644628099173, + "loss": 0.7706, + "step": 17 + }, + { + "epoch": 0.03, + "grad_norm": 154.4940643310547, + "learning_rate": 0.0002980165289256198, + "loss": 2.6519, + "step": 18 + }, + { + "epoch": 0.03, + "grad_norm": 0.22956405580043793, + "learning_rate": 0.00029776859504132227, + "loss": 0.6897, + "step": 19 + }, + { + "epoch": 0.03, + "grad_norm": 0.25711989402770996, + "learning_rate": 0.00029752066115702476, + "loss": 0.7338, + "step": 20 + }, + { + "epoch": 0.03, + "grad_norm": 0.2565441131591797, + "learning_rate": 0.00029727272727272724, + "loss": 0.8211, + "step": 21 + }, + { + "epoch": 0.04, + "grad_norm": 0.2437434047460556, + "learning_rate": 0.0002970247933884297, + "loss": 0.8027, + "step": 22 + }, + { + "epoch": 0.04, + "grad_norm": 0.21284469962120056, + "learning_rate": 0.0002967768595041322, + "loss": 0.7944, + "step": 23 + }, + { + "epoch": 0.04, + "grad_norm": 0.23338356614112854, + "learning_rate": 0.0002965289256198347, + "loss": 0.7696, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.25512659549713135, + "learning_rate": 0.0002962809917355372, + "loss": 0.7693, + "step": 25 + }, + { + "epoch": 0.04, + "grad_norm": 0.19500921666622162, + "learning_rate": 0.0002960330578512396, + "loss": 0.7599, + "step": 26 + }, + { + "epoch": 0.04, + "grad_norm": 0.2554054260253906, + "learning_rate": 0.00029578512396694214, + "loss": 0.966, + "step": 27 + }, + { + "epoch": 0.05, + "grad_norm": 0.17682747542858124, + "learning_rate": 0.0002955371900826446, + "loss": 0.676, + "step": 28 + }, + { + "epoch": 0.05, + "grad_norm": 0.20516635477542877, + "learning_rate": 0.0002952892561983471, + "loss": 0.8144, + "step": 29 + }, + { + "epoch": 0.05, + "grad_norm": 0.3275119662284851, + "learning_rate": 0.0002950413223140496, + "loss": 0.7704, + "step": 30 + }, + { + "epoch": 0.05, + "grad_norm": 0.22231778502464294, + "learning_rate": 0.000294793388429752, + "loss": 0.7614, + "step": 31 + }, + { + "epoch": 0.05, + "grad_norm": 0.17065812647342682, + "learning_rate": 0.0002945454545454545, + "loss": 0.5634, + "step": 32 + }, + { + "epoch": 0.05, + "grad_norm": 0.1771956831216812, + "learning_rate": 0.000294297520661157, + "loss": 0.7607, + "step": 33 + }, + { + "epoch": 0.06, + "grad_norm": 0.26693442463874817, + "learning_rate": 0.00029404958677685947, + "loss": 0.8171, + "step": 34 + }, + { + "epoch": 0.06, + "grad_norm": 1.409070611000061, + "learning_rate": 0.00029380165289256196, + "loss": 0.7791, + "step": 35 + }, + { + "epoch": 0.06, + "grad_norm": 0.20727217197418213, + "learning_rate": 0.00029355371900826444, + "loss": 0.7357, + "step": 36 + }, + { + "epoch": 0.06, + "grad_norm": 0.2145707905292511, + "learning_rate": 0.0002933057851239669, + "loss": 0.8458, + "step": 37 + }, + { + "epoch": 0.06, + "grad_norm": 0.2068527340888977, + "learning_rate": 0.0002930578512396694, + "loss": 0.78, + "step": 38 + }, + { + "epoch": 0.06, + "grad_norm": 0.22432388365268707, + "learning_rate": 0.00029280991735537184, + "loss": 0.8523, + "step": 39 + }, + { + "epoch": 0.07, + "grad_norm": 0.19982610642910004, + "learning_rate": 0.0002925619834710743, + "loss": 0.7372, + "step": 40 + }, + { + "epoch": 0.07, + "grad_norm": 6.248472213745117, + "learning_rate": 0.00029231404958677686, + "loss": 0.7399, + "step": 41 + }, + { + "epoch": 0.07, + "grad_norm": 0.2269737422466278, + "learning_rate": 0.00029206611570247934, + "loss": 0.7842, + "step": 42 + }, + { + "epoch": 0.07, + "grad_norm": 0.23117898404598236, + "learning_rate": 0.0002918181818181818, + "loss": 0.7111, + "step": 43 + }, + { + "epoch": 0.07, + "grad_norm": 0.22466522455215454, + "learning_rate": 0.00029157024793388425, + "loss": 0.8979, + "step": 44 + }, + { + "epoch": 0.07, + "grad_norm": 0.20770332217216492, + "learning_rate": 0.00029132231404958674, + "loss": 0.774, + "step": 45 + }, + { + "epoch": 0.08, + "grad_norm": 0.2376495748758316, + "learning_rate": 0.0002910743801652892, + "loss": 0.7216, + "step": 46 + }, + { + "epoch": 0.08, + "grad_norm": 0.2470778226852417, + "learning_rate": 0.0002908264462809917, + "loss": 0.7369, + "step": 47 + }, + { + "epoch": 0.08, + "grad_norm": 0.23465900123119354, + "learning_rate": 0.0002905785123966942, + "loss": 0.7528, + "step": 48 + }, + { + "epoch": 0.08, + "grad_norm": 0.5718627572059631, + "learning_rate": 0.00029033057851239667, + "loss": 0.7535, + "step": 49 + }, + { + "epoch": 0.08, + "grad_norm": 0.21493370831012726, + "learning_rate": 0.00029008264462809916, + "loss": 0.8593, + "step": 50 + }, + { + "epoch": 0.08, + "grad_norm": 0.21197210252285004, + "learning_rate": 0.00028983471074380164, + "loss": 0.8013, + "step": 51 + }, + { + "epoch": 0.09, + "grad_norm": 0.20836398005485535, + "learning_rate": 0.0002895867768595041, + "loss": 0.7905, + "step": 52 + }, + { + "epoch": 0.09, + "grad_norm": 0.2096678912639618, + "learning_rate": 0.00028933884297520655, + "loss": 0.6754, + "step": 53 + }, + { + "epoch": 0.09, + "grad_norm": 0.25898435711860657, + "learning_rate": 0.00028909090909090904, + "loss": 0.7725, + "step": 54 + }, + { + "epoch": 0.09, + "grad_norm": 0.23370735347270966, + "learning_rate": 0.0002888429752066116, + "loss": 0.7007, + "step": 55 + }, + { + "epoch": 0.09, + "grad_norm": 0.23006942868232727, + "learning_rate": 0.00028859504132231406, + "loss": 0.7534, + "step": 56 + }, + { + "epoch": 0.09, + "grad_norm": 0.20855402946472168, + "learning_rate": 0.0002883471074380165, + "loss": 0.9491, + "step": 57 + }, + { + "epoch": 0.09, + "grad_norm": 0.24340493977069855, + "learning_rate": 0.00028809917355371897, + "loss": 0.8089, + "step": 58 + }, + { + "epoch": 0.1, + "grad_norm": 0.20169466733932495, + "learning_rate": 0.00028785123966942145, + "loss": 0.64, + "step": 59 + }, + { + "epoch": 0.1, + "grad_norm": 0.23272906243801117, + "learning_rate": 0.00028760330578512394, + "loss": 0.8456, + "step": 60 + }, + { + "epoch": 0.1, + "grad_norm": 0.1767100691795349, + "learning_rate": 0.0002873553719008264, + "loss": 0.6686, + "step": 61 + }, + { + "epoch": 0.1, + "grad_norm": 0.24511106312274933, + "learning_rate": 0.0002871074380165289, + "loss": 0.6998, + "step": 62 + }, + { + "epoch": 0.1, + "grad_norm": 0.22284479439258575, + "learning_rate": 0.0002868595041322314, + "loss": 0.6699, + "step": 63 + }, + { + "epoch": 0.1, + "grad_norm": 0.21842750906944275, + "learning_rate": 0.00028661157024793387, + "loss": 0.7413, + "step": 64 + }, + { + "epoch": 0.11, + "grad_norm": 0.2669163644313812, + "learning_rate": 0.00028636363636363636, + "loss": 0.931, + "step": 65 + }, + { + "epoch": 0.11, + "grad_norm": 0.1864808052778244, + "learning_rate": 0.0002861157024793388, + "loss": 0.5652, + "step": 66 + }, + { + "epoch": 0.11, + "grad_norm": 0.18369853496551514, + "learning_rate": 0.00028586776859504127, + "loss": 0.6847, + "step": 67 + }, + { + "epoch": 0.11, + "grad_norm": 0.22353056073188782, + "learning_rate": 0.00028561983471074375, + "loss": 0.598, + "step": 68 + }, + { + "epoch": 0.11, + "grad_norm": 0.20269523561000824, + "learning_rate": 0.0002853719008264463, + "loss": 0.8688, + "step": 69 + }, + { + "epoch": 0.11, + "grad_norm": 0.2291198968887329, + "learning_rate": 0.0002851239669421488, + "loss": 0.7535, + "step": 70 + }, + { + "epoch": 0.12, + "grad_norm": 0.22033120691776276, + "learning_rate": 0.0002848760330578512, + "loss": 0.8377, + "step": 71 + }, + { + "epoch": 0.12, + "grad_norm": 0.2687983214855194, + "learning_rate": 0.0002846280991735537, + "loss": 0.6926, + "step": 72 + }, + { + "epoch": 0.12, + "grad_norm": 0.1933681070804596, + "learning_rate": 0.00028438016528925617, + "loss": 0.6276, + "step": 73 + }, + { + "epoch": 0.12, + "grad_norm": 0.2820705473423004, + "learning_rate": 0.00028413223140495865, + "loss": 0.848, + "step": 74 + }, + { + "epoch": 0.12, + "grad_norm": 0.19532324373722076, + "learning_rate": 0.00028388429752066114, + "loss": 0.6198, + "step": 75 + }, + { + "epoch": 0.12, + "grad_norm": 0.25057846307754517, + "learning_rate": 0.0002836363636363636, + "loss": 0.6838, + "step": 76 + }, + { + "epoch": 0.13, + "grad_norm": 0.2168462574481964, + "learning_rate": 0.0002833884297520661, + "loss": 0.7885, + "step": 77 + }, + { + "epoch": 0.13, + "grad_norm": 0.2106674313545227, + "learning_rate": 0.0002831404958677686, + "loss": 0.6757, + "step": 78 + }, + { + "epoch": 0.13, + "grad_norm": 0.24460363388061523, + "learning_rate": 0.000282892561983471, + "loss": 0.7414, + "step": 79 + }, + { + "epoch": 0.13, + "grad_norm": 0.3706071078777313, + "learning_rate": 0.0002826446280991735, + "loss": 0.621, + "step": 80 + }, + { + "epoch": 0.13, + "grad_norm": 0.2251998782157898, + "learning_rate": 0.000282396694214876, + "loss": 0.7453, + "step": 81 + }, + { + "epoch": 0.13, + "grad_norm": 0.24521738290786743, + "learning_rate": 0.00028214876033057847, + "loss": 0.6985, + "step": 82 + }, + { + "epoch": 0.14, + "grad_norm": 0.2262742966413498, + "learning_rate": 0.000281900826446281, + "loss": 0.6316, + "step": 83 + }, + { + "epoch": 0.14, + "grad_norm": 0.19723354279994965, + "learning_rate": 0.00028165289256198344, + "loss": 0.4798, + "step": 84 + }, + { + "epoch": 0.14, + "grad_norm": 0.20684833824634552, + "learning_rate": 0.0002814049586776859, + "loss": 0.7993, + "step": 85 + }, + { + "epoch": 0.14, + "grad_norm": 0.19534814357757568, + "learning_rate": 0.0002811570247933884, + "loss": 0.7735, + "step": 86 + }, + { + "epoch": 0.14, + "grad_norm": 0.2585545480251312, + "learning_rate": 0.0002809090909090909, + "loss": 0.8126, + "step": 87 + }, + { + "epoch": 0.14, + "grad_norm": 0.2510583996772766, + "learning_rate": 0.00028066115702479337, + "loss": 0.6973, + "step": 88 + }, + { + "epoch": 0.15, + "grad_norm": 0.1884051263332367, + "learning_rate": 0.00028041322314049585, + "loss": 0.701, + "step": 89 + }, + { + "epoch": 0.15, + "grad_norm": 0.2526257038116455, + "learning_rate": 0.00028016528925619834, + "loss": 0.7132, + "step": 90 + }, + { + "epoch": 0.15, + "grad_norm": 0.200734481215477, + "learning_rate": 0.0002799173553719008, + "loss": 0.7024, + "step": 91 + }, + { + "epoch": 0.15, + "grad_norm": 0.2404022514820099, + "learning_rate": 0.0002796694214876033, + "loss": 0.704, + "step": 92 + }, + { + "epoch": 0.15, + "grad_norm": 0.23063871264457703, + "learning_rate": 0.00027942148760330573, + "loss": 0.6312, + "step": 93 + }, + { + "epoch": 0.15, + "grad_norm": 0.1759747564792633, + "learning_rate": 0.0002791735537190082, + "loss": 0.6577, + "step": 94 + }, + { + "epoch": 0.16, + "grad_norm": 0.2009582370519638, + "learning_rate": 0.0002789256198347107, + "loss": 0.8036, + "step": 95 + }, + { + "epoch": 0.16, + "grad_norm": 0.2200164943933487, + "learning_rate": 0.0002786776859504132, + "loss": 0.7101, + "step": 96 + }, + { + "epoch": 0.16, + "grad_norm": 0.19693537056446075, + "learning_rate": 0.00027842975206611567, + "loss": 0.6221, + "step": 97 + }, + { + "epoch": 0.16, + "grad_norm": 0.23269779980182648, + "learning_rate": 0.00027818181818181815, + "loss": 0.8264, + "step": 98 + }, + { + "epoch": 0.16, + "grad_norm": 0.2440226823091507, + "learning_rate": 0.00027793388429752064, + "loss": 0.8051, + "step": 99 + }, + { + "epoch": 0.16, + "grad_norm": 0.2307034134864807, + "learning_rate": 0.0002776859504132231, + "loss": 0.631, + "step": 100 + }, + { + "epoch": 0.17, + "grad_norm": 0.2530567944049835, + "learning_rate": 0.0002774380165289256, + "loss": 0.8616, + "step": 101 + }, + { + "epoch": 0.17, + "grad_norm": 0.2808806300163269, + "learning_rate": 0.0002771900826446281, + "loss": 0.8333, + "step": 102 + }, + { + "epoch": 0.17, + "grad_norm": 0.20667941868305206, + "learning_rate": 0.00027694214876033057, + "loss": 0.7212, + "step": 103 + }, + { + "epoch": 0.17, + "grad_norm": 0.17540781199932098, + "learning_rate": 0.00027669421487603305, + "loss": 0.5964, + "step": 104 + }, + { + "epoch": 0.17, + "grad_norm": 0.2526637613773346, + "learning_rate": 0.00027644628099173554, + "loss": 0.6868, + "step": 105 + }, + { + "epoch": 0.17, + "grad_norm": 0.2137339860200882, + "learning_rate": 0.00027619834710743797, + "loss": 0.6155, + "step": 106 + }, + { + "epoch": 0.18, + "grad_norm": 0.21061092615127563, + "learning_rate": 0.00027595041322314045, + "loss": 0.813, + "step": 107 + }, + { + "epoch": 0.18, + "grad_norm": 0.21619191765785217, + "learning_rate": 0.00027570247933884293, + "loss": 0.8046, + "step": 108 + }, + { + "epoch": 0.18, + "grad_norm": 0.2212170660495758, + "learning_rate": 0.0002754545454545454, + "loss": 0.6706, + "step": 109 + }, + { + "epoch": 0.18, + "grad_norm": 0.23427413403987885, + "learning_rate": 0.0002752066115702479, + "loss": 0.7152, + "step": 110 + }, + { + "epoch": 0.18, + "grad_norm": 0.20566123723983765, + "learning_rate": 0.0002749586776859504, + "loss": 0.6568, + "step": 111 + }, + { + "epoch": 0.18, + "grad_norm": 0.22977930307388306, + "learning_rate": 0.00027471074380165287, + "loss": 0.7832, + "step": 112 + }, + { + "epoch": 0.19, + "grad_norm": 0.28307485580444336, + "learning_rate": 0.00027446280991735535, + "loss": 0.7446, + "step": 113 + }, + { + "epoch": 0.19, + "grad_norm": 0.19567596912384033, + "learning_rate": 0.00027421487603305784, + "loss": 0.6394, + "step": 114 + }, + { + "epoch": 0.19, + "grad_norm": 0.24577689170837402, + "learning_rate": 0.0002739669421487603, + "loss": 0.6389, + "step": 115 + }, + { + "epoch": 0.19, + "grad_norm": 0.2180463820695877, + "learning_rate": 0.0002737190082644628, + "loss": 0.7814, + "step": 116 + }, + { + "epoch": 0.19, + "grad_norm": 0.19546380639076233, + "learning_rate": 0.0002734710743801653, + "loss": 0.8312, + "step": 117 + }, + { + "epoch": 0.19, + "grad_norm": 0.22698360681533813, + "learning_rate": 0.00027322314049586777, + "loss": 0.7443, + "step": 118 + }, + { + "epoch": 0.19, + "grad_norm": 0.22987066209316254, + "learning_rate": 0.0002729752066115702, + "loss": 0.7839, + "step": 119 + }, + { + "epoch": 0.2, + "grad_norm": 0.20548178255558014, + "learning_rate": 0.0002727272727272727, + "loss": 0.7805, + "step": 120 + }, + { + "epoch": 0.2, + "grad_norm": 0.2477702796459198, + "learning_rate": 0.00027247933884297517, + "loss": 0.5694, + "step": 121 + }, + { + "epoch": 0.2, + "grad_norm": 0.20593340694904327, + "learning_rate": 0.00027223140495867765, + "loss": 0.6479, + "step": 122 + }, + { + "epoch": 0.2, + "grad_norm": 0.23635917901992798, + "learning_rate": 0.00027198347107438013, + "loss": 0.8107, + "step": 123 + }, + { + "epoch": 0.2, + "grad_norm": 0.25808119773864746, + "learning_rate": 0.0002717355371900826, + "loss": 0.7876, + "step": 124 + }, + { + "epoch": 0.2, + "grad_norm": 0.22156469523906708, + "learning_rate": 0.0002714876033057851, + "loss": 0.7261, + "step": 125 + }, + { + "epoch": 0.21, + "grad_norm": 0.19892215728759766, + "learning_rate": 0.0002712396694214876, + "loss": 0.6874, + "step": 126 + }, + { + "epoch": 0.21, + "grad_norm": 0.24936752021312714, + "learning_rate": 0.00027099173553719007, + "loss": 0.6155, + "step": 127 + }, + { + "epoch": 0.21, + "grad_norm": 0.23287539184093475, + "learning_rate": 0.0002707438016528925, + "loss": 0.602, + "step": 128 + }, + { + "epoch": 0.21, + "grad_norm": 0.2086639404296875, + "learning_rate": 0.00027049586776859504, + "loss": 0.7198, + "step": 129 + }, + { + "epoch": 0.21, + "grad_norm": 0.24974922835826874, + "learning_rate": 0.0002702479338842975, + "loss": 0.6873, + "step": 130 + }, + { + "epoch": 0.21, + "grad_norm": 0.2066827118396759, + "learning_rate": 0.00027, + "loss": 0.5821, + "step": 131 + }, + { + "epoch": 0.22, + "grad_norm": 0.28004395961761475, + "learning_rate": 0.0002697520661157025, + "loss": 0.7864, + "step": 132 + }, + { + "epoch": 0.22, + "grad_norm": 0.22391608357429504, + "learning_rate": 0.0002695041322314049, + "loss": 0.6773, + "step": 133 + }, + { + "epoch": 0.22, + "grad_norm": 0.2821199297904968, + "learning_rate": 0.0002692561983471074, + "loss": 0.6806, + "step": 134 + }, + { + "epoch": 0.22, + "grad_norm": 0.21736428141593933, + "learning_rate": 0.0002690082644628099, + "loss": 0.6662, + "step": 135 + }, + { + "epoch": 0.22, + "grad_norm": 0.23889939486980438, + "learning_rate": 0.00026876033057851237, + "loss": 0.6356, + "step": 136 + }, + { + "epoch": 0.22, + "grad_norm": 0.21096719801425934, + "learning_rate": 0.00026851239669421485, + "loss": 0.6762, + "step": 137 + }, + { + "epoch": 0.23, + "grad_norm": 0.22622421383857727, + "learning_rate": 0.00026826446280991733, + "loss": 0.8085, + "step": 138 + }, + { + "epoch": 0.23, + "grad_norm": 0.19824957847595215, + "learning_rate": 0.0002680165289256198, + "loss": 0.6031, + "step": 139 + }, + { + "epoch": 0.23, + "grad_norm": 0.24482691287994385, + "learning_rate": 0.0002677685950413223, + "loss": 0.6649, + "step": 140 + }, + { + "epoch": 0.23, + "grad_norm": 0.21291929483413696, + "learning_rate": 0.0002675206611570248, + "loss": 0.6671, + "step": 141 + }, + { + "epoch": 0.23, + "grad_norm": 0.2202674299478531, + "learning_rate": 0.0002672727272727272, + "loss": 0.6469, + "step": 142 + }, + { + "epoch": 0.23, + "grad_norm": 0.23572632670402527, + "learning_rate": 0.0002670247933884297, + "loss": 0.7377, + "step": 143 + }, + { + "epoch": 0.24, + "grad_norm": 0.2051907777786255, + "learning_rate": 0.00026677685950413224, + "loss": 0.6217, + "step": 144 + }, + { + "epoch": 0.24, + "grad_norm": 0.23270072042942047, + "learning_rate": 0.0002665289256198347, + "loss": 0.7933, + "step": 145 + }, + { + "epoch": 0.24, + "grad_norm": 0.20652809739112854, + "learning_rate": 0.00026628099173553715, + "loss": 0.6007, + "step": 146 + }, + { + "epoch": 0.24, + "grad_norm": 0.23084674775600433, + "learning_rate": 0.00026603305785123963, + "loss": 0.701, + "step": 147 + }, + { + "epoch": 0.24, + "grad_norm": 0.25663891434669495, + "learning_rate": 0.0002657851239669421, + "loss": 0.7271, + "step": 148 + }, + { + "epoch": 0.24, + "grad_norm": 0.25880497694015503, + "learning_rate": 0.0002655371900826446, + "loss": 0.6562, + "step": 149 + }, + { + "epoch": 0.25, + "grad_norm": 0.19349205493927002, + "learning_rate": 0.0002652892561983471, + "loss": 0.5016, + "step": 150 + }, + { + "epoch": 0.25, + "grad_norm": 0.2401740401983261, + "learning_rate": 0.00026504132231404957, + "loss": 0.6978, + "step": 151 + }, + { + "epoch": 0.25, + "grad_norm": 0.19495394825935364, + "learning_rate": 0.00026479338842975205, + "loss": 0.5562, + "step": 152 + }, + { + "epoch": 0.25, + "grad_norm": 0.21485286951065063, + "learning_rate": 0.00026454545454545453, + "loss": 0.7847, + "step": 153 + }, + { + "epoch": 0.25, + "grad_norm": 0.241348534822464, + "learning_rate": 0.000264297520661157, + "loss": 0.7513, + "step": 154 + }, + { + "epoch": 0.25, + "grad_norm": 0.3316986858844757, + "learning_rate": 0.00026404958677685945, + "loss": 0.664, + "step": 155 + }, + { + "epoch": 0.26, + "grad_norm": 0.2419958859682083, + "learning_rate": 0.00026380165289256193, + "loss": 0.7322, + "step": 156 + }, + { + "epoch": 0.26, + "grad_norm": 0.2868640124797821, + "learning_rate": 0.0002635537190082644, + "loss": 0.7004, + "step": 157 + }, + { + "epoch": 0.26, + "grad_norm": 0.24806949496269226, + "learning_rate": 0.00026330578512396695, + "loss": 0.6497, + "step": 158 + }, + { + "epoch": 0.26, + "grad_norm": 0.23873400688171387, + "learning_rate": 0.00026305785123966944, + "loss": 0.7543, + "step": 159 + }, + { + "epoch": 0.26, + "grad_norm": 0.2480355203151703, + "learning_rate": 0.00026280991735537187, + "loss": 0.6048, + "step": 160 + }, + { + "epoch": 0.26, + "grad_norm": 0.2619112730026245, + "learning_rate": 0.00026256198347107435, + "loss": 0.762, + "step": 161 + }, + { + "epoch": 0.27, + "grad_norm": 0.22763262689113617, + "learning_rate": 0.00026231404958677683, + "loss": 0.6557, + "step": 162 + }, + { + "epoch": 0.27, + "grad_norm": 0.3291528522968292, + "learning_rate": 0.0002620661157024793, + "loss": 0.7059, + "step": 163 + }, + { + "epoch": 0.27, + "grad_norm": 0.2959338426589966, + "learning_rate": 0.0002618181818181818, + "loss": 0.6622, + "step": 164 + }, + { + "epoch": 0.27, + "grad_norm": 0.23001112043857574, + "learning_rate": 0.0002615702479338843, + "loss": 0.6465, + "step": 165 + }, + { + "epoch": 0.27, + "grad_norm": 0.1998877376317978, + "learning_rate": 0.00026132231404958677, + "loss": 0.666, + "step": 166 + }, + { + "epoch": 0.27, + "grad_norm": 0.23009613156318665, + "learning_rate": 0.00026107438016528925, + "loss": 0.8793, + "step": 167 + }, + { + "epoch": 0.28, + "grad_norm": 0.24525685608386993, + "learning_rate": 0.0002608264462809917, + "loss": 0.8009, + "step": 168 + }, + { + "epoch": 0.28, + "grad_norm": 0.21605077385902405, + "learning_rate": 0.00026057851239669416, + "loss": 0.5459, + "step": 169 + }, + { + "epoch": 0.28, + "grad_norm": 0.2576725482940674, + "learning_rate": 0.00026033057851239665, + "loss": 0.6818, + "step": 170 + }, + { + "epoch": 0.28, + "grad_norm": 0.23385170102119446, + "learning_rate": 0.00026008264462809913, + "loss": 0.7559, + "step": 171 + }, + { + "epoch": 0.28, + "grad_norm": 0.1973017454147339, + "learning_rate": 0.00025983471074380167, + "loss": 0.6798, + "step": 172 + }, + { + "epoch": 0.28, + "grad_norm": 0.22262559831142426, + "learning_rate": 0.0002595867768595041, + "loss": 0.5566, + "step": 173 + }, + { + "epoch": 0.28, + "grad_norm": 0.23010462522506714, + "learning_rate": 0.0002593388429752066, + "loss": 0.7101, + "step": 174 + }, + { + "epoch": 0.29, + "grad_norm": 0.21676452457904816, + "learning_rate": 0.00025909090909090907, + "loss": 0.7038, + "step": 175 + }, + { + "epoch": 0.29, + "grad_norm": 0.22475261986255646, + "learning_rate": 0.00025884297520661155, + "loss": 0.7812, + "step": 176 + }, + { + "epoch": 0.29, + "grad_norm": 0.28893202543258667, + "learning_rate": 0.00025859504132231403, + "loss": 0.5925, + "step": 177 + }, + { + "epoch": 0.29, + "grad_norm": 0.22777552902698517, + "learning_rate": 0.0002583471074380165, + "loss": 0.7319, + "step": 178 + }, + { + "epoch": 0.29, + "grad_norm": 0.2287953644990921, + "learning_rate": 0.000258099173553719, + "loss": 0.7775, + "step": 179 + }, + { + "epoch": 0.29, + "grad_norm": 0.2049843668937683, + "learning_rate": 0.0002578512396694215, + "loss": 0.7448, + "step": 180 + }, + { + "epoch": 0.3, + "grad_norm": 0.22585280239582062, + "learning_rate": 0.00025760330578512397, + "loss": 0.59, + "step": 181 + }, + { + "epoch": 0.3, + "grad_norm": 0.23159150779247284, + "learning_rate": 0.0002573553719008264, + "loss": 0.737, + "step": 182 + }, + { + "epoch": 0.3, + "grad_norm": 0.3393082320690155, + "learning_rate": 0.0002571074380165289, + "loss": 0.6948, + "step": 183 + }, + { + "epoch": 0.3, + "grad_norm": 0.2345617413520813, + "learning_rate": 0.00025685950413223136, + "loss": 0.6351, + "step": 184 + }, + { + "epoch": 0.3, + "grad_norm": 0.23474591970443726, + "learning_rate": 0.00025661157024793385, + "loss": 0.6643, + "step": 185 + }, + { + "epoch": 0.3, + "grad_norm": 0.2473030984401703, + "learning_rate": 0.00025636363636363633, + "loss": 0.7663, + "step": 186 + }, + { + "epoch": 0.31, + "grad_norm": 0.2971685230731964, + "learning_rate": 0.0002561157024793388, + "loss": 0.7449, + "step": 187 + }, + { + "epoch": 0.31, + "grad_norm": 0.2745087742805481, + "learning_rate": 0.0002558677685950413, + "loss": 0.6125, + "step": 188 + }, + { + "epoch": 0.31, + "grad_norm": 0.23520545661449432, + "learning_rate": 0.0002556198347107438, + "loss": 0.573, + "step": 189 + }, + { + "epoch": 0.31, + "grad_norm": 0.2955464720726013, + "learning_rate": 0.00025537190082644627, + "loss": 0.5315, + "step": 190 + }, + { + "epoch": 0.31, + "grad_norm": 0.23987281322479248, + "learning_rate": 0.00025512396694214875, + "loss": 0.5636, + "step": 191 + }, + { + "epoch": 0.31, + "grad_norm": 0.24263744056224823, + "learning_rate": 0.00025487603305785123, + "loss": 0.6047, + "step": 192 + }, + { + "epoch": 0.32, + "grad_norm": 0.26061922311782837, + "learning_rate": 0.0002546280991735537, + "loss": 0.7812, + "step": 193 + }, + { + "epoch": 0.32, + "grad_norm": 0.2458687126636505, + "learning_rate": 0.0002543801652892562, + "loss": 0.58, + "step": 194 + }, + { + "epoch": 0.32, + "grad_norm": 0.24598994851112366, + "learning_rate": 0.00025413223140495863, + "loss": 0.7432, + "step": 195 + }, + { + "epoch": 0.32, + "grad_norm": 0.248992919921875, + "learning_rate": 0.0002538842975206611, + "loss": 0.6953, + "step": 196 + }, + { + "epoch": 0.32, + "grad_norm": 0.2518531382083893, + "learning_rate": 0.0002536363636363636, + "loss": 0.6707, + "step": 197 + }, + { + "epoch": 0.32, + "grad_norm": 0.23844210803508759, + "learning_rate": 0.0002533884297520661, + "loss": 0.6285, + "step": 198 + }, + { + "epoch": 0.33, + "grad_norm": 0.21948237717151642, + "learning_rate": 0.00025314049586776856, + "loss": 0.6859, + "step": 199 + }, + { + "epoch": 0.33, + "grad_norm": 0.2003835141658783, + "learning_rate": 0.00025289256198347105, + "loss": 0.6305, + "step": 200 + }, + { + "epoch": 0.33, + "grad_norm": 0.23421582579612732, + "learning_rate": 0.00025264462809917353, + "loss": 0.7164, + "step": 201 + }, + { + "epoch": 0.33, + "grad_norm": 0.22344104945659637, + "learning_rate": 0.000252396694214876, + "loss": 0.6498, + "step": 202 + }, + { + "epoch": 0.33, + "grad_norm": 0.17792212963104248, + "learning_rate": 0.0002521487603305785, + "loss": 0.614, + "step": 203 + }, + { + "epoch": 0.33, + "grad_norm": 0.217886820435524, + "learning_rate": 0.000251900826446281, + "loss": 0.7033, + "step": 204 + }, + { + "epoch": 0.34, + "grad_norm": 0.209726020693779, + "learning_rate": 0.00025165289256198347, + "loss": 0.5913, + "step": 205 + }, + { + "epoch": 0.34, + "grad_norm": 0.2401910424232483, + "learning_rate": 0.00025140495867768595, + "loss": 0.6405, + "step": 206 + }, + { + "epoch": 0.34, + "grad_norm": 0.21315626800060272, + "learning_rate": 0.00025115702479338843, + "loss": 0.7369, + "step": 207 + }, + { + "epoch": 0.34, + "grad_norm": 0.20102320611476898, + "learning_rate": 0.00025090909090909086, + "loss": 0.6245, + "step": 208 + }, + { + "epoch": 0.34, + "grad_norm": 0.20447981357574463, + "learning_rate": 0.00025066115702479335, + "loss": 0.5423, + "step": 209 + }, + { + "epoch": 0.34, + "grad_norm": 0.24979281425476074, + "learning_rate": 0.00025041322314049583, + "loss": 0.8078, + "step": 210 + }, + { + "epoch": 0.35, + "grad_norm": 0.20141547918319702, + "learning_rate": 0.0002501652892561983, + "loss": 0.7386, + "step": 211 + }, + { + "epoch": 0.35, + "grad_norm": 0.2538990378379822, + "learning_rate": 0.0002499173553719008, + "loss": 0.7219, + "step": 212 + }, + { + "epoch": 0.35, + "grad_norm": 0.2613961100578308, + "learning_rate": 0.0002496694214876033, + "loss": 0.7903, + "step": 213 + }, + { + "epoch": 0.35, + "grad_norm": 0.24777857959270477, + "learning_rate": 0.00024942148760330576, + "loss": 0.664, + "step": 214 + }, + { + "epoch": 0.35, + "grad_norm": 0.21958425641059875, + "learning_rate": 0.00024917355371900825, + "loss": 0.6755, + "step": 215 + }, + { + "epoch": 0.35, + "grad_norm": 0.2218528538942337, + "learning_rate": 0.00024892561983471073, + "loss": 0.5568, + "step": 216 + }, + { + "epoch": 0.36, + "grad_norm": 0.23632755875587463, + "learning_rate": 0.00024867768595041316, + "loss": 0.6858, + "step": 217 + }, + { + "epoch": 0.36, + "grad_norm": 0.2641279697418213, + "learning_rate": 0.0002484297520661157, + "loss": 0.7783, + "step": 218 + }, + { + "epoch": 0.36, + "grad_norm": 0.3147680163383484, + "learning_rate": 0.0002481818181818182, + "loss": 0.662, + "step": 219 + }, + { + "epoch": 0.36, + "grad_norm": 0.27947697043418884, + "learning_rate": 0.00024793388429752067, + "loss": 0.6477, + "step": 220 + }, + { + "epoch": 0.36, + "grad_norm": 0.2297278195619583, + "learning_rate": 0.00024768595041322315, + "loss": 0.5895, + "step": 221 + }, + { + "epoch": 0.36, + "grad_norm": 0.23085851967334747, + "learning_rate": 0.0002474380165289256, + "loss": 0.5806, + "step": 222 + }, + { + "epoch": 0.37, + "grad_norm": 0.19654251635074615, + "learning_rate": 0.00024719008264462806, + "loss": 0.5942, + "step": 223 + }, + { + "epoch": 0.37, + "grad_norm": 0.2467166632413864, + "learning_rate": 0.00024694214876033055, + "loss": 0.5059, + "step": 224 + }, + { + "epoch": 0.37, + "grad_norm": 0.22614917159080505, + "learning_rate": 0.00024669421487603303, + "loss": 0.643, + "step": 225 + }, + { + "epoch": 0.37, + "grad_norm": 0.2622920274734497, + "learning_rate": 0.0002464462809917355, + "loss": 0.6257, + "step": 226 + }, + { + "epoch": 0.37, + "grad_norm": 0.21843163669109344, + "learning_rate": 0.000246198347107438, + "loss": 0.6057, + "step": 227 + }, + { + "epoch": 0.37, + "grad_norm": 0.2294640988111496, + "learning_rate": 0.0002459504132231405, + "loss": 0.6876, + "step": 228 + }, + { + "epoch": 0.37, + "grad_norm": 0.1791463941335678, + "learning_rate": 0.00024570247933884296, + "loss": 0.5348, + "step": 229 + }, + { + "epoch": 0.38, + "grad_norm": 0.17243699729442596, + "learning_rate": 0.00024545454545454545, + "loss": 0.5966, + "step": 230 + }, + { + "epoch": 0.38, + "grad_norm": 0.22769273817539215, + "learning_rate": 0.0002452066115702479, + "loss": 0.7912, + "step": 231 + }, + { + "epoch": 0.38, + "grad_norm": 0.2325255423784256, + "learning_rate": 0.0002449586776859504, + "loss": 0.7441, + "step": 232 + }, + { + "epoch": 0.38, + "grad_norm": 0.24277740716934204, + "learning_rate": 0.0002447107438016529, + "loss": 0.6653, + "step": 233 + }, + { + "epoch": 0.38, + "grad_norm": 0.21596141159534454, + "learning_rate": 0.0002444628099173554, + "loss": 0.6668, + "step": 234 + }, + { + "epoch": 0.38, + "grad_norm": 0.20814135670661926, + "learning_rate": 0.0002442148760330578, + "loss": 0.6306, + "step": 235 + }, + { + "epoch": 0.39, + "grad_norm": 0.25570017099380493, + "learning_rate": 0.0002439669421487603, + "loss": 0.6524, + "step": 236 + }, + { + "epoch": 0.39, + "grad_norm": 0.2502390146255493, + "learning_rate": 0.00024371900826446278, + "loss": 0.6048, + "step": 237 + }, + { + "epoch": 0.39, + "grad_norm": 0.23688243329524994, + "learning_rate": 0.0002434710743801653, + "loss": 0.568, + "step": 238 + }, + { + "epoch": 0.39, + "grad_norm": 0.21041709184646606, + "learning_rate": 0.00024322314049586777, + "loss": 0.6908, + "step": 239 + }, + { + "epoch": 0.39, + "grad_norm": 0.21656759083271027, + "learning_rate": 0.00024297520661157023, + "loss": 0.4993, + "step": 240 + }, + { + "epoch": 0.39, + "grad_norm": 0.25133028626441956, + "learning_rate": 0.0002427272727272727, + "loss": 0.718, + "step": 241 + }, + { + "epoch": 0.4, + "grad_norm": 0.22228790819644928, + "learning_rate": 0.0002424793388429752, + "loss": 0.6146, + "step": 242 + }, + { + "epoch": 0.4, + "grad_norm": 0.26273205876350403, + "learning_rate": 0.00024223140495867768, + "loss": 0.7459, + "step": 243 + }, + { + "epoch": 0.4, + "grad_norm": 0.2156606763601303, + "learning_rate": 0.00024198347107438014, + "loss": 0.6692, + "step": 244 + }, + { + "epoch": 0.4, + "grad_norm": 0.2075020670890808, + "learning_rate": 0.00024173553719008262, + "loss": 0.6427, + "step": 245 + }, + { + "epoch": 0.4, + "grad_norm": 0.25821176171302795, + "learning_rate": 0.0002414876033057851, + "loss": 0.7964, + "step": 246 + }, + { + "epoch": 0.4, + "grad_norm": 0.23016126453876495, + "learning_rate": 0.0002412396694214876, + "loss": 0.536, + "step": 247 + }, + { + "epoch": 0.41, + "grad_norm": 0.23115016520023346, + "learning_rate": 0.00024099173553719004, + "loss": 0.6053, + "step": 248 + }, + { + "epoch": 0.41, + "grad_norm": 0.18249157071113586, + "learning_rate": 0.00024074380165289253, + "loss": 0.6574, + "step": 249 + }, + { + "epoch": 0.41, + "grad_norm": 0.28391778469085693, + "learning_rate": 0.000240495867768595, + "loss": 0.7152, + "step": 250 + }, + { + "epoch": 0.41, + "grad_norm": 0.2581539452075958, + "learning_rate": 0.0002402479338842975, + "loss": 0.8476, + "step": 251 + }, + { + "epoch": 0.41, + "grad_norm": 0.2304867058992386, + "learning_rate": 0.00023999999999999998, + "loss": 0.5781, + "step": 252 + }, + { + "epoch": 0.41, + "grad_norm": 0.239717036485672, + "learning_rate": 0.00023975206611570244, + "loss": 0.6543, + "step": 253 + }, + { + "epoch": 0.42, + "grad_norm": 0.22493794560432434, + "learning_rate": 0.00023950413223140495, + "loss": 0.7048, + "step": 254 + }, + { + "epoch": 0.42, + "grad_norm": 0.22085991501808167, + "learning_rate": 0.00023925619834710743, + "loss": 0.5572, + "step": 255 + }, + { + "epoch": 0.42, + "grad_norm": 0.35917988419532776, + "learning_rate": 0.0002390082644628099, + "loss": 0.8485, + "step": 256 + }, + { + "epoch": 0.42, + "grad_norm": 0.28269943594932556, + "learning_rate": 0.00023876033057851237, + "loss": 0.5732, + "step": 257 + }, + { + "epoch": 0.42, + "grad_norm": 0.26313093304634094, + "learning_rate": 0.00023851239669421485, + "loss": 0.8212, + "step": 258 + }, + { + "epoch": 0.42, + "grad_norm": 0.30286532640457153, + "learning_rate": 0.00023826446280991734, + "loss": 0.5878, + "step": 259 + }, + { + "epoch": 0.43, + "grad_norm": 0.22270837426185608, + "learning_rate": 0.00023801652892561982, + "loss": 0.6933, + "step": 260 + }, + { + "epoch": 0.43, + "grad_norm": 0.29011014103889465, + "learning_rate": 0.0002377685950413223, + "loss": 0.6188, + "step": 261 + }, + { + "epoch": 0.43, + "grad_norm": 0.2390982061624527, + "learning_rate": 0.00023752066115702476, + "loss": 0.6426, + "step": 262 + }, + { + "epoch": 0.43, + "grad_norm": 0.3416346609592438, + "learning_rate": 0.00023727272727272724, + "loss": 0.8845, + "step": 263 + }, + { + "epoch": 0.43, + "grad_norm": 0.25051388144493103, + "learning_rate": 0.00023702479338842973, + "loss": 0.7286, + "step": 264 + }, + { + "epoch": 0.43, + "grad_norm": 0.2497546523809433, + "learning_rate": 0.0002367768595041322, + "loss": 0.6027, + "step": 265 + }, + { + "epoch": 0.44, + "grad_norm": 0.23835037648677826, + "learning_rate": 0.00023652892561983467, + "loss": 0.7052, + "step": 266 + }, + { + "epoch": 0.44, + "grad_norm": 0.22467398643493652, + "learning_rate": 0.00023628099173553715, + "loss": 0.5806, + "step": 267 + }, + { + "epoch": 0.44, + "grad_norm": 0.2663390338420868, + "learning_rate": 0.00023603305785123964, + "loss": 0.6943, + "step": 268 + }, + { + "epoch": 0.44, + "grad_norm": 0.22997191548347473, + "learning_rate": 0.00023578512396694215, + "loss": 0.6411, + "step": 269 + }, + { + "epoch": 0.44, + "grad_norm": 0.23266558349132538, + "learning_rate": 0.00023553719008264463, + "loss": 0.6068, + "step": 270 + }, + { + "epoch": 0.44, + "grad_norm": 0.2304474264383316, + "learning_rate": 0.00023528925619834709, + "loss": 0.6427, + "step": 271 + }, + { + "epoch": 0.45, + "grad_norm": 0.28231826424598694, + "learning_rate": 0.00023504132231404957, + "loss": 0.8011, + "step": 272 + }, + { + "epoch": 0.45, + "grad_norm": 0.28013259172439575, + "learning_rate": 0.00023479338842975205, + "loss": 0.5988, + "step": 273 + }, + { + "epoch": 0.45, + "grad_norm": 0.22702372074127197, + "learning_rate": 0.00023454545454545454, + "loss": 0.6737, + "step": 274 + }, + { + "epoch": 0.45, + "grad_norm": 0.27958643436431885, + "learning_rate": 0.000234297520661157, + "loss": 0.6621, + "step": 275 + }, + { + "epoch": 0.45, + "grad_norm": 0.23902451992034912, + "learning_rate": 0.00023404958677685948, + "loss": 0.6525, + "step": 276 + }, + { + "epoch": 0.45, + "grad_norm": 0.2778523564338684, + "learning_rate": 0.00023380165289256196, + "loss": 0.6697, + "step": 277 + }, + { + "epoch": 0.46, + "grad_norm": 0.2382276952266693, + "learning_rate": 0.00023355371900826444, + "loss": 0.6281, + "step": 278 + }, + { + "epoch": 0.46, + "grad_norm": 0.24487091600894928, + "learning_rate": 0.00023330578512396693, + "loss": 0.6842, + "step": 279 + }, + { + "epoch": 0.46, + "grad_norm": 0.2063397765159607, + "learning_rate": 0.00023305785123966938, + "loss": 0.6554, + "step": 280 + }, + { + "epoch": 0.46, + "grad_norm": 0.21523278951644897, + "learning_rate": 0.00023280991735537187, + "loss": 0.632, + "step": 281 + }, + { + "epoch": 0.46, + "grad_norm": 0.2420080006122589, + "learning_rate": 0.00023256198347107435, + "loss": 0.6001, + "step": 282 + }, + { + "epoch": 0.46, + "grad_norm": 0.2390110194683075, + "learning_rate": 0.00023231404958677686, + "loss": 0.5648, + "step": 283 + }, + { + "epoch": 0.47, + "grad_norm": 0.24080687761306763, + "learning_rate": 0.0002320661157024793, + "loss": 0.86, + "step": 284 + }, + { + "epoch": 0.47, + "grad_norm": 0.29456445574760437, + "learning_rate": 0.0002318181818181818, + "loss": 0.7418, + "step": 285 + }, + { + "epoch": 0.47, + "grad_norm": 0.23326683044433594, + "learning_rate": 0.00023157024793388429, + "loss": 0.6967, + "step": 286 + }, + { + "epoch": 0.47, + "grad_norm": 0.20866093039512634, + "learning_rate": 0.00023132231404958677, + "loss": 0.5205, + "step": 287 + }, + { + "epoch": 0.47, + "grad_norm": 0.3158474266529083, + "learning_rate": 0.00023107438016528925, + "loss": 0.7879, + "step": 288 + }, + { + "epoch": 0.47, + "grad_norm": 0.2730140686035156, + "learning_rate": 0.0002308264462809917, + "loss": 0.7292, + "step": 289 + }, + { + "epoch": 0.47, + "grad_norm": 0.25384965538978577, + "learning_rate": 0.0002305785123966942, + "loss": 0.7258, + "step": 290 + }, + { + "epoch": 0.48, + "grad_norm": 0.20765069127082825, + "learning_rate": 0.00023033057851239668, + "loss": 0.7108, + "step": 291 + }, + { + "epoch": 0.48, + "grad_norm": 0.25662195682525635, + "learning_rate": 0.00023008264462809916, + "loss": 0.7473, + "step": 292 + }, + { + "epoch": 0.48, + "grad_norm": 0.300243616104126, + "learning_rate": 0.00022983471074380162, + "loss": 0.6902, + "step": 293 + }, + { + "epoch": 0.48, + "grad_norm": 0.23513919115066528, + "learning_rate": 0.0002295867768595041, + "loss": 0.5888, + "step": 294 + }, + { + "epoch": 0.48, + "grad_norm": 0.2077571451663971, + "learning_rate": 0.00022933884297520658, + "loss": 0.6256, + "step": 295 + }, + { + "epoch": 0.48, + "grad_norm": 0.266201376914978, + "learning_rate": 0.00022909090909090907, + "loss": 0.6913, + "step": 296 + }, + { + "epoch": 0.49, + "grad_norm": 0.2239614725112915, + "learning_rate": 0.00022884297520661152, + "loss": 0.7369, + "step": 297 + }, + { + "epoch": 0.49, + "grad_norm": 0.21509824693202972, + "learning_rate": 0.000228595041322314, + "loss": 0.4445, + "step": 298 + }, + { + "epoch": 0.49, + "grad_norm": 0.21956239640712738, + "learning_rate": 0.00022834710743801652, + "loss": 0.6732, + "step": 299 + }, + { + "epoch": 0.49, + "grad_norm": 0.18832357227802277, + "learning_rate": 0.000228099173553719, + "loss": 0.6808, + "step": 300 + }, + { + "epoch": 0.49, + "grad_norm": 0.21115505695343018, + "learning_rate": 0.0002278512396694215, + "loss": 0.5323, + "step": 301 + }, + { + "epoch": 0.49, + "grad_norm": 0.23715418577194214, + "learning_rate": 0.00022760330578512394, + "loss": 0.8333, + "step": 302 + }, + { + "epoch": 0.5, + "grad_norm": 0.29385048151016235, + "learning_rate": 0.00022735537190082643, + "loss": 0.6, + "step": 303 + }, + { + "epoch": 0.5, + "grad_norm": 0.26947689056396484, + "learning_rate": 0.0002271074380165289, + "loss": 0.8788, + "step": 304 + }, + { + "epoch": 0.5, + "grad_norm": 0.2778269946575165, + "learning_rate": 0.0002268595041322314, + "loss": 0.7073, + "step": 305 + }, + { + "epoch": 0.5, + "grad_norm": 0.20938479900360107, + "learning_rate": 0.00022661157024793385, + "loss": 0.6422, + "step": 306 + }, + { + "epoch": 0.5, + "grad_norm": 0.2777106761932373, + "learning_rate": 0.00022636363636363633, + "loss": 0.7495, + "step": 307 + }, + { + "epoch": 0.5, + "grad_norm": 0.20872819423675537, + "learning_rate": 0.00022611570247933882, + "loss": 0.6492, + "step": 308 + }, + { + "epoch": 0.51, + "grad_norm": 0.2752722501754761, + "learning_rate": 0.0002258677685950413, + "loss": 0.6014, + "step": 309 + }, + { + "epoch": 0.51, + "grad_norm": 0.24615786969661713, + "learning_rate": 0.00022561983471074378, + "loss": 0.6287, + "step": 310 + }, + { + "epoch": 0.51, + "grad_norm": 0.24146385490894318, + "learning_rate": 0.00022537190082644624, + "loss": 0.6151, + "step": 311 + }, + { + "epoch": 0.51, + "grad_norm": 0.24762235581874847, + "learning_rate": 0.00022512396694214872, + "loss": 0.6377, + "step": 312 + }, + { + "epoch": 0.51, + "grad_norm": 0.24630331993103027, + "learning_rate": 0.00022487603305785124, + "loss": 0.7255, + "step": 313 + }, + { + "epoch": 0.51, + "grad_norm": 0.2922554612159729, + "learning_rate": 0.00022462809917355372, + "loss": 0.6645, + "step": 314 + }, + { + "epoch": 0.52, + "grad_norm": 0.21686063706874847, + "learning_rate": 0.00022438016528925618, + "loss": 0.5606, + "step": 315 + }, + { + "epoch": 0.52, + "grad_norm": 0.2216208428144455, + "learning_rate": 0.00022413223140495866, + "loss": 0.5126, + "step": 316 + }, + { + "epoch": 0.52, + "grad_norm": 0.25635436177253723, + "learning_rate": 0.00022388429752066114, + "loss": 0.7387, + "step": 317 + }, + { + "epoch": 0.52, + "grad_norm": 0.2786000669002533, + "learning_rate": 0.00022363636363636363, + "loss": 0.5941, + "step": 318 + }, + { + "epoch": 0.52, + "grad_norm": 0.26092806458473206, + "learning_rate": 0.0002233884297520661, + "loss": 0.7851, + "step": 319 + }, + { + "epoch": 0.52, + "grad_norm": 0.23881889879703522, + "learning_rate": 0.00022314049586776857, + "loss": 0.598, + "step": 320 + }, + { + "epoch": 0.53, + "grad_norm": 0.23304526507854462, + "learning_rate": 0.00022289256198347105, + "loss": 0.7165, + "step": 321 + }, + { + "epoch": 0.53, + "grad_norm": 0.2340225875377655, + "learning_rate": 0.00022264462809917353, + "loss": 0.6608, + "step": 322 + }, + { + "epoch": 0.53, + "grad_norm": 0.31176140904426575, + "learning_rate": 0.00022239669421487602, + "loss": 0.6711, + "step": 323 + }, + { + "epoch": 0.53, + "grad_norm": 0.23832640051841736, + "learning_rate": 0.00022214876033057847, + "loss": 0.732, + "step": 324 + }, + { + "epoch": 0.53, + "grad_norm": 0.28845977783203125, + "learning_rate": 0.00022190082644628096, + "loss": 0.7968, + "step": 325 + }, + { + "epoch": 0.53, + "grad_norm": 0.1978536993265152, + "learning_rate": 0.00022165289256198344, + "loss": 0.6592, + "step": 326 + }, + { + "epoch": 0.54, + "grad_norm": 0.26940053701400757, + "learning_rate": 0.00022140495867768595, + "loss": 0.7953, + "step": 327 + }, + { + "epoch": 0.54, + "grad_norm": 0.20393389463424683, + "learning_rate": 0.00022115702479338844, + "loss": 0.4871, + "step": 328 + }, + { + "epoch": 0.54, + "grad_norm": 0.27152347564697266, + "learning_rate": 0.0002209090909090909, + "loss": 0.5583, + "step": 329 + }, + { + "epoch": 0.54, + "grad_norm": 0.2883144021034241, + "learning_rate": 0.00022066115702479338, + "loss": 0.6156, + "step": 330 + }, + { + "epoch": 0.54, + "grad_norm": 0.1987351030111313, + "learning_rate": 0.00022041322314049586, + "loss": 0.5196, + "step": 331 + }, + { + "epoch": 0.54, + "grad_norm": 0.2651583254337311, + "learning_rate": 0.00022016528925619834, + "loss": 0.6099, + "step": 332 + }, + { + "epoch": 0.55, + "grad_norm": 0.2574511468410492, + "learning_rate": 0.0002199173553719008, + "loss": 0.6925, + "step": 333 + }, + { + "epoch": 0.55, + "grad_norm": 0.27730292081832886, + "learning_rate": 0.00021966942148760328, + "loss": 0.6752, + "step": 334 + }, + { + "epoch": 0.55, + "grad_norm": 0.2001207172870636, + "learning_rate": 0.00021942148760330577, + "loss": 0.75, + "step": 335 + }, + { + "epoch": 0.55, + "grad_norm": 0.24222363531589508, + "learning_rate": 0.00021917355371900825, + "loss": 0.6364, + "step": 336 + }, + { + "epoch": 0.55, + "grad_norm": 0.26326724886894226, + "learning_rate": 0.0002189256198347107, + "loss": 0.673, + "step": 337 + }, + { + "epoch": 0.55, + "grad_norm": 0.2272881418466568, + "learning_rate": 0.0002186776859504132, + "loss": 0.561, + "step": 338 + }, + { + "epoch": 0.56, + "grad_norm": 0.24880024790763855, + "learning_rate": 0.00021842975206611567, + "loss": 0.5552, + "step": 339 + }, + { + "epoch": 0.56, + "grad_norm": 0.2593706548213959, + "learning_rate": 0.00021818181818181816, + "loss": 0.5417, + "step": 340 + }, + { + "epoch": 0.56, + "grad_norm": 0.19063642621040344, + "learning_rate": 0.00021793388429752067, + "loss": 0.5694, + "step": 341 + }, + { + "epoch": 0.56, + "grad_norm": 0.2146475464105606, + "learning_rate": 0.0002176859504132231, + "loss": 0.4314, + "step": 342 + }, + { + "epoch": 0.56, + "grad_norm": 0.25150927901268005, + "learning_rate": 0.0002174380165289256, + "loss": 0.631, + "step": 343 + }, + { + "epoch": 0.56, + "grad_norm": 0.2753889858722687, + "learning_rate": 0.0002171900826446281, + "loss": 0.6859, + "step": 344 + }, + { + "epoch": 0.56, + "grad_norm": 0.20773079991340637, + "learning_rate": 0.00021694214876033058, + "loss": 0.7515, + "step": 345 + }, + { + "epoch": 0.57, + "grad_norm": 0.2547062635421753, + "learning_rate": 0.00021669421487603303, + "loss": 0.7582, + "step": 346 + }, + { + "epoch": 0.57, + "grad_norm": 0.24687208235263824, + "learning_rate": 0.00021644628099173552, + "loss": 0.5865, + "step": 347 + }, + { + "epoch": 0.57, + "grad_norm": 0.24116279184818268, + "learning_rate": 0.000216198347107438, + "loss": 0.4841, + "step": 348 + }, + { + "epoch": 0.57, + "grad_norm": 0.2270282804965973, + "learning_rate": 0.00021595041322314048, + "loss": 0.5933, + "step": 349 + }, + { + "epoch": 0.57, + "grad_norm": 0.21436922252178192, + "learning_rate": 0.00021570247933884297, + "loss": 0.6959, + "step": 350 + }, + { + "epoch": 0.57, + "grad_norm": 0.25802701711654663, + "learning_rate": 0.00021545454545454542, + "loss": 0.729, + "step": 351 + }, + { + "epoch": 0.58, + "grad_norm": 0.23808260262012482, + "learning_rate": 0.0002152066115702479, + "loss": 0.6346, + "step": 352 + }, + { + "epoch": 0.58, + "grad_norm": 0.23161651194095612, + "learning_rate": 0.0002149586776859504, + "loss": 0.6459, + "step": 353 + }, + { + "epoch": 0.58, + "grad_norm": 0.2442287802696228, + "learning_rate": 0.00021471074380165287, + "loss": 0.6803, + "step": 354 + }, + { + "epoch": 0.58, + "grad_norm": 0.19150683283805847, + "learning_rate": 0.00021446280991735533, + "loss": 0.4375, + "step": 355 + }, + { + "epoch": 0.58, + "grad_norm": 0.23142127692699432, + "learning_rate": 0.00021421487603305781, + "loss": 0.5505, + "step": 356 + }, + { + "epoch": 0.58, + "grad_norm": 0.22447548806667328, + "learning_rate": 0.00021396694214876033, + "loss": 0.6368, + "step": 357 + }, + { + "epoch": 0.59, + "grad_norm": 0.25168758630752563, + "learning_rate": 0.0002137190082644628, + "loss": 0.6322, + "step": 358 + }, + { + "epoch": 0.59, + "grad_norm": 0.25538235902786255, + "learning_rate": 0.0002134710743801653, + "loss": 0.5317, + "step": 359 + }, + { + "epoch": 0.59, + "grad_norm": 0.2565425634384155, + "learning_rate": 0.00021322314049586775, + "loss": 0.6261, + "step": 360 + }, + { + "epoch": 0.59, + "grad_norm": 0.25399863719940186, + "learning_rate": 0.00021297520661157023, + "loss": 0.596, + "step": 361 + }, + { + "epoch": 0.59, + "grad_norm": 0.27143988013267517, + "learning_rate": 0.00021272727272727272, + "loss": 0.6691, + "step": 362 + }, + { + "epoch": 0.59, + "grad_norm": 0.2387736439704895, + "learning_rate": 0.0002124793388429752, + "loss": 0.5288, + "step": 363 + }, + { + "epoch": 0.6, + "grad_norm": 0.2549780607223511, + "learning_rate": 0.00021223140495867766, + "loss": 0.7455, + "step": 364 + }, + { + "epoch": 0.6, + "grad_norm": 0.2740858793258667, + "learning_rate": 0.00021198347107438014, + "loss": 0.4921, + "step": 365 + }, + { + "epoch": 0.6, + "grad_norm": 0.25273847579956055, + "learning_rate": 0.00021173553719008262, + "loss": 0.7965, + "step": 366 + }, + { + "epoch": 0.6, + "grad_norm": 0.25858959555625916, + "learning_rate": 0.0002114876033057851, + "loss": 0.7303, + "step": 367 + }, + { + "epoch": 0.6, + "grad_norm": 0.2599296271800995, + "learning_rate": 0.0002112396694214876, + "loss": 0.6342, + "step": 368 + }, + { + "epoch": 0.6, + "grad_norm": 0.21084599196910858, + "learning_rate": 0.00021099173553719005, + "loss": 0.633, + "step": 369 + }, + { + "epoch": 0.61, + "grad_norm": 0.24272632598876953, + "learning_rate": 0.00021074380165289253, + "loss": 0.6213, + "step": 370 + }, + { + "epoch": 0.61, + "grad_norm": 0.26323699951171875, + "learning_rate": 0.00021049586776859501, + "loss": 0.563, + "step": 371 + }, + { + "epoch": 0.61, + "grad_norm": 0.20646587014198303, + "learning_rate": 0.00021024793388429753, + "loss": 0.6248, + "step": 372 + }, + { + "epoch": 0.61, + "grad_norm": 0.21778297424316406, + "learning_rate": 0.00020999999999999998, + "loss": 0.7186, + "step": 373 + }, + { + "epoch": 0.61, + "grad_norm": 0.21315112709999084, + "learning_rate": 0.00020975206611570247, + "loss": 0.5961, + "step": 374 + }, + { + "epoch": 0.61, + "grad_norm": 0.20787106454372406, + "learning_rate": 0.00020950413223140495, + "loss": 0.5917, + "step": 375 + }, + { + "epoch": 0.62, + "grad_norm": 0.23541009426116943, + "learning_rate": 0.00020925619834710743, + "loss": 0.7803, + "step": 376 + }, + { + "epoch": 0.62, + "grad_norm": 0.22649626433849335, + "learning_rate": 0.00020900826446280992, + "loss": 0.5895, + "step": 377 + }, + { + "epoch": 0.62, + "grad_norm": 0.23644742369651794, + "learning_rate": 0.00020876033057851237, + "loss": 0.6656, + "step": 378 + }, + { + "epoch": 0.62, + "grad_norm": 0.22934262454509735, + "learning_rate": 0.00020851239669421486, + "loss": 0.5933, + "step": 379 + }, + { + "epoch": 0.62, + "grad_norm": 0.289989709854126, + "learning_rate": 0.00020826446280991734, + "loss": 0.6852, + "step": 380 + }, + { + "epoch": 0.62, + "grad_norm": 0.24489325284957886, + "learning_rate": 0.00020801652892561982, + "loss": 0.5546, + "step": 381 + }, + { + "epoch": 0.63, + "grad_norm": 0.27165278792381287, + "learning_rate": 0.00020776859504132228, + "loss": 0.6845, + "step": 382 + }, + { + "epoch": 0.63, + "grad_norm": 0.19467370212078094, + "learning_rate": 0.00020752066115702476, + "loss": 0.5587, + "step": 383 + }, + { + "epoch": 0.63, + "grad_norm": 0.27320200204849243, + "learning_rate": 0.00020727272727272725, + "loss": 0.7144, + "step": 384 + }, + { + "epoch": 0.63, + "grad_norm": 0.28100526332855225, + "learning_rate": 0.00020702479338842973, + "loss": 0.6914, + "step": 385 + }, + { + "epoch": 0.63, + "grad_norm": 0.3059975504875183, + "learning_rate": 0.0002067768595041322, + "loss": 0.6075, + "step": 386 + }, + { + "epoch": 0.63, + "grad_norm": 0.24904222786426544, + "learning_rate": 0.00020652892561983467, + "loss": 0.5543, + "step": 387 + }, + { + "epoch": 0.64, + "grad_norm": 0.24768255650997162, + "learning_rate": 0.00020628099173553718, + "loss": 0.607, + "step": 388 + }, + { + "epoch": 0.64, + "grad_norm": 0.25083738565444946, + "learning_rate": 0.00020603305785123967, + "loss": 0.7961, + "step": 389 + }, + { + "epoch": 0.64, + "grad_norm": 0.26338303089141846, + "learning_rate": 0.00020578512396694215, + "loss": 0.6467, + "step": 390 + }, + { + "epoch": 0.64, + "grad_norm": 0.25761598348617554, + "learning_rate": 0.0002055371900826446, + "loss": 0.5891, + "step": 391 + }, + { + "epoch": 0.64, + "grad_norm": 0.2616937756538391, + "learning_rate": 0.0002052892561983471, + "loss": 0.5706, + "step": 392 + }, + { + "epoch": 0.64, + "grad_norm": 0.18980839848518372, + "learning_rate": 0.00020504132231404957, + "loss": 0.4479, + "step": 393 + }, + { + "epoch": 0.65, + "grad_norm": 0.250431627035141, + "learning_rate": 0.00020479338842975206, + "loss": 0.6006, + "step": 394 + }, + { + "epoch": 0.65, + "grad_norm": 0.2146655172109604, + "learning_rate": 0.0002045454545454545, + "loss": 0.7113, + "step": 395 + }, + { + "epoch": 0.65, + "grad_norm": 0.2195209115743637, + "learning_rate": 0.000204297520661157, + "loss": 0.5354, + "step": 396 + }, + { + "epoch": 0.65, + "grad_norm": 0.24879257380962372, + "learning_rate": 0.00020404958677685948, + "loss": 0.5478, + "step": 397 + }, + { + "epoch": 0.65, + "grad_norm": 0.27159082889556885, + "learning_rate": 0.00020380165289256196, + "loss": 0.7681, + "step": 398 + }, + { + "epoch": 0.65, + "grad_norm": 0.20614947378635406, + "learning_rate": 0.00020355371900826445, + "loss": 0.6357, + "step": 399 + }, + { + "epoch": 0.65, + "grad_norm": 0.25690051913261414, + "learning_rate": 0.0002033057851239669, + "loss": 0.5731, + "step": 400 + }, + { + "epoch": 0.66, + "grad_norm": 0.24473583698272705, + "learning_rate": 0.0002030578512396694, + "loss": 0.6784, + "step": 401 + }, + { + "epoch": 0.66, + "grad_norm": 0.32395297288894653, + "learning_rate": 0.0002028099173553719, + "loss": 0.7118, + "step": 402 + }, + { + "epoch": 0.66, + "grad_norm": 0.2975274324417114, + "learning_rate": 0.00020256198347107438, + "loss": 0.6504, + "step": 403 + }, + { + "epoch": 0.66, + "grad_norm": 0.2652553915977478, + "learning_rate": 0.00020231404958677684, + "loss": 0.6986, + "step": 404 + }, + { + "epoch": 0.66, + "grad_norm": 0.29475778341293335, + "learning_rate": 0.00020206611570247932, + "loss": 0.6525, + "step": 405 + }, + { + "epoch": 0.66, + "grad_norm": 0.24549973011016846, + "learning_rate": 0.0002018181818181818, + "loss": 0.5408, + "step": 406 + }, + { + "epoch": 0.67, + "grad_norm": 0.2181435376405716, + "learning_rate": 0.0002015702479338843, + "loss": 0.6146, + "step": 407 + }, + { + "epoch": 0.67, + "grad_norm": 0.2682584226131439, + "learning_rate": 0.00020132231404958677, + "loss": 0.6368, + "step": 408 + }, + { + "epoch": 0.67, + "grad_norm": 0.2641114592552185, + "learning_rate": 0.00020107438016528923, + "loss": 0.51, + "step": 409 + }, + { + "epoch": 0.67, + "grad_norm": 0.27871838212013245, + "learning_rate": 0.0002008264462809917, + "loss": 0.7269, + "step": 410 + }, + { + "epoch": 0.67, + "grad_norm": 0.23890569806098938, + "learning_rate": 0.0002005785123966942, + "loss": 0.6444, + "step": 411 + }, + { + "epoch": 0.67, + "grad_norm": 0.2451583445072174, + "learning_rate": 0.00020033057851239668, + "loss": 0.5806, + "step": 412 + }, + { + "epoch": 0.68, + "grad_norm": 0.2743864953517914, + "learning_rate": 0.00020008264462809914, + "loss": 0.6305, + "step": 413 + }, + { + "epoch": 0.68, + "grad_norm": 0.2626914978027344, + "learning_rate": 0.00019983471074380162, + "loss": 0.5765, + "step": 414 + }, + { + "epoch": 0.68, + "grad_norm": 0.2874875068664551, + "learning_rate": 0.0001995867768595041, + "loss": 0.5928, + "step": 415 + }, + { + "epoch": 0.68, + "grad_norm": 0.30499163269996643, + "learning_rate": 0.00019933884297520661, + "loss": 0.6271, + "step": 416 + }, + { + "epoch": 0.68, + "grad_norm": 0.30474454164505005, + "learning_rate": 0.0001990909090909091, + "loss": 0.6755, + "step": 417 + }, + { + "epoch": 0.68, + "grad_norm": 0.1819755882024765, + "learning_rate": 0.00019884297520661155, + "loss": 0.394, + "step": 418 + }, + { + "epoch": 0.69, + "grad_norm": 0.25470343232154846, + "learning_rate": 0.00019859504132231404, + "loss": 0.7121, + "step": 419 + }, + { + "epoch": 0.69, + "grad_norm": 0.26749151945114136, + "learning_rate": 0.00019834710743801652, + "loss": 0.6487, + "step": 420 + }, + { + "epoch": 0.69, + "grad_norm": 0.20643912255764008, + "learning_rate": 0.000198099173553719, + "loss": 0.4585, + "step": 421 + }, + { + "epoch": 0.69, + "grad_norm": 0.2576930522918701, + "learning_rate": 0.00019785123966942146, + "loss": 0.5235, + "step": 422 + }, + { + "epoch": 0.69, + "grad_norm": 0.2899012863636017, + "learning_rate": 0.00019760330578512395, + "loss": 0.6292, + "step": 423 + }, + { + "epoch": 0.69, + "grad_norm": 0.2541065216064453, + "learning_rate": 0.00019735537190082643, + "loss": 0.648, + "step": 424 + }, + { + "epoch": 0.7, + "grad_norm": 0.24382047355175018, + "learning_rate": 0.0001971074380165289, + "loss": 0.5939, + "step": 425 + }, + { + "epoch": 0.7, + "grad_norm": 0.22931940853595734, + "learning_rate": 0.00019685950413223137, + "loss": 0.6812, + "step": 426 + }, + { + "epoch": 0.7, + "grad_norm": 0.2592567205429077, + "learning_rate": 0.00019661157024793385, + "loss": 0.69, + "step": 427 + }, + { + "epoch": 0.7, + "grad_norm": 0.2516980767250061, + "learning_rate": 0.00019636363636363634, + "loss": 0.5707, + "step": 428 + }, + { + "epoch": 0.7, + "grad_norm": 0.23515059053897858, + "learning_rate": 0.00019611570247933882, + "loss": 0.6739, + "step": 429 + }, + { + "epoch": 0.7, + "grad_norm": 0.24742184579372406, + "learning_rate": 0.00019586776859504133, + "loss": 0.6761, + "step": 430 + }, + { + "epoch": 0.71, + "grad_norm": 0.26232922077178955, + "learning_rate": 0.00019561983471074376, + "loss": 0.7071, + "step": 431 + }, + { + "epoch": 0.71, + "grad_norm": 0.2853042781352997, + "learning_rate": 0.00019537190082644627, + "loss": 0.7667, + "step": 432 + }, + { + "epoch": 0.71, + "grad_norm": 0.251169353723526, + "learning_rate": 0.00019512396694214875, + "loss": 0.6518, + "step": 433 + }, + { + "epoch": 0.71, + "grad_norm": 0.2321665734052658, + "learning_rate": 0.00019487603305785124, + "loss": 0.4377, + "step": 434 + }, + { + "epoch": 0.71, + "grad_norm": 0.25216928124427795, + "learning_rate": 0.0001946280991735537, + "loss": 0.7173, + "step": 435 + }, + { + "epoch": 0.71, + "grad_norm": 0.19498330354690552, + "learning_rate": 0.00019438016528925618, + "loss": 0.5584, + "step": 436 + }, + { + "epoch": 0.72, + "grad_norm": 0.32786309719085693, + "learning_rate": 0.00019413223140495866, + "loss": 0.6583, + "step": 437 + }, + { + "epoch": 0.72, + "grad_norm": 0.25834760069847107, + "learning_rate": 0.00019388429752066115, + "loss": 0.4957, + "step": 438 + }, + { + "epoch": 0.72, + "grad_norm": 0.3462083041667938, + "learning_rate": 0.00019363636363636363, + "loss": 0.5205, + "step": 439 + }, + { + "epoch": 0.72, + "grad_norm": 0.27106693387031555, + "learning_rate": 0.00019338842975206609, + "loss": 0.6803, + "step": 440 + }, + { + "epoch": 0.72, + "grad_norm": 0.28165388107299805, + "learning_rate": 0.00019314049586776857, + "loss": 0.7049, + "step": 441 + }, + { + "epoch": 0.72, + "grad_norm": 0.20732273161411285, + "learning_rate": 0.00019289256198347105, + "loss": 0.6407, + "step": 442 + }, + { + "epoch": 0.73, + "grad_norm": 0.2609116733074188, + "learning_rate": 0.00019264462809917354, + "loss": 0.5377, + "step": 443 + }, + { + "epoch": 0.73, + "grad_norm": 0.2561998963356018, + "learning_rate": 0.000192396694214876, + "loss": 0.6212, + "step": 444 + }, + { + "epoch": 0.73, + "grad_norm": 0.27699044346809387, + "learning_rate": 0.00019214876033057848, + "loss": 0.5482, + "step": 445 + }, + { + "epoch": 0.73, + "grad_norm": 0.2426328808069229, + "learning_rate": 0.000191900826446281, + "loss": 0.6444, + "step": 446 + }, + { + "epoch": 0.73, + "grad_norm": 0.26187026500701904, + "learning_rate": 0.00019165289256198347, + "loss": 0.5443, + "step": 447 + }, + { + "epoch": 0.73, + "grad_norm": 0.2719630002975464, + "learning_rate": 0.00019140495867768595, + "loss": 0.6886, + "step": 448 + }, + { + "epoch": 0.74, + "grad_norm": 0.18477971851825714, + "learning_rate": 0.0001911570247933884, + "loss": 0.5292, + "step": 449 + }, + { + "epoch": 0.74, + "grad_norm": 0.2144313007593155, + "learning_rate": 0.0001909090909090909, + "loss": 0.4613, + "step": 450 + }, + { + "epoch": 0.74, + "grad_norm": 0.2580784857273102, + "learning_rate": 0.00019066115702479338, + "loss": 0.5606, + "step": 451 + }, + { + "epoch": 0.74, + "grad_norm": 0.3073588013648987, + "learning_rate": 0.00019041322314049586, + "loss": 0.6123, + "step": 452 + }, + { + "epoch": 0.74, + "grad_norm": 0.21787844598293304, + "learning_rate": 0.00019016528925619832, + "loss": 0.5939, + "step": 453 + }, + { + "epoch": 0.74, + "grad_norm": 0.255750447511673, + "learning_rate": 0.0001899173553719008, + "loss": 0.5739, + "step": 454 + }, + { + "epoch": 0.74, + "grad_norm": 0.24147820472717285, + "learning_rate": 0.00018966942148760329, + "loss": 0.6026, + "step": 455 + }, + { + "epoch": 0.75, + "grad_norm": 0.26172590255737305, + "learning_rate": 0.00018942148760330577, + "loss": 0.5166, + "step": 456 + }, + { + "epoch": 0.75, + "grad_norm": 0.2710455358028412, + "learning_rate": 0.00018917355371900825, + "loss": 0.6429, + "step": 457 + }, + { + "epoch": 0.75, + "grad_norm": 0.1971074640750885, + "learning_rate": 0.0001889256198347107, + "loss": 0.4799, + "step": 458 + }, + { + "epoch": 0.75, + "grad_norm": 0.23394368588924408, + "learning_rate": 0.0001886776859504132, + "loss": 0.5491, + "step": 459 + }, + { + "epoch": 0.75, + "grad_norm": 0.22820048034191132, + "learning_rate": 0.0001884297520661157, + "loss": 0.5343, + "step": 460 + }, + { + "epoch": 0.75, + "grad_norm": 0.23169974982738495, + "learning_rate": 0.0001881818181818182, + "loss": 0.5852, + "step": 461 + }, + { + "epoch": 0.76, + "grad_norm": 0.24015003442764282, + "learning_rate": 0.00018793388429752064, + "loss": 0.6209, + "step": 462 + }, + { + "epoch": 0.76, + "grad_norm": 0.2230776697397232, + "learning_rate": 0.00018768595041322313, + "loss": 0.6296, + "step": 463 + }, + { + "epoch": 0.76, + "grad_norm": 0.2518354654312134, + "learning_rate": 0.0001874380165289256, + "loss": 0.6167, + "step": 464 + }, + { + "epoch": 0.76, + "grad_norm": 0.338256299495697, + "learning_rate": 0.0001871900826446281, + "loss": 0.6512, + "step": 465 + }, + { + "epoch": 0.76, + "grad_norm": 0.23796728253364563, + "learning_rate": 0.00018694214876033055, + "loss": 0.8155, + "step": 466 + }, + { + "epoch": 0.76, + "grad_norm": 0.31516361236572266, + "learning_rate": 0.00018669421487603303, + "loss": 0.8023, + "step": 467 + }, + { + "epoch": 0.77, + "grad_norm": 0.2371574491262436, + "learning_rate": 0.00018644628099173552, + "loss": 0.5613, + "step": 468 + }, + { + "epoch": 0.77, + "grad_norm": 0.2822033762931824, + "learning_rate": 0.000186198347107438, + "loss": 0.5549, + "step": 469 + }, + { + "epoch": 0.77, + "grad_norm": 0.25953295826911926, + "learning_rate": 0.00018595041322314049, + "loss": 0.6199, + "step": 470 + }, + { + "epoch": 0.77, + "grad_norm": 0.2478639930486679, + "learning_rate": 0.00018570247933884294, + "loss": 0.5806, + "step": 471 + }, + { + "epoch": 0.77, + "grad_norm": 0.2439350187778473, + "learning_rate": 0.00018545454545454543, + "loss": 0.6222, + "step": 472 + }, + { + "epoch": 0.77, + "grad_norm": 0.24993474781513214, + "learning_rate": 0.0001852066115702479, + "loss": 0.6048, + "step": 473 + }, + { + "epoch": 0.78, + "grad_norm": 0.24781496822834015, + "learning_rate": 0.00018495867768595042, + "loss": 0.5941, + "step": 474 + }, + { + "epoch": 0.78, + "grad_norm": 0.1847202032804489, + "learning_rate": 0.00018471074380165285, + "loss": 0.609, + "step": 475 + }, + { + "epoch": 0.78, + "grad_norm": 0.21596528589725494, + "learning_rate": 0.00018446280991735536, + "loss": 0.4457, + "step": 476 + }, + { + "epoch": 0.78, + "grad_norm": 0.240879625082016, + "learning_rate": 0.00018421487603305784, + "loss": 0.6118, + "step": 477 + }, + { + "epoch": 0.78, + "grad_norm": 0.2898111641407013, + "learning_rate": 0.00018396694214876033, + "loss": 0.7725, + "step": 478 + }, + { + "epoch": 0.78, + "grad_norm": 0.27428382635116577, + "learning_rate": 0.0001837190082644628, + "loss": 0.5366, + "step": 479 + }, + { + "epoch": 0.79, + "grad_norm": 0.23467296361923218, + "learning_rate": 0.00018347107438016527, + "loss": 0.6018, + "step": 480 + }, + { + "epoch": 0.79, + "grad_norm": 0.2190561592578888, + "learning_rate": 0.00018322314049586775, + "loss": 0.5249, + "step": 481 + }, + { + "epoch": 0.79, + "grad_norm": 0.2240625023841858, + "learning_rate": 0.00018297520661157024, + "loss": 0.6891, + "step": 482 + }, + { + "epoch": 0.79, + "grad_norm": 0.24726848304271698, + "learning_rate": 0.00018272727272727272, + "loss": 0.5545, + "step": 483 + }, + { + "epoch": 0.79, + "grad_norm": 0.3318251371383667, + "learning_rate": 0.00018247933884297518, + "loss": 0.4809, + "step": 484 + }, + { + "epoch": 0.79, + "grad_norm": 0.2396695613861084, + "learning_rate": 0.00018223140495867766, + "loss": 0.4942, + "step": 485 + }, + { + "epoch": 0.8, + "grad_norm": 0.25009942054748535, + "learning_rate": 0.00018198347107438014, + "loss": 0.7381, + "step": 486 + }, + { + "epoch": 0.8, + "grad_norm": 0.22655311226844788, + "learning_rate": 0.00018173553719008263, + "loss": 0.4729, + "step": 487 + }, + { + "epoch": 0.8, + "grad_norm": 0.23187695443630219, + "learning_rate": 0.0001814876033057851, + "loss": 0.5719, + "step": 488 + }, + { + "epoch": 0.8, + "grad_norm": 0.2703653573989868, + "learning_rate": 0.00018123966942148757, + "loss": 0.6031, + "step": 489 + }, + { + "epoch": 0.8, + "grad_norm": 0.2207796424627304, + "learning_rate": 0.00018099173553719008, + "loss": 0.5361, + "step": 490 + }, + { + "epoch": 0.8, + "grad_norm": 0.24914169311523438, + "learning_rate": 0.00018074380165289256, + "loss": 0.6547, + "step": 491 + }, + { + "epoch": 0.81, + "grad_norm": 0.2714746594429016, + "learning_rate": 0.00018049586776859504, + "loss": 0.5702, + "step": 492 + }, + { + "epoch": 0.81, + "grad_norm": 0.3201580047607422, + "learning_rate": 0.0001802479338842975, + "loss": 0.6119, + "step": 493 + }, + { + "epoch": 0.81, + "grad_norm": 0.2548397183418274, + "learning_rate": 0.00017999999999999998, + "loss": 0.5251, + "step": 494 + }, + { + "epoch": 0.81, + "grad_norm": 0.28669115900993347, + "learning_rate": 0.00017975206611570247, + "loss": 0.5773, + "step": 495 + }, + { + "epoch": 0.81, + "grad_norm": 0.26253971457481384, + "learning_rate": 0.00017950413223140495, + "loss": 0.6504, + "step": 496 + }, + { + "epoch": 0.81, + "grad_norm": 0.22113384306430817, + "learning_rate": 0.00017925619834710744, + "loss": 0.4741, + "step": 497 + }, + { + "epoch": 0.82, + "grad_norm": 0.261636346578598, + "learning_rate": 0.0001790082644628099, + "loss": 0.6241, + "step": 498 + }, + { + "epoch": 0.82, + "grad_norm": 0.1780402809381485, + "learning_rate": 0.00017876033057851238, + "loss": 0.5207, + "step": 499 + }, + { + "epoch": 0.82, + "grad_norm": 0.26149195432662964, + "learning_rate": 0.00017851239669421486, + "loss": 0.5872, + "step": 500 + }, + { + "epoch": 0.82, + "grad_norm": 0.26113009452819824, + "learning_rate": 0.00017826446280991734, + "loss": 0.6163, + "step": 501 + }, + { + "epoch": 0.82, + "grad_norm": 0.21397502720355988, + "learning_rate": 0.0001780165289256198, + "loss": 0.479, + "step": 502 + }, + { + "epoch": 0.82, + "grad_norm": 0.21250088512897491, + "learning_rate": 0.00017776859504132228, + "loss": 0.6978, + "step": 503 + }, + { + "epoch": 0.83, + "grad_norm": 0.2556426525115967, + "learning_rate": 0.00017752066115702477, + "loss": 0.6128, + "step": 504 + }, + { + "epoch": 0.83, + "grad_norm": 0.24139715731143951, + "learning_rate": 0.00017727272727272728, + "loss": 0.5066, + "step": 505 + }, + { + "epoch": 0.83, + "grad_norm": 0.23671215772628784, + "learning_rate": 0.00017702479338842976, + "loss": 0.5183, + "step": 506 + }, + { + "epoch": 0.83, + "grad_norm": 0.23494285345077515, + "learning_rate": 0.00017677685950413222, + "loss": 0.5181, + "step": 507 + }, + { + "epoch": 0.83, + "grad_norm": 0.2547609806060791, + "learning_rate": 0.0001765289256198347, + "loss": 0.5406, + "step": 508 + }, + { + "epoch": 0.83, + "grad_norm": 0.3042651414871216, + "learning_rate": 0.00017628099173553718, + "loss": 0.5551, + "step": 509 + }, + { + "epoch": 0.84, + "grad_norm": 0.22910748422145844, + "learning_rate": 0.00017603305785123967, + "loss": 0.6373, + "step": 510 + }, + { + "epoch": 0.84, + "grad_norm": 0.19777967035770416, + "learning_rate": 0.00017578512396694212, + "loss": 0.5471, + "step": 511 + }, + { + "epoch": 0.84, + "grad_norm": 0.31034502387046814, + "learning_rate": 0.0001755371900826446, + "loss": 0.7017, + "step": 512 + }, + { + "epoch": 0.84, + "grad_norm": 0.3504410684108734, + "learning_rate": 0.0001752892561983471, + "loss": 0.7208, + "step": 513 + }, + { + "epoch": 0.84, + "grad_norm": 0.24271292984485626, + "learning_rate": 0.00017504132231404958, + "loss": 0.5563, + "step": 514 + }, + { + "epoch": 0.84, + "grad_norm": 0.27147865295410156, + "learning_rate": 0.00017479338842975203, + "loss": 0.5869, + "step": 515 + }, + { + "epoch": 0.84, + "grad_norm": 0.2976628839969635, + "learning_rate": 0.00017454545454545452, + "loss": 0.5471, + "step": 516 + }, + { + "epoch": 0.85, + "grad_norm": 0.28489646315574646, + "learning_rate": 0.000174297520661157, + "loss": 0.6053, + "step": 517 + }, + { + "epoch": 0.85, + "grad_norm": 0.30020108819007874, + "learning_rate": 0.00017404958677685948, + "loss": 0.6178, + "step": 518 + }, + { + "epoch": 0.85, + "grad_norm": 0.23986253142356873, + "learning_rate": 0.000173801652892562, + "loss": 0.5896, + "step": 519 + }, + { + "epoch": 0.85, + "grad_norm": 0.2667832374572754, + "learning_rate": 0.00017355371900826442, + "loss": 0.5375, + "step": 520 + }, + { + "epoch": 0.85, + "grad_norm": 0.22176356613636017, + "learning_rate": 0.00017330578512396693, + "loss": 0.5723, + "step": 521 + }, + { + "epoch": 0.85, + "grad_norm": 0.263257771730423, + "learning_rate": 0.00017305785123966942, + "loss": 0.7317, + "step": 522 + }, + { + "epoch": 0.86, + "grad_norm": 0.24838753044605255, + "learning_rate": 0.0001728099173553719, + "loss": 0.5849, + "step": 523 + }, + { + "epoch": 0.86, + "grad_norm": 0.24839664995670319, + "learning_rate": 0.00017256198347107436, + "loss": 0.6678, + "step": 524 + }, + { + "epoch": 0.86, + "grad_norm": 0.2849573493003845, + "learning_rate": 0.00017231404958677684, + "loss": 0.7144, + "step": 525 + }, + { + "epoch": 0.86, + "grad_norm": 0.26900768280029297, + "learning_rate": 0.00017206611570247932, + "loss": 0.5156, + "step": 526 + }, + { + "epoch": 0.86, + "grad_norm": 0.2212425172328949, + "learning_rate": 0.0001718181818181818, + "loss": 0.4551, + "step": 527 + }, + { + "epoch": 0.86, + "grad_norm": 0.2066129595041275, + "learning_rate": 0.0001715702479338843, + "loss": 0.4193, + "step": 528 + }, + { + "epoch": 0.87, + "grad_norm": 0.2838365137577057, + "learning_rate": 0.00017132231404958675, + "loss": 0.6078, + "step": 529 + }, + { + "epoch": 0.87, + "grad_norm": 0.239679753780365, + "learning_rate": 0.00017107438016528923, + "loss": 0.616, + "step": 530 + }, + { + "epoch": 0.87, + "grad_norm": 0.23269398510456085, + "learning_rate": 0.00017082644628099172, + "loss": 0.542, + "step": 531 + }, + { + "epoch": 0.87, + "grad_norm": 0.23838558793067932, + "learning_rate": 0.0001705785123966942, + "loss": 0.5147, + "step": 532 + }, + { + "epoch": 0.87, + "grad_norm": 0.2819415330886841, + "learning_rate": 0.00017033057851239666, + "loss": 0.6437, + "step": 533 + }, + { + "epoch": 0.87, + "grad_norm": 0.243398055434227, + "learning_rate": 0.00017008264462809914, + "loss": 0.6611, + "step": 534 + }, + { + "epoch": 0.88, + "grad_norm": 0.22569122910499573, + "learning_rate": 0.00016983471074380165, + "loss": 0.3979, + "step": 535 + }, + { + "epoch": 0.88, + "grad_norm": 0.33265820145606995, + "learning_rate": 0.00016958677685950413, + "loss": 0.6005, + "step": 536 + }, + { + "epoch": 0.88, + "grad_norm": 0.26828673481941223, + "learning_rate": 0.00016933884297520662, + "loss": 0.608, + "step": 537 + }, + { + "epoch": 0.88, + "grad_norm": 0.24439513683319092, + "learning_rate": 0.00016909090909090907, + "loss": 0.5572, + "step": 538 + }, + { + "epoch": 0.88, + "grad_norm": 0.22491876780986786, + "learning_rate": 0.00016884297520661156, + "loss": 0.7226, + "step": 539 + }, + { + "epoch": 0.88, + "grad_norm": 0.24468480050563812, + "learning_rate": 0.00016859504132231404, + "loss": 0.4582, + "step": 540 + }, + { + "epoch": 0.89, + "grad_norm": 0.23392945528030396, + "learning_rate": 0.00016834710743801652, + "loss": 0.6477, + "step": 541 + }, + { + "epoch": 0.89, + "grad_norm": 0.27548858523368835, + "learning_rate": 0.00016809917355371898, + "loss": 0.5846, + "step": 542 + }, + { + "epoch": 0.89, + "grad_norm": 0.2861180603504181, + "learning_rate": 0.00016785123966942146, + "loss": 0.6412, + "step": 543 + }, + { + "epoch": 0.89, + "grad_norm": 0.24700766801834106, + "learning_rate": 0.00016760330578512395, + "loss": 0.6947, + "step": 544 + }, + { + "epoch": 0.89, + "grad_norm": 0.2600953280925751, + "learning_rate": 0.00016735537190082643, + "loss": 0.6165, + "step": 545 + }, + { + "epoch": 0.89, + "grad_norm": 0.26876646280288696, + "learning_rate": 0.00016710743801652892, + "loss": 0.6855, + "step": 546 + }, + { + "epoch": 0.9, + "grad_norm": 0.26161080598831177, + "learning_rate": 0.00016685950413223137, + "loss": 0.5066, + "step": 547 + }, + { + "epoch": 0.9, + "grad_norm": 0.25190046429634094, + "learning_rate": 0.00016661157024793386, + "loss": 0.5902, + "step": 548 + }, + { + "epoch": 0.9, + "grad_norm": 0.25269225239753723, + "learning_rate": 0.00016636363636363637, + "loss": 0.7017, + "step": 549 + }, + { + "epoch": 0.9, + "grad_norm": 0.28042706847190857, + "learning_rate": 0.00016611570247933885, + "loss": 0.6264, + "step": 550 + }, + { + "epoch": 0.9, + "grad_norm": 0.2767360508441925, + "learning_rate": 0.0001658677685950413, + "loss": 0.7562, + "step": 551 + }, + { + "epoch": 0.9, + "grad_norm": 0.2771216034889221, + "learning_rate": 0.0001656198347107438, + "loss": 0.5333, + "step": 552 + }, + { + "epoch": 0.91, + "grad_norm": 0.189210906624794, + "learning_rate": 0.00016537190082644627, + "loss": 0.5378, + "step": 553 + }, + { + "epoch": 0.91, + "grad_norm": 0.22517065703868866, + "learning_rate": 0.00016512396694214876, + "loss": 0.5292, + "step": 554 + }, + { + "epoch": 0.91, + "grad_norm": 0.2390165776014328, + "learning_rate": 0.00016487603305785121, + "loss": 0.4407, + "step": 555 + }, + { + "epoch": 0.91, + "grad_norm": 0.21548262238502502, + "learning_rate": 0.0001646280991735537, + "loss": 0.4504, + "step": 556 + }, + { + "epoch": 0.91, + "grad_norm": 0.20831167697906494, + "learning_rate": 0.00016438016528925618, + "loss": 0.6848, + "step": 557 + }, + { + "epoch": 0.91, + "grad_norm": 0.271257609128952, + "learning_rate": 0.00016413223140495866, + "loss": 0.535, + "step": 558 + }, + { + "epoch": 0.92, + "grad_norm": 0.32008254528045654, + "learning_rate": 0.00016388429752066115, + "loss": 0.5107, + "step": 559 + }, + { + "epoch": 0.92, + "grad_norm": 0.34058302640914917, + "learning_rate": 0.0001636363636363636, + "loss": 0.5708, + "step": 560 + }, + { + "epoch": 0.92, + "grad_norm": 0.28070059418678284, + "learning_rate": 0.0001633884297520661, + "loss": 0.5086, + "step": 561 + }, + { + "epoch": 0.92, + "grad_norm": 0.25487688183784485, + "learning_rate": 0.00016314049586776857, + "loss": 0.5184, + "step": 562 + }, + { + "epoch": 0.92, + "grad_norm": 0.3240332007408142, + "learning_rate": 0.00016289256198347108, + "loss": 0.6774, + "step": 563 + }, + { + "epoch": 0.92, + "grad_norm": 0.30744409561157227, + "learning_rate": 0.0001626446280991735, + "loss": 0.5314, + "step": 564 + }, + { + "epoch": 0.93, + "grad_norm": 0.25220754742622375, + "learning_rate": 0.00016239669421487602, + "loss": 0.6308, + "step": 565 + }, + { + "epoch": 0.93, + "grad_norm": 0.29116958379745483, + "learning_rate": 0.0001621487603305785, + "loss": 0.5685, + "step": 566 + }, + { + "epoch": 0.93, + "grad_norm": 0.23250073194503784, + "learning_rate": 0.000161900826446281, + "loss": 0.4318, + "step": 567 + }, + { + "epoch": 0.93, + "grad_norm": 0.2808091640472412, + "learning_rate": 0.00016165289256198347, + "loss": 0.6313, + "step": 568 + }, + { + "epoch": 0.93, + "grad_norm": 0.2711193561553955, + "learning_rate": 0.00016140495867768593, + "loss": 0.4651, + "step": 569 + }, + { + "epoch": 0.93, + "grad_norm": 0.29540935158729553, + "learning_rate": 0.00016115702479338841, + "loss": 0.6663, + "step": 570 + }, + { + "epoch": 0.93, + "grad_norm": 0.23418714106082916, + "learning_rate": 0.0001609090909090909, + "loss": 0.448, + "step": 571 + }, + { + "epoch": 0.94, + "grad_norm": 0.21675793826580048, + "learning_rate": 0.00016066115702479338, + "loss": 0.5034, + "step": 572 + }, + { + "epoch": 0.94, + "grad_norm": 0.22451865673065186, + "learning_rate": 0.00016041322314049584, + "loss": 0.4476, + "step": 573 + }, + { + "epoch": 0.94, + "grad_norm": 0.26300856471061707, + "learning_rate": 0.00016016528925619832, + "loss": 0.6646, + "step": 574 + }, + { + "epoch": 0.94, + "grad_norm": 0.3377116918563843, + "learning_rate": 0.0001599173553719008, + "loss": 0.6029, + "step": 575 + }, + { + "epoch": 0.94, + "grad_norm": 0.23391880095005035, + "learning_rate": 0.0001596694214876033, + "loss": 0.6277, + "step": 576 + }, + { + "epoch": 0.94, + "grad_norm": 0.19620922207832336, + "learning_rate": 0.0001594214876033058, + "loss": 0.4638, + "step": 577 + }, + { + "epoch": 0.95, + "grad_norm": 0.22981096804141998, + "learning_rate": 0.00015917355371900823, + "loss": 0.5826, + "step": 578 + }, + { + "epoch": 0.95, + "grad_norm": 0.34321555495262146, + "learning_rate": 0.00015892561983471074, + "loss": 0.5618, + "step": 579 + }, + { + "epoch": 0.95, + "grad_norm": 0.28461968898773193, + "learning_rate": 0.00015867768595041322, + "loss": 0.5129, + "step": 580 + }, + { + "epoch": 0.95, + "grad_norm": 0.24368269741535187, + "learning_rate": 0.0001584297520661157, + "loss": 0.5866, + "step": 581 + }, + { + "epoch": 0.95, + "grad_norm": 0.282255083322525, + "learning_rate": 0.00015818181818181816, + "loss": 0.6274, + "step": 582 + }, + { + "epoch": 0.95, + "grad_norm": 0.26298072934150696, + "learning_rate": 0.00015793388429752065, + "loss": 0.5187, + "step": 583 + }, + { + "epoch": 0.96, + "grad_norm": 0.2671455144882202, + "learning_rate": 0.00015768595041322313, + "loss": 0.6878, + "step": 584 + }, + { + "epoch": 0.96, + "grad_norm": 0.2681390643119812, + "learning_rate": 0.00015743801652892561, + "loss": 0.5469, + "step": 585 + }, + { + "epoch": 0.96, + "grad_norm": 0.38484248518943787, + "learning_rate": 0.0001571900826446281, + "loss": 0.6364, + "step": 586 + }, + { + "epoch": 0.96, + "grad_norm": 0.23353587090969086, + "learning_rate": 0.00015694214876033055, + "loss": 0.4844, + "step": 587 + }, + { + "epoch": 0.96, + "grad_norm": 0.29452502727508545, + "learning_rate": 0.00015669421487603304, + "loss": 0.5059, + "step": 588 + }, + { + "epoch": 0.96, + "grad_norm": 0.2460879236459732, + "learning_rate": 0.00015644628099173552, + "loss": 0.6495, + "step": 589 + }, + { + "epoch": 0.97, + "grad_norm": 0.30693721771240234, + "learning_rate": 0.000156198347107438, + "loss": 0.5165, + "step": 590 + }, + { + "epoch": 0.97, + "grad_norm": 0.2171495109796524, + "learning_rate": 0.00015595041322314046, + "loss": 0.6172, + "step": 591 + }, + { + "epoch": 0.97, + "grad_norm": 0.24301984906196594, + "learning_rate": 0.00015570247933884294, + "loss": 0.6786, + "step": 592 + }, + { + "epoch": 0.97, + "grad_norm": 0.2288222461938858, + "learning_rate": 0.00015545454545454546, + "loss": 0.5669, + "step": 593 + }, + { + "epoch": 0.97, + "grad_norm": 0.2407921552658081, + "learning_rate": 0.00015520661157024794, + "loss": 0.5968, + "step": 594 + }, + { + "epoch": 0.97, + "grad_norm": 0.2591527998447418, + "learning_rate": 0.0001549586776859504, + "loss": 0.544, + "step": 595 + }, + { + "epoch": 0.98, + "grad_norm": 0.25770679116249084, + "learning_rate": 0.00015471074380165288, + "loss": 0.7177, + "step": 596 + }, + { + "epoch": 0.98, + "grad_norm": 0.2528848648071289, + "learning_rate": 0.00015446280991735536, + "loss": 0.4703, + "step": 597 + }, + { + "epoch": 0.98, + "grad_norm": 0.24993537366390228, + "learning_rate": 0.00015421487603305785, + "loss": 0.6003, + "step": 598 + }, + { + "epoch": 0.98, + "grad_norm": 0.25807908177375793, + "learning_rate": 0.00015396694214876033, + "loss": 0.465, + "step": 599 + }, + { + "epoch": 0.98, + "grad_norm": 0.3142452836036682, + "learning_rate": 0.0001537190082644628, + "loss": 0.6122, + "step": 600 + }, + { + "epoch": 0.98, + "grad_norm": 0.27111849188804626, + "learning_rate": 0.00015347107438016527, + "loss": 0.5962, + "step": 601 + }, + { + "epoch": 0.99, + "grad_norm": 0.28503674268722534, + "learning_rate": 0.00015322314049586775, + "loss": 0.6667, + "step": 602 + }, + { + "epoch": 0.99, + "grad_norm": 0.27074381709098816, + "learning_rate": 0.00015297520661157024, + "loss": 0.6115, + "step": 603 + }, + { + "epoch": 0.99, + "grad_norm": 0.25918465852737427, + "learning_rate": 0.0001527272727272727, + "loss": 0.4483, + "step": 604 + }, + { + "epoch": 0.99, + "grad_norm": 0.24476633965969086, + "learning_rate": 0.00015247933884297518, + "loss": 0.6501, + "step": 605 + }, + { + "epoch": 0.99, + "grad_norm": 0.21205200254917145, + "learning_rate": 0.00015223140495867766, + "loss": 0.3914, + "step": 606 + }, + { + "epoch": 0.99, + "grad_norm": 0.25496751070022583, + "learning_rate": 0.00015198347107438017, + "loss": 0.5335, + "step": 607 + }, + { + "epoch": 1.0, + "grad_norm": 0.27991780638694763, + "learning_rate": 0.00015173553719008266, + "loss": 0.6083, + "step": 608 + }, + { + "epoch": 1.0, + "grad_norm": 0.23995639383792877, + "learning_rate": 0.0001514876033057851, + "loss": 0.55, + "step": 609 + }, + { + "epoch": 1.0, + "grad_norm": 0.2349666953086853, + "learning_rate": 0.0001512396694214876, + "loss": 0.7054, + "step": 610 + } + ], + "logging_steps": 1, + "max_steps": 1220, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 610, + "total_flos": 1.313101299619971e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}