| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.997792494481236, | |
| "eval_steps": 50, | |
| "global_step": 2037, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.014716703458425313, | |
| "grad_norm": 3.226644655877531, | |
| "learning_rate": 4.901960784313725e-07, | |
| "loss": 0.4182, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.029433406916850625, | |
| "grad_norm": 2.129692195859408, | |
| "learning_rate": 9.80392156862745e-07, | |
| "loss": 0.389, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.04415011037527594, | |
| "grad_norm": 1.5861033073146842, | |
| "learning_rate": 1.4705882352941177e-06, | |
| "loss": 0.2876, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.05886681383370125, | |
| "grad_norm": 0.9864226661653924, | |
| "learning_rate": 1.96078431372549e-06, | |
| "loss": 0.1933, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.07358351729212656, | |
| "grad_norm": 0.8851816239940652, | |
| "learning_rate": 2.450980392156863e-06, | |
| "loss": 0.166, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.07358351729212656, | |
| "eval_loss": 0.1525491625070572, | |
| "eval_runtime": 216.1194, | |
| "eval_samples_per_second": 5.59, | |
| "eval_steps_per_second": 0.699, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.08830022075055188, | |
| "grad_norm": 0.8806004863473016, | |
| "learning_rate": 2.9411764705882355e-06, | |
| "loss": 0.1491, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.10301692420897719, | |
| "grad_norm": 1.0095005155732772, | |
| "learning_rate": 3.431372549019608e-06, | |
| "loss": 0.1444, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.1177336276674025, | |
| "grad_norm": 0.8222552861447616, | |
| "learning_rate": 3.92156862745098e-06, | |
| "loss": 0.1325, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.13245033112582782, | |
| "grad_norm": 0.8482175166475515, | |
| "learning_rate": 4.411764705882353e-06, | |
| "loss": 0.1249, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.14716703458425312, | |
| "grad_norm": 0.8916707135250133, | |
| "learning_rate": 4.901960784313726e-06, | |
| "loss": 0.1267, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.14716703458425312, | |
| "eval_loss": 0.12268291413784027, | |
| "eval_runtime": 206.0326, | |
| "eval_samples_per_second": 5.863, | |
| "eval_steps_per_second": 0.733, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16188373804267844, | |
| "grad_norm": 0.8391480398866726, | |
| "learning_rate": 5.392156862745098e-06, | |
| "loss": 0.1261, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.17660044150110377, | |
| "grad_norm": 0.8543855316305797, | |
| "learning_rate": 5.882352941176471e-06, | |
| "loss": 0.1239, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.19131714495952906, | |
| "grad_norm": 1.0426618599860231, | |
| "learning_rate": 6.372549019607843e-06, | |
| "loss": 0.1249, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.20603384841795438, | |
| "grad_norm": 0.7381326766253737, | |
| "learning_rate": 6.862745098039216e-06, | |
| "loss": 0.1161, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.22075055187637968, | |
| "grad_norm": 0.7710809135546592, | |
| "learning_rate": 7.352941176470589e-06, | |
| "loss": 0.1171, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.22075055187637968, | |
| "eval_loss": 0.11395128816366196, | |
| "eval_runtime": 173.8286, | |
| "eval_samples_per_second": 6.949, | |
| "eval_steps_per_second": 0.869, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.235467255334805, | |
| "grad_norm": 0.6891100266664143, | |
| "learning_rate": 7.84313725490196e-06, | |
| "loss": 0.1156, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.2501839587932303, | |
| "grad_norm": 0.8566371646933698, | |
| "learning_rate": 8.333333333333334e-06, | |
| "loss": 0.1123, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.26490066225165565, | |
| "grad_norm": 0.678987575471473, | |
| "learning_rate": 8.823529411764707e-06, | |
| "loss": 0.114, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.27961736571008095, | |
| "grad_norm": 0.7177541472393981, | |
| "learning_rate": 9.31372549019608e-06, | |
| "loss": 0.1144, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.29433406916850624, | |
| "grad_norm": 0.6069002401700933, | |
| "learning_rate": 9.803921568627451e-06, | |
| "loss": 0.1117, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.29433406916850624, | |
| "eval_loss": 0.1121131181716919, | |
| "eval_runtime": 204.0232, | |
| "eval_samples_per_second": 5.921, | |
| "eval_steps_per_second": 0.74, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.3090507726269316, | |
| "grad_norm": 0.6658587363100609, | |
| "learning_rate": 9.999735629192408e-06, | |
| "loss": 0.1207, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.3237674760853569, | |
| "grad_norm": 0.6216355033039211, | |
| "learning_rate": 9.99812013105419e-06, | |
| "loss": 0.1099, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.3384841795437822, | |
| "grad_norm": 0.6191227561051886, | |
| "learning_rate": 9.995036481411005e-06, | |
| "loss": 0.1099, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.35320088300220753, | |
| "grad_norm": 1.8888088694270877, | |
| "learning_rate": 9.990485586056381e-06, | |
| "loss": 0.1091, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.36791758646063283, | |
| "grad_norm": 0.6030722360970995, | |
| "learning_rate": 9.984468781773688e-06, | |
| "loss": 0.1089, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.36791758646063283, | |
| "eval_loss": 0.10794272273778915, | |
| "eval_runtime": 192.1395, | |
| "eval_samples_per_second": 6.287, | |
| "eval_steps_per_second": 0.786, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.3826342899190581, | |
| "grad_norm": 0.595437994630761, | |
| "learning_rate": 9.976987835943465e-06, | |
| "loss": 0.1059, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.3973509933774834, | |
| "grad_norm": 0.6619589566630248, | |
| "learning_rate": 9.968044946024277e-06, | |
| "loss": 0.113, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.41206769683590877, | |
| "grad_norm": 0.5476231049438186, | |
| "learning_rate": 9.957642738907226e-06, | |
| "loss": 0.1143, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.42678440029433407, | |
| "grad_norm": 0.5802953355038116, | |
| "learning_rate": 9.945784270144321e-06, | |
| "loss": 0.11, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.44150110375275936, | |
| "grad_norm": 0.5847953307046128, | |
| "learning_rate": 9.932473023050954e-06, | |
| "loss": 0.1048, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.44150110375275936, | |
| "eval_loss": 0.10326112061738968, | |
| "eval_runtime": 179.9326, | |
| "eval_samples_per_second": 6.714, | |
| "eval_steps_per_second": 0.839, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.4562178072111847, | |
| "grad_norm": 0.6275753190574224, | |
| "learning_rate": 9.917712907682694e-06, | |
| "loss": 0.1013, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.47093451066961, | |
| "grad_norm": 0.6431980899061217, | |
| "learning_rate": 9.901508259686746e-06, | |
| "loss": 0.1017, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.4856512141280353, | |
| "grad_norm": 0.5721037703631747, | |
| "learning_rate": 9.883863839028402e-06, | |
| "loss": 0.1099, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.5003679175864606, | |
| "grad_norm": 0.5487439214439007, | |
| "learning_rate": 9.864784828592842e-06, | |
| "loss": 0.0969, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.515084621044886, | |
| "grad_norm": 0.5817005922601163, | |
| "learning_rate": 9.844276832662704e-06, | |
| "loss": 0.0976, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.515084621044886, | |
| "eval_loss": 0.09972475469112396, | |
| "eval_runtime": 193.3575, | |
| "eval_samples_per_second": 6.247, | |
| "eval_steps_per_second": 0.781, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.5298013245033113, | |
| "grad_norm": 0.7409277848217514, | |
| "learning_rate": 9.822345875271884e-06, | |
| "loss": 0.1053, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.5445180279617365, | |
| "grad_norm": 0.6141304848014978, | |
| "learning_rate": 9.798998398436031e-06, | |
| "loss": 0.1028, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.5592347314201619, | |
| "grad_norm": 0.5607954946605025, | |
| "learning_rate": 9.774241260260266e-06, | |
| "loss": 0.1033, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.5739514348785872, | |
| "grad_norm": 0.5542868417397482, | |
| "learning_rate": 9.74808173292467e-06, | |
| "loss": 0.1037, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.5886681383370125, | |
| "grad_norm": 0.6069603969724401, | |
| "learning_rate": 9.720527500548155e-06, | |
| "loss": 0.0972, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5886681383370125, | |
| "eval_loss": 0.09853184223175049, | |
| "eval_runtime": 199.2969, | |
| "eval_samples_per_second": 6.061, | |
| "eval_steps_per_second": 0.758, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.6033848417954378, | |
| "grad_norm": 0.5133322169553051, | |
| "learning_rate": 9.691586656931326e-06, | |
| "loss": 0.1024, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.6181015452538632, | |
| "grad_norm": 0.5357388008964457, | |
| "learning_rate": 9.661267703178999e-06, | |
| "loss": 0.1033, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.6328182487122884, | |
| "grad_norm": 0.5215856861597291, | |
| "learning_rate": 9.629579545203076e-06, | |
| "loss": 0.0994, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.6475349521707138, | |
| "grad_norm": 0.5969857087876467, | |
| "learning_rate": 9.596531491106528e-06, | |
| "loss": 0.1019, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.6622516556291391, | |
| "grad_norm": 0.5231050382062306, | |
| "learning_rate": 9.56213324844921e-06, | |
| "loss": 0.0968, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.6622516556291391, | |
| "eval_loss": 0.09699959307909012, | |
| "eval_runtime": 190.5546, | |
| "eval_samples_per_second": 6.339, | |
| "eval_steps_per_second": 0.792, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.6769683590875644, | |
| "grad_norm": 0.5546434863895826, | |
| "learning_rate": 9.526394921396373e-06, | |
| "loss": 0.1026, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.6916850625459897, | |
| "grad_norm": 0.5168914632751676, | |
| "learning_rate": 9.489327007750644e-06, | |
| "loss": 0.1012, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.7064017660044151, | |
| "grad_norm": 0.5152122638926383, | |
| "learning_rate": 9.450940395868397e-06, | |
| "loss": 0.1013, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.7211184694628403, | |
| "grad_norm": 0.5104516201207467, | |
| "learning_rate": 9.41124636146141e-06, | |
| "loss": 0.0945, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.7358351729212657, | |
| "grad_norm": 0.5202984287068582, | |
| "learning_rate": 9.370256564284713e-06, | |
| "loss": 0.0967, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.7358351729212657, | |
| "eval_loss": 0.09431542456150055, | |
| "eval_runtime": 184.4167, | |
| "eval_samples_per_second": 6.55, | |
| "eval_steps_per_second": 0.819, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.7505518763796909, | |
| "grad_norm": 0.5032141555673829, | |
| "learning_rate": 9.327983044711655e-06, | |
| "loss": 0.0935, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.7652685798381162, | |
| "grad_norm": 0.5167615309062046, | |
| "learning_rate": 9.28443822019715e-06, | |
| "loss": 0.0981, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.7799852832965416, | |
| "grad_norm": 0.48598303739277543, | |
| "learning_rate": 9.239634881630162e-06, | |
| "loss": 0.0897, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.7947019867549668, | |
| "grad_norm": 0.53876383666863, | |
| "learning_rate": 9.19358618957651e-06, | |
| "loss": 0.0986, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.8094186902133922, | |
| "grad_norm": 0.49526243406348325, | |
| "learning_rate": 9.146305670413069e-06, | |
| "loss": 0.0879, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.8094186902133922, | |
| "eval_loss": 0.09373725950717926, | |
| "eval_runtime": 203.9941, | |
| "eval_samples_per_second": 5.922, | |
| "eval_steps_per_second": 0.74, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.8241353936718175, | |
| "grad_norm": 0.5723604640533689, | |
| "learning_rate": 9.097807212354513e-06, | |
| "loss": 0.0915, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.8388520971302428, | |
| "grad_norm": 0.567543105501399, | |
| "learning_rate": 9.048105061373793e-06, | |
| "loss": 0.0947, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.8535688005886681, | |
| "grad_norm": 0.5181347389812981, | |
| "learning_rate": 8.997213817017508e-06, | |
| "loss": 0.095, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.8682855040470935, | |
| "grad_norm": 0.4258862103531478, | |
| "learning_rate": 8.945148428117423e-06, | |
| "loss": 0.0917, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.8830022075055187, | |
| "grad_norm": 0.5739504951081847, | |
| "learning_rate": 8.891924188399395e-06, | |
| "loss": 0.1014, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.8830022075055187, | |
| "eval_loss": 0.09279368817806244, | |
| "eval_runtime": 174.7309, | |
| "eval_samples_per_second": 6.913, | |
| "eval_steps_per_second": 0.864, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.8977189109639441, | |
| "grad_norm": 0.5023401278687947, | |
| "learning_rate": 8.837556731990973e-06, | |
| "loss": 0.0977, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.9124356144223694, | |
| "grad_norm": 0.4472157776860558, | |
| "learning_rate": 8.782062028829028e-06, | |
| "loss": 0.0944, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.9271523178807947, | |
| "grad_norm": 0.5229751477277164, | |
| "learning_rate": 8.725456379968717e-06, | |
| "loss": 0.0894, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.94186902133922, | |
| "grad_norm": 0.540335952099867, | |
| "learning_rate": 8.667756412795217e-06, | |
| "loss": 0.0914, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.9565857247976454, | |
| "grad_norm": 0.5214096611567617, | |
| "learning_rate": 8.608979076139572e-06, | |
| "loss": 0.1026, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.9565857247976454, | |
| "eval_loss": 0.09049851447343826, | |
| "eval_runtime": 191.9453, | |
| "eval_samples_per_second": 6.293, | |
| "eval_steps_per_second": 0.787, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.9713024282560706, | |
| "grad_norm": 0.47553610942736374, | |
| "learning_rate": 8.549141635300135e-06, | |
| "loss": 0.0906, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.986019131714496, | |
| "grad_norm": 0.5432074308037707, | |
| "learning_rate": 8.488261666971047e-06, | |
| "loss": 0.0854, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.0007358351729212, | |
| "grad_norm": 0.5579816589630594, | |
| "learning_rate": 8.426357054079244e-06, | |
| "loss": 0.0923, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.0154525386313467, | |
| "grad_norm": 0.5140159523753607, | |
| "learning_rate": 8.363445980531515e-06, | |
| "loss": 0.0683, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.030169242089772, | |
| "grad_norm": 0.49111266471989273, | |
| "learning_rate": 8.299546925873148e-06, | |
| "loss": 0.0635, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.030169242089772, | |
| "eval_loss": 0.09157832711935043, | |
| "eval_runtime": 185.5584, | |
| "eval_samples_per_second": 6.51, | |
| "eval_steps_per_second": 0.814, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.0448859455481971, | |
| "grad_norm": 0.4650423339954392, | |
| "learning_rate": 8.234678659859729e-06, | |
| "loss": 0.0667, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.0596026490066226, | |
| "grad_norm": 0.5350038624215137, | |
| "learning_rate": 8.168860236943709e-06, | |
| "loss": 0.0692, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.0743193524650478, | |
| "grad_norm": 0.4137475767583062, | |
| "learning_rate": 8.102110990677328e-06, | |
| "loss": 0.0723, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.089036055923473, | |
| "grad_norm": 0.42028700866957225, | |
| "learning_rate": 8.034450528033565e-06, | |
| "loss": 0.066, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.1037527593818985, | |
| "grad_norm": 0.5321405562977654, | |
| "learning_rate": 7.965898723646777e-06, | |
| "loss": 0.0703, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.1037527593818985, | |
| "eval_loss": 0.08948411047458649, | |
| "eval_runtime": 184.8668, | |
| "eval_samples_per_second": 6.534, | |
| "eval_steps_per_second": 0.817, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.1184694628403238, | |
| "grad_norm": 0.4973824096134147, | |
| "learning_rate": 7.896475713974696e-06, | |
| "loss": 0.0667, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.133186166298749, | |
| "grad_norm": 0.5184687953265169, | |
| "learning_rate": 7.826201891383542e-06, | |
| "loss": 0.0721, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.1479028697571745, | |
| "grad_norm": 0.4182786077759931, | |
| "learning_rate": 7.755097898157957e-06, | |
| "loss": 0.0652, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.1626195732155997, | |
| "grad_norm": 0.5162298391916976, | |
| "learning_rate": 7.683184620437511e-06, | |
| "loss": 0.0715, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.177336276674025, | |
| "grad_norm": 0.41958696094652936, | |
| "learning_rate": 7.610483182081607e-06, | |
| "loss": 0.0699, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.177336276674025, | |
| "eval_loss": 0.08885398507118225, | |
| "eval_runtime": 198.9152, | |
| "eval_samples_per_second": 6.073, | |
| "eval_steps_per_second": 0.759, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.1920529801324504, | |
| "grad_norm": 0.4131639402362476, | |
| "learning_rate": 7.537014938464529e-06, | |
| "loss": 0.0679, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.2067696835908757, | |
| "grad_norm": 0.48371552023497083, | |
| "learning_rate": 7.462801470202513e-06, | |
| "loss": 0.0724, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.221486387049301, | |
| "grad_norm": 0.5028126635648151, | |
| "learning_rate": 7.387864576814628e-06, | |
| "loss": 0.065, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.2362030905077264, | |
| "grad_norm": 0.46008897965297035, | |
| "learning_rate": 7.31222627031938e-06, | |
| "loss": 0.0672, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.2509197939661516, | |
| "grad_norm": 0.3995351586970657, | |
| "learning_rate": 7.235908768768875e-06, | |
| "loss": 0.0655, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.2509197939661516, | |
| "eval_loss": 0.0898497924208641, | |
| "eval_runtime": 190.9254, | |
| "eval_samples_per_second": 6.327, | |
| "eval_steps_per_second": 0.791, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.2656364974245768, | |
| "grad_norm": 0.37529528925372135, | |
| "learning_rate": 7.1589344897224795e-06, | |
| "loss": 0.0696, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.280353200883002, | |
| "grad_norm": 0.5211153879452506, | |
| "learning_rate": 7.081326043661867e-06, | |
| "loss": 0.0671, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.2950699043414275, | |
| "grad_norm": 0.46585166367095826, | |
| "learning_rate": 7.003106227349399e-06, | |
| "loss": 0.0673, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.3097866077998528, | |
| "grad_norm": 0.49300557145854806, | |
| "learning_rate": 6.924298017131786e-06, | |
| "loss": 0.0664, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.3245033112582782, | |
| "grad_norm": 0.480260675255211, | |
| "learning_rate": 6.844924562191003e-06, | |
| "loss": 0.065, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.3245033112582782, | |
| "eval_loss": 0.08873660862445831, | |
| "eval_runtime": 206.5717, | |
| "eval_samples_per_second": 5.848, | |
| "eval_steps_per_second": 0.731, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.3392200147167035, | |
| "grad_norm": 0.4824688537300334, | |
| "learning_rate": 6.765009177744425e-06, | |
| "loss": 0.0704, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.3539367181751287, | |
| "grad_norm": 0.4415786568127757, | |
| "learning_rate": 6.6845753381961995e-06, | |
| "loss": 0.0654, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.368653421633554, | |
| "grad_norm": 0.5631526023299833, | |
| "learning_rate": 6.603646670241863e-06, | |
| "loss": 0.0663, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.3833701250919794, | |
| "grad_norm": 0.46084364060561317, | |
| "learning_rate": 6.522246945928214e-06, | |
| "loss": 0.0692, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.3980868285504047, | |
| "grad_norm": 0.5348577097898968, | |
| "learning_rate": 6.440400075670491e-06, | |
| "loss": 0.069, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.3980868285504047, | |
| "eval_loss": 0.08685711026191711, | |
| "eval_runtime": 177.7464, | |
| "eval_samples_per_second": 6.796, | |
| "eval_steps_per_second": 0.85, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.4128035320088301, | |
| "grad_norm": 0.5250790642687054, | |
| "learning_rate": 6.358130101228914e-06, | |
| "loss": 0.0702, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.4275202354672554, | |
| "grad_norm": 0.5047393202253249, | |
| "learning_rate": 6.275461188646641e-06, | |
| "loss": 0.0699, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.4422369389256806, | |
| "grad_norm": 0.48776704190164294, | |
| "learning_rate": 6.1924176211512145e-06, | |
| "loss": 0.0634, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.4569536423841059, | |
| "grad_norm": 0.49529594396564186, | |
| "learning_rate": 6.109023792021586e-06, | |
| "loss": 0.0667, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.4716703458425313, | |
| "grad_norm": 0.47438683295737333, | |
| "learning_rate": 6.025304197422819e-06, | |
| "loss": 0.0693, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.4716703458425313, | |
| "eval_loss": 0.08619654178619385, | |
| "eval_runtime": 181.9786, | |
| "eval_samples_per_second": 6.638, | |
| "eval_steps_per_second": 0.83, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.4863870493009566, | |
| "grad_norm": 0.47483465689550636, | |
| "learning_rate": 5.941283429210568e-06, | |
| "loss": 0.0659, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.501103752759382, | |
| "grad_norm": 0.4446944338196383, | |
| "learning_rate": 5.856986167707448e-06, | |
| "loss": 0.0638, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.5158204562178073, | |
| "grad_norm": 0.47714369154377795, | |
| "learning_rate": 5.772437174453418e-06, | |
| "loss": 0.0646, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.5305371596762325, | |
| "grad_norm": 0.4489337679674589, | |
| "learning_rate": 5.687661284932306e-06, | |
| "loss": 0.0644, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.5452538631346577, | |
| "grad_norm": 0.5494239982767725, | |
| "learning_rate": 5.6026834012766155e-06, | |
| "loss": 0.0648, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.5452538631346577, | |
| "eval_loss": 0.08584881573915482, | |
| "eval_runtime": 196.0939, | |
| "eval_samples_per_second": 6.16, | |
| "eval_steps_per_second": 0.77, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.5599705665930832, | |
| "grad_norm": 0.4324026964232888, | |
| "learning_rate": 5.5175284849527635e-06, | |
| "loss": 0.0662, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.5746872700515084, | |
| "grad_norm": 0.43771048938211576, | |
| "learning_rate": 5.432221549428867e-06, | |
| "loss": 0.0646, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.589403973509934, | |
| "grad_norm": 0.40653033653295745, | |
| "learning_rate": 5.346787652827279e-06, | |
| "loss": 0.0673, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.6041206769683591, | |
| "grad_norm": 0.4218995885501481, | |
| "learning_rate": 5.26125189056399e-06, | |
| "loss": 0.0652, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.6188373804267844, | |
| "grad_norm": 0.42589283927464555, | |
| "learning_rate": 5.175639387977091e-06, | |
| "loss": 0.067, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.6188373804267844, | |
| "eval_loss": 0.08547249436378479, | |
| "eval_runtime": 188.3934, | |
| "eval_samples_per_second": 6.412, | |
| "eval_steps_per_second": 0.802, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.6335540838852096, | |
| "grad_norm": 0.4607522386339002, | |
| "learning_rate": 5.089975292946427e-06, | |
| "loss": 0.0677, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.648270787343635, | |
| "grad_norm": 0.41120752213023654, | |
| "learning_rate": 5.00428476850665e-06, | |
| "loss": 0.0633, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.6629874908020603, | |
| "grad_norm": 0.5477912053365783, | |
| "learning_rate": 4.918592985455799e-06, | |
| "loss": 0.0648, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.6777041942604858, | |
| "grad_norm": 0.47503483012059583, | |
| "learning_rate": 4.832925114961629e-06, | |
| "loss": 0.0618, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.692420897718911, | |
| "grad_norm": 0.45774600350002437, | |
| "learning_rate": 4.747306321167791e-06, | |
| "loss": 0.0617, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.692420897718911, | |
| "eval_loss": 0.08534925431013107, | |
| "eval_runtime": 204.7242, | |
| "eval_samples_per_second": 5.901, | |
| "eval_steps_per_second": 0.738, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.7071376011773363, | |
| "grad_norm": 0.45847738919073283, | |
| "learning_rate": 4.66176175380212e-06, | |
| "loss": 0.0658, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.7218543046357615, | |
| "grad_norm": 0.44501034067234635, | |
| "learning_rate": 4.576316540789122e-06, | |
| "loss": 0.0649, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.7365710080941867, | |
| "grad_norm": 0.4832020901371425, | |
| "learning_rate": 4.4909957808688765e-06, | |
| "loss": 0.0663, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.7512877115526122, | |
| "grad_norm": 0.5231088503027554, | |
| "learning_rate": 4.4058245362245276e-06, | |
| "loss": 0.0617, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.7660044150110377, | |
| "grad_norm": 0.5011172484501668, | |
| "learning_rate": 4.320827825120485e-06, | |
| "loss": 0.0639, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.7660044150110377, | |
| "eval_loss": 0.08313070237636566, | |
| "eval_runtime": 199.3984, | |
| "eval_samples_per_second": 6.058, | |
| "eval_steps_per_second": 0.757, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.780721118469463, | |
| "grad_norm": 0.5345442409242496, | |
| "learning_rate": 4.236030614553552e-06, | |
| "loss": 0.0606, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.7954378219278881, | |
| "grad_norm": 0.512563715796756, | |
| "learning_rate": 4.151457812919094e-06, | |
| "loss": 0.0603, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.8101545253863134, | |
| "grad_norm": 0.44851621254213614, | |
| "learning_rate": 4.067134262694431e-06, | |
| "loss": 0.0645, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.8248712288447386, | |
| "grad_norm": 0.5248672860684085, | |
| "learning_rate": 3.983084733141588e-06, | |
| "loss": 0.0623, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.839587932303164, | |
| "grad_norm": 0.5498054945628633, | |
| "learning_rate": 3.899333913031561e-06, | |
| "loss": 0.0668, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.839587932303164, | |
| "eval_loss": 0.0824863463640213, | |
| "eval_runtime": 186.253, | |
| "eval_samples_per_second": 6.486, | |
| "eval_steps_per_second": 0.811, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.8543046357615895, | |
| "grad_norm": 0.42853375775393104, | |
| "learning_rate": 3.815906403392203e-06, | |
| "loss": 0.0593, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.8690213392200148, | |
| "grad_norm": 0.45809760814838824, | |
| "learning_rate": 3.732826710281923e-06, | |
| "loss": 0.0635, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.88373804267844, | |
| "grad_norm": 0.41621812440438655, | |
| "learning_rate": 3.650119237591232e-06, | |
| "loss": 0.0585, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.8984547461368653, | |
| "grad_norm": 0.47534317303862195, | |
| "learning_rate": 3.5678082798743498e-06, | |
| "loss": 0.0595, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.9131714495952905, | |
| "grad_norm": 0.41752392992965454, | |
| "learning_rate": 3.485918015212891e-06, | |
| "loss": 0.0643, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.9131714495952905, | |
| "eval_loss": 0.08134686201810837, | |
| "eval_runtime": 180.7959, | |
| "eval_samples_per_second": 6.682, | |
| "eval_steps_per_second": 0.835, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.927888153053716, | |
| "grad_norm": 0.4388803040345972, | |
| "learning_rate": 3.4044724981137787e-06, | |
| "loss": 0.0609, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.9426048565121414, | |
| "grad_norm": 0.4342058670787917, | |
| "learning_rate": 3.3234956524434615e-06, | |
| "loss": 0.062, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.9573215599705667, | |
| "grad_norm": 0.40894625830036435, | |
| "learning_rate": 3.243011264400494e-06, | |
| "loss": 0.0606, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.972038263428992, | |
| "grad_norm": 0.4587254776423067, | |
| "learning_rate": 3.1630429755285623e-06, | |
| "loss": 0.0639, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.9867549668874172, | |
| "grad_norm": 0.5863720947155439, | |
| "learning_rate": 3.0836142757720034e-06, | |
| "loss": 0.0601, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.9867549668874172, | |
| "eval_loss": 0.08116251230239868, | |
| "eval_runtime": 214.6117, | |
| "eval_samples_per_second": 5.629, | |
| "eval_steps_per_second": 0.704, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.0014716703458424, | |
| "grad_norm": 0.3180683055717724, | |
| "learning_rate": 3.004748496575842e-06, | |
| "loss": 0.0571, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.0161883738042676, | |
| "grad_norm": 0.37920317857819413, | |
| "learning_rate": 2.9264688040324098e-06, | |
| "loss": 0.0418, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 2.0309050772626933, | |
| "grad_norm": 0.43496386857367136, | |
| "learning_rate": 2.8487981920765044e-06, | |
| "loss": 0.0412, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.0456217807211186, | |
| "grad_norm": 0.454994148288807, | |
| "learning_rate": 2.7717594757311435e-06, | |
| "loss": 0.0386, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.060338484179544, | |
| "grad_norm": 0.4879169888697804, | |
| "learning_rate": 2.69537528440586e-06, | |
| "loss": 0.0391, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.060338484179544, | |
| "eval_loss": 0.08909143507480621, | |
| "eval_runtime": 197.7898, | |
| "eval_samples_per_second": 6.107, | |
| "eval_steps_per_second": 0.763, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.075055187637969, | |
| "grad_norm": 0.44447001392962837, | |
| "learning_rate": 2.619668055249527e-06, | |
| "loss": 0.0381, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.0897718910963943, | |
| "grad_norm": 0.40740917793748654, | |
| "learning_rate": 2.544660026559639e-06, | |
| "loss": 0.0367, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 2.1044885945548195, | |
| "grad_norm": 0.399633409974892, | |
| "learning_rate": 2.4703732312500438e-06, | |
| "loss": 0.0382, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 2.119205298013245, | |
| "grad_norm": 0.43107632751069047, | |
| "learning_rate": 2.3968294903789474e-06, | |
| "loss": 0.0398, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.1339220014716704, | |
| "grad_norm": 0.43610535435590353, | |
| "learning_rate": 2.324050406739205e-06, | |
| "loss": 0.0411, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.1339220014716704, | |
| "eval_loss": 0.08864710479974747, | |
| "eval_runtime": 185.9081, | |
| "eval_samples_per_second": 6.498, | |
| "eval_steps_per_second": 0.812, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.1486387049300957, | |
| "grad_norm": 0.3969874821725999, | |
| "learning_rate": 2.2520573585126863e-06, | |
| "loss": 0.0407, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 2.163355408388521, | |
| "grad_norm": 0.44469487818286946, | |
| "learning_rate": 2.1808714929906394e-06, | |
| "loss": 0.037, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.178072111846946, | |
| "grad_norm": 0.4933403170140201, | |
| "learning_rate": 2.110513720361869e-06, | |
| "loss": 0.0385, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 2.1927888153053714, | |
| "grad_norm": 0.40970411491367764, | |
| "learning_rate": 2.041004707570555e-06, | |
| "loss": 0.0362, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 2.207505518763797, | |
| "grad_norm": 0.47294108634743565, | |
| "learning_rate": 1.972364872245539e-06, | |
| "loss": 0.0376, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.207505518763797, | |
| "eval_loss": 0.09001829475164413, | |
| "eval_runtime": 203.7053, | |
| "eval_samples_per_second": 5.93, | |
| "eval_steps_per_second": 0.741, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.2222222222222223, | |
| "grad_norm": 0.4013296778951186, | |
| "learning_rate": 1.9046143767028309e-06, | |
| "loss": 0.0359, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 2.2369389256806476, | |
| "grad_norm": 0.41616621605630383, | |
| "learning_rate": 1.8377731220231144e-06, | |
| "loss": 0.0373, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 2.251655629139073, | |
| "grad_norm": 0.4858320948580327, | |
| "learning_rate": 1.771860742205988e-06, | |
| "loss": 0.0355, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 2.266372332597498, | |
| "grad_norm": 0.4284960397863766, | |
| "learning_rate": 1.706896598402663e-06, | |
| "loss": 0.0379, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 2.2810890360559233, | |
| "grad_norm": 0.41264671002453457, | |
| "learning_rate": 1.642899773228801e-06, | |
| "loss": 0.0372, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.2810890360559233, | |
| "eval_loss": 0.08930070698261261, | |
| "eval_runtime": 216.4439, | |
| "eval_samples_per_second": 5.581, | |
| "eval_steps_per_second": 0.698, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.295805739514349, | |
| "grad_norm": 0.4348744731420184, | |
| "learning_rate": 1.5798890651591759e-06, | |
| "loss": 0.0375, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 2.310522442972774, | |
| "grad_norm": 0.4350319794815005, | |
| "learning_rate": 1.5178829830057883e-06, | |
| "loss": 0.0353, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 2.3252391464311994, | |
| "grad_norm": 0.397696827791832, | |
| "learning_rate": 1.4568997404810858e-06, | |
| "loss": 0.0369, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 2.3399558498896247, | |
| "grad_norm": 0.44249359787198733, | |
| "learning_rate": 1.3969572508478424e-06, | |
| "loss": 0.0365, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 2.35467255334805, | |
| "grad_norm": 0.3999504032855848, | |
| "learning_rate": 1.33807312165731e-06, | |
| "loss": 0.0391, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.35467255334805, | |
| "eval_loss": 0.08941526710987091, | |
| "eval_runtime": 201.599, | |
| "eval_samples_per_second": 5.992, | |
| "eval_steps_per_second": 0.749, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.369389256806475, | |
| "grad_norm": 0.47235025180203943, | |
| "learning_rate": 1.2802646495771592e-06, | |
| "loss": 0.0374, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 2.384105960264901, | |
| "grad_norm": 0.4505178794969632, | |
| "learning_rate": 1.2235488153107488e-06, | |
| "loss": 0.0386, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 2.398822663723326, | |
| "grad_norm": 0.4515169168194488, | |
| "learning_rate": 1.1679422786091909e-06, | |
| "loss": 0.0355, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 2.4135393671817513, | |
| "grad_norm": 0.4486232416834487, | |
| "learning_rate": 1.1134613733777195e-06, | |
| "loss": 0.0353, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 2.4282560706401766, | |
| "grad_norm": 0.45969446958453936, | |
| "learning_rate": 1.060122102877739e-06, | |
| "loss": 0.0369, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.4282560706401766, | |
| "eval_loss": 0.08896949887275696, | |
| "eval_runtime": 190.8926, | |
| "eval_samples_per_second": 6.328, | |
| "eval_steps_per_second": 0.791, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.442972774098602, | |
| "grad_norm": 0.4795593227430335, | |
| "learning_rate": 1.0079401350260288e-06, | |
| "loss": 0.0365, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 2.457689477557027, | |
| "grad_norm": 0.4364131921904563, | |
| "learning_rate": 9.569307977924304e-07, | |
| "loss": 0.0374, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.4724061810154527, | |
| "grad_norm": 0.39082384348290283, | |
| "learning_rate": 9.071090746973999e-07, | |
| "loss": 0.0367, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.487122884473878, | |
| "grad_norm": 0.4316116220500935, | |
| "learning_rate": 8.584896004107379e-07, | |
| "loss": 0.0357, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 2.501839587932303, | |
| "grad_norm": 0.4639023437586311, | |
| "learning_rate": 8.110866564527925e-07, | |
| "loss": 0.0362, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.501839587932303, | |
| "eval_loss": 0.08904436975717545, | |
| "eval_runtime": 192.3246, | |
| "eval_samples_per_second": 6.281, | |
| "eval_steps_per_second": 0.785, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.5165562913907285, | |
| "grad_norm": 0.44077589190339306, | |
| "learning_rate": 7.649141669993881e-07, | |
| "loss": 0.0342, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.5312729948491537, | |
| "grad_norm": 0.4866710092763864, | |
| "learning_rate": 7.199856947917372e-07, | |
| "loss": 0.0355, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 2.5459896983075794, | |
| "grad_norm": 0.5558412036138655, | |
| "learning_rate": 6.763144371525048e-07, | |
| "loss": 0.0362, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 2.560706401766004, | |
| "grad_norm": 0.5242729609693463, | |
| "learning_rate": 6.339132221092181e-07, | |
| "loss": 0.0346, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.57542310522443, | |
| "grad_norm": 0.43612087623478013, | |
| "learning_rate": 5.927945046261541e-07, | |
| "loss": 0.0351, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.57542310522443, | |
| "eval_loss": 0.08865496516227722, | |
| "eval_runtime": 189.4933, | |
| "eval_samples_per_second": 6.375, | |
| "eval_steps_per_second": 0.797, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.590139808682855, | |
| "grad_norm": 0.4402357233053372, | |
| "learning_rate": 5.529703629458027e-07, | |
| "loss": 0.0351, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 2.6048565121412803, | |
| "grad_norm": 0.4547936707636127, | |
| "learning_rate": 5.144524950410074e-07, | |
| "loss": 0.0353, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.6195732155997056, | |
| "grad_norm": 0.46968163264663654, | |
| "learning_rate": 4.772522151787822e-07, | |
| "loss": 0.0335, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 2.634289919058131, | |
| "grad_norm": 0.5323493186585175, | |
| "learning_rate": 4.413804505968533e-07, | |
| "loss": 0.0381, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 2.6490066225165565, | |
| "grad_norm": 0.44646022512750955, | |
| "learning_rate": 4.0684773829388737e-07, | |
| "loss": 0.0365, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.6490066225165565, | |
| "eval_loss": 0.08848826587200165, | |
| "eval_runtime": 144.7247, | |
| "eval_samples_per_second": 8.347, | |
| "eval_steps_per_second": 1.043, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.6637233259749817, | |
| "grad_norm": 0.48313861824298177, | |
| "learning_rate": 3.736642219343456e-07, | |
| "loss": 0.0341, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 2.678440029433407, | |
| "grad_norm": 0.46254464308741905, | |
| "learning_rate": 3.4183964886887135e-07, | |
| "loss": 0.035, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 2.693156732891832, | |
| "grad_norm": 0.44252007786800557, | |
| "learning_rate": 3.1138336727110307e-07, | |
| "loss": 0.0349, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.7078734363502575, | |
| "grad_norm": 0.4843414570638625, | |
| "learning_rate": 2.823043233917272e-07, | |
| "loss": 0.0315, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 2.7225901398086827, | |
| "grad_norm": 0.4233437476360991, | |
| "learning_rate": 2.5461105893060667e-07, | |
| "loss": 0.0336, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 2.7225901398086827, | |
| "eval_loss": 0.0889279693365097, | |
| "eval_runtime": 148.3, | |
| "eval_samples_per_second": 8.146, | |
| "eval_steps_per_second": 1.018, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 2.737306843267108, | |
| "grad_norm": 0.43298766819944895, | |
| "learning_rate": 2.2831170852773198e-07, | |
| "loss": 0.0327, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.7520235467255336, | |
| "grad_norm": 0.5107092795769058, | |
| "learning_rate": 2.03413997373747e-07, | |
| "loss": 0.035, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 2.766740250183959, | |
| "grad_norm": 0.42425673512298995, | |
| "learning_rate": 1.7992523894074688e-07, | |
| "loss": 0.0356, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 2.781456953642384, | |
| "grad_norm": 0.4354877107084126, | |
| "learning_rate": 1.578523328340087e-07, | |
| "loss": 0.0351, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.7961736571008093, | |
| "grad_norm": 0.409569927662352, | |
| "learning_rate": 1.372017627653044e-07, | |
| "loss": 0.0328, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.7961736571008093, | |
| "eval_loss": 0.08891716599464417, | |
| "eval_runtime": 150.1495, | |
| "eval_samples_per_second": 8.045, | |
| "eval_steps_per_second": 1.006, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.8108903605592346, | |
| "grad_norm": 0.505372980725658, | |
| "learning_rate": 1.179795946483625e-07, | |
| "loss": 0.0359, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 2.8256070640176603, | |
| "grad_norm": 0.4789426274321432, | |
| "learning_rate": 1.0019147481706626e-07, | |
| "loss": 0.034, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.8403237674760855, | |
| "grad_norm": 0.4326698452169212, | |
| "learning_rate": 8.384262836689472e-08, | |
| "loss": 0.0359, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 2.8550404709345107, | |
| "grad_norm": 0.4504134801165135, | |
| "learning_rate": 6.893785762009942e-08, | |
| "loss": 0.033, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 2.869757174392936, | |
| "grad_norm": 0.4418357481535817, | |
| "learning_rate": 5.5481540715066616e-08, | |
| "loss": 0.031, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.869757174392936, | |
| "eval_loss": 0.08881029486656189, | |
| "eval_runtime": 160.2824, | |
| "eval_samples_per_second": 7.537, | |
| "eval_steps_per_second": 0.942, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 2.8844738778513612, | |
| "grad_norm": 0.39177398397892965, | |
| "learning_rate": 4.3477630320279405e-08, | |
| "loss": 0.0341, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 2.8991905813097865, | |
| "grad_norm": 0.4264281839143634, | |
| "learning_rate": 3.292965247325641e-08, | |
| "loss": 0.0327, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 2.9139072847682117, | |
| "grad_norm": 0.4458194572989954, | |
| "learning_rate": 2.3840705544815324e-08, | |
| "loss": 0.037, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 2.9286239882266374, | |
| "grad_norm": 0.4161959002875069, | |
| "learning_rate": 1.6213459328950355e-08, | |
| "loss": 0.0336, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 2.9433406916850626, | |
| "grad_norm": 0.4581647348930819, | |
| "learning_rate": 1.0050154258607336e-08, | |
| "loss": 0.0361, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.9433406916850626, | |
| "eval_loss": 0.08885689079761505, | |
| "eval_runtime": 115.9736, | |
| "eval_samples_per_second": 10.416, | |
| "eval_steps_per_second": 1.302, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.958057395143488, | |
| "grad_norm": 0.4098910931260614, | |
| "learning_rate": 5.352600747577929e-09, | |
| "loss": 0.0323, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 2.972774098601913, | |
| "grad_norm": 0.4203250122459563, | |
| "learning_rate": 2.12217865870612e-09, | |
| "loss": 0.0337, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 2.9874908020603383, | |
| "grad_norm": 0.4851341052865305, | |
| "learning_rate": 3.5983689856522453e-10, | |
| "loss": 0.0343, | |
| "step": 2030 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 2037, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 50000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 31897094414336.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |