{ "best_metric": null, "best_model_checkpoint": null, "epoch": 100.0, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "eval_loss": 0.7225164175033569, "eval_runtime": 2.4692, "eval_samples_per_second": 80.998, "eval_steps_per_second": 10.125, "step": 25 }, { "epoch": 2.0, "eval_loss": 0.5052716732025146, "eval_runtime": 2.4705, "eval_samples_per_second": 80.955, "eval_steps_per_second": 10.119, "step": 50 }, { "epoch": 3.0, "eval_loss": 0.4475671350955963, "eval_runtime": 2.4731, "eval_samples_per_second": 80.871, "eval_steps_per_second": 10.109, "step": 75 }, { "epoch": 4.0, "eval_loss": 0.4104467034339905, "eval_runtime": 2.4763, "eval_samples_per_second": 80.765, "eval_steps_per_second": 10.096, "step": 100 }, { "epoch": 5.0, "eval_loss": 0.39656341075897217, "eval_runtime": 2.4766, "eval_samples_per_second": 80.755, "eval_steps_per_second": 10.094, "step": 125 }, { "epoch": 6.0, "eval_loss": 0.36905747652053833, "eval_runtime": 2.4774, "eval_samples_per_second": 80.731, "eval_steps_per_second": 10.091, "step": 150 }, { "epoch": 7.0, "eval_loss": 0.362547367811203, "eval_runtime": 2.4792, "eval_samples_per_second": 80.672, "eval_steps_per_second": 10.084, "step": 175 }, { "epoch": 8.0, "eval_loss": 0.3497239053249359, "eval_runtime": 2.4872, "eval_samples_per_second": 80.413, "eval_steps_per_second": 10.052, "step": 200 }, { "epoch": 9.0, "eval_loss": 0.3523653447628021, "eval_runtime": 2.4878, "eval_samples_per_second": 80.392, "eval_steps_per_second": 10.049, "step": 225 }, { "epoch": 10.0, "eval_loss": 0.33985191583633423, "eval_runtime": 2.4877, "eval_samples_per_second": 80.395, "eval_steps_per_second": 10.049, "step": 250 }, { "epoch": 11.0, "eval_loss": 0.3415805399417877, "eval_runtime": 2.4896, "eval_samples_per_second": 80.333, "eval_steps_per_second": 10.042, "step": 275 }, { "epoch": 12.0, "eval_loss": 0.33282220363616943, "eval_runtime": 2.49, "eval_samples_per_second": 80.321, "eval_steps_per_second": 10.04, "step": 300 }, { "epoch": 13.0, "eval_loss": 0.33800217509269714, "eval_runtime": 2.491, "eval_samples_per_second": 80.289, "eval_steps_per_second": 10.036, "step": 325 }, { "epoch": 14.0, "eval_loss": 0.3342490792274475, "eval_runtime": 2.491, "eval_samples_per_second": 80.288, "eval_steps_per_second": 10.036, "step": 350 }, { "epoch": 15.0, "eval_loss": 0.34117013216018677, "eval_runtime": 2.4915, "eval_samples_per_second": 80.273, "eval_steps_per_second": 10.034, "step": 375 }, { "epoch": 16.0, "eval_loss": 0.3388213813304901, "eval_runtime": 2.4936, "eval_samples_per_second": 80.205, "eval_steps_per_second": 10.026, "step": 400 }, { "epoch": 17.0, "eval_loss": 0.319783091545105, "eval_runtime": 2.4927, "eval_samples_per_second": 80.234, "eval_steps_per_second": 10.029, "step": 425 }, { "epoch": 18.0, "eval_loss": 0.3183771073818207, "eval_runtime": 2.4921, "eval_samples_per_second": 80.253, "eval_steps_per_second": 10.032, "step": 450 }, { "epoch": 19.0, "eval_loss": 0.31770122051239014, "eval_runtime": 2.4935, "eval_samples_per_second": 80.21, "eval_steps_per_second": 10.026, "step": 475 }, { "epoch": 20.0, "grad_norm": 0.8955293893814087, "learning_rate": 1.6000000000000003e-05, "loss": 0.4631, "step": 500 }, { "epoch": 20.0, "eval_loss": 0.31926316022872925, "eval_runtime": 2.4537, "eval_samples_per_second": 81.51, "eval_steps_per_second": 10.189, "step": 500 }, { "epoch": 21.0, "eval_loss": 0.3148637115955353, "eval_runtime": 2.4822, "eval_samples_per_second": 80.572, "eval_steps_per_second": 10.072, "step": 525 }, { "epoch": 22.0, "eval_loss": 0.31756889820098877, "eval_runtime": 2.4835, "eval_samples_per_second": 80.531, "eval_steps_per_second": 10.066, "step": 550 }, { "epoch": 23.0, "eval_loss": 0.3171720504760742, "eval_runtime": 2.4904, "eval_samples_per_second": 80.309, "eval_steps_per_second": 10.039, "step": 575 }, { "epoch": 24.0, "eval_loss": 0.3179776072502136, "eval_runtime": 2.4915, "eval_samples_per_second": 80.273, "eval_steps_per_second": 10.034, "step": 600 }, { "epoch": 25.0, "eval_loss": 0.3148040473461151, "eval_runtime": 2.4928, "eval_samples_per_second": 80.232, "eval_steps_per_second": 10.029, "step": 625 }, { "epoch": 26.0, "eval_loss": 0.3072579503059387, "eval_runtime": 2.4926, "eval_samples_per_second": 80.239, "eval_steps_per_second": 10.03, "step": 650 }, { "epoch": 27.0, "eval_loss": 0.3129171133041382, "eval_runtime": 2.4936, "eval_samples_per_second": 80.206, "eval_steps_per_second": 10.026, "step": 675 }, { "epoch": 28.0, "eval_loss": 0.3081643283367157, "eval_runtime": 2.4941, "eval_samples_per_second": 80.188, "eval_steps_per_second": 10.024, "step": 700 }, { "epoch": 29.0, "eval_loss": 0.3064418435096741, "eval_runtime": 2.4964, "eval_samples_per_second": 80.116, "eval_steps_per_second": 10.014, "step": 725 }, { "epoch": 30.0, "eval_loss": 0.30982914566993713, "eval_runtime": 2.4963, "eval_samples_per_second": 80.12, "eval_steps_per_second": 10.015, "step": 750 }, { "epoch": 31.0, "eval_loss": 0.3063754439353943, "eval_runtime": 2.4956, "eval_samples_per_second": 80.142, "eval_steps_per_second": 10.018, "step": 775 }, { "epoch": 32.0, "eval_loss": 0.3113013207912445, "eval_runtime": 2.4972, "eval_samples_per_second": 80.089, "eval_steps_per_second": 10.011, "step": 800 }, { "epoch": 33.0, "eval_loss": 0.30704861879348755, "eval_runtime": 2.4962, "eval_samples_per_second": 80.122, "eval_steps_per_second": 10.015, "step": 825 }, { "epoch": 34.0, "eval_loss": 0.2988375425338745, "eval_runtime": 2.4974, "eval_samples_per_second": 80.083, "eval_steps_per_second": 10.01, "step": 850 }, { "epoch": 35.0, "eval_loss": 0.3142584264278412, "eval_runtime": 2.4958, "eval_samples_per_second": 80.135, "eval_steps_per_second": 10.017, "step": 875 }, { "epoch": 36.0, "eval_loss": 0.3032761514186859, "eval_runtime": 2.4976, "eval_samples_per_second": 80.077, "eval_steps_per_second": 10.01, "step": 900 }, { "epoch": 37.0, "eval_loss": 0.30415403842926025, "eval_runtime": 2.4973, "eval_samples_per_second": 80.087, "eval_steps_per_second": 10.011, "step": 925 }, { "epoch": 38.0, "eval_loss": 0.30165913701057434, "eval_runtime": 2.4982, "eval_samples_per_second": 80.057, "eval_steps_per_second": 10.007, "step": 950 }, { "epoch": 39.0, "eval_loss": 0.3017444908618927, "eval_runtime": 2.4977, "eval_samples_per_second": 80.072, "eval_steps_per_second": 10.009, "step": 975 }, { "epoch": 40.0, "grad_norm": 2.4287023544311523, "learning_rate": 1.2e-05, "loss": 0.3457, "step": 1000 }, { "epoch": 40.0, "eval_loss": 0.3025864064693451, "eval_runtime": 2.4539, "eval_samples_per_second": 81.504, "eval_steps_per_second": 10.188, "step": 1000 }, { "epoch": 41.0, "eval_loss": 0.30045461654663086, "eval_runtime": 2.4819, "eval_samples_per_second": 80.584, "eval_steps_per_second": 10.073, "step": 1025 }, { "epoch": 42.0, "eval_loss": 0.30064135789871216, "eval_runtime": 2.4896, "eval_samples_per_second": 80.334, "eval_steps_per_second": 10.042, "step": 1050 }, { "epoch": 43.0, "eval_loss": 0.29575350880622864, "eval_runtime": 2.4903, "eval_samples_per_second": 80.312, "eval_steps_per_second": 10.039, "step": 1075 }, { "epoch": 44.0, "eval_loss": 0.30160149931907654, "eval_runtime": 2.4926, "eval_samples_per_second": 80.239, "eval_steps_per_second": 10.03, "step": 1100 }, { "epoch": 45.0, "eval_loss": 0.30429255962371826, "eval_runtime": 2.4921, "eval_samples_per_second": 80.255, "eval_steps_per_second": 10.032, "step": 1125 }, { "epoch": 46.0, "eval_loss": 0.3015859127044678, "eval_runtime": 2.4927, "eval_samples_per_second": 80.236, "eval_steps_per_second": 10.029, "step": 1150 }, { "epoch": 47.0, "eval_loss": 0.29914650321006775, "eval_runtime": 2.4947, "eval_samples_per_second": 80.171, "eval_steps_per_second": 10.021, "step": 1175 }, { "epoch": 48.0, "eval_loss": 0.2971905469894409, "eval_runtime": 2.4958, "eval_samples_per_second": 80.134, "eval_steps_per_second": 10.017, "step": 1200 }, { "epoch": 49.0, "eval_loss": 0.29176658391952515, "eval_runtime": 2.4963, "eval_samples_per_second": 80.118, "eval_steps_per_second": 10.015, "step": 1225 }, { "epoch": 50.0, "eval_loss": 0.2934282720088959, "eval_runtime": 2.4976, "eval_samples_per_second": 80.076, "eval_steps_per_second": 10.01, "step": 1250 }, { "epoch": 51.0, "eval_loss": 0.2918751835823059, "eval_runtime": 2.4964, "eval_samples_per_second": 80.115, "eval_steps_per_second": 10.014, "step": 1275 }, { "epoch": 52.0, "eval_loss": 0.2914879620075226, "eval_runtime": 2.4977, "eval_samples_per_second": 80.075, "eval_steps_per_second": 10.009, "step": 1300 }, { "epoch": 53.0, "eval_loss": 0.2925909757614136, "eval_runtime": 2.4975, "eval_samples_per_second": 80.081, "eval_steps_per_second": 10.01, "step": 1325 }, { "epoch": 54.0, "eval_loss": 0.2940743565559387, "eval_runtime": 2.4974, "eval_samples_per_second": 80.085, "eval_steps_per_second": 10.011, "step": 1350 }, { "epoch": 55.0, "eval_loss": 0.2973780930042267, "eval_runtime": 2.4989, "eval_samples_per_second": 80.037, "eval_steps_per_second": 10.005, "step": 1375 }, { "epoch": 56.0, "eval_loss": 0.2954687178134918, "eval_runtime": 2.498, "eval_samples_per_second": 80.065, "eval_steps_per_second": 10.008, "step": 1400 }, { "epoch": 57.0, "eval_loss": 0.29051879048347473, "eval_runtime": 2.4979, "eval_samples_per_second": 80.069, "eval_steps_per_second": 10.009, "step": 1425 }, { "epoch": 58.0, "eval_loss": 0.29731473326683044, "eval_runtime": 2.4993, "eval_samples_per_second": 80.023, "eval_steps_per_second": 10.003, "step": 1450 }, { "epoch": 59.0, "eval_loss": 0.2933524250984192, "eval_runtime": 2.4988, "eval_samples_per_second": 80.04, "eval_steps_per_second": 10.005, "step": 1475 }, { "epoch": 60.0, "grad_norm": 1.120781421661377, "learning_rate": 8.000000000000001e-06, "loss": 0.3291, "step": 1500 }, { "epoch": 60.0, "eval_loss": 0.2888854742050171, "eval_runtime": 2.456, "eval_samples_per_second": 81.434, "eval_steps_per_second": 10.179, "step": 1500 }, { "epoch": 61.0, "eval_loss": 0.2901514172554016, "eval_runtime": 2.4865, "eval_samples_per_second": 80.436, "eval_steps_per_second": 10.054, "step": 1525 }, { "epoch": 62.0, "eval_loss": 0.29295194149017334, "eval_runtime": 2.4921, "eval_samples_per_second": 80.253, "eval_steps_per_second": 10.032, "step": 1550 }, { "epoch": 63.0, "eval_loss": 0.29049646854400635, "eval_runtime": 2.4924, "eval_samples_per_second": 80.244, "eval_steps_per_second": 10.031, "step": 1575 }, { "epoch": 64.0, "eval_loss": 0.2913173735141754, "eval_runtime": 2.4934, "eval_samples_per_second": 80.212, "eval_steps_per_second": 10.026, "step": 1600 }, { "epoch": 65.0, "eval_loss": 0.28798389434814453, "eval_runtime": 2.4945, "eval_samples_per_second": 80.176, "eval_steps_per_second": 10.022, "step": 1625 }, { "epoch": 66.0, "eval_loss": 0.28929680585861206, "eval_runtime": 2.4963, "eval_samples_per_second": 80.118, "eval_steps_per_second": 10.015, "step": 1650 }, { "epoch": 67.0, "eval_loss": 0.2856563925743103, "eval_runtime": 2.4957, "eval_samples_per_second": 80.139, "eval_steps_per_second": 10.017, "step": 1675 }, { "epoch": 68.0, "eval_loss": 0.2869837284088135, "eval_runtime": 2.496, "eval_samples_per_second": 80.13, "eval_steps_per_second": 10.016, "step": 1700 }, { "epoch": 69.0, "eval_loss": 0.2902255356311798, "eval_runtime": 2.4969, "eval_samples_per_second": 80.098, "eval_steps_per_second": 10.012, "step": 1725 }, { "epoch": 70.0, "eval_loss": 0.28557059168815613, "eval_runtime": 2.4957, "eval_samples_per_second": 80.139, "eval_steps_per_second": 10.017, "step": 1750 }, { "epoch": 71.0, "eval_loss": 0.2883276343345642, "eval_runtime": 2.4986, "eval_samples_per_second": 80.045, "eval_steps_per_second": 10.006, "step": 1775 }, { "epoch": 72.0, "eval_loss": 0.286774605512619, "eval_runtime": 2.4979, "eval_samples_per_second": 80.068, "eval_steps_per_second": 10.009, "step": 1800 }, { "epoch": 73.0, "eval_loss": 0.28692272305488586, "eval_runtime": 2.4977, "eval_samples_per_second": 80.073, "eval_steps_per_second": 10.009, "step": 1825 }, { "epoch": 74.0, "eval_loss": 0.2842114567756653, "eval_runtime": 2.4982, "eval_samples_per_second": 80.058, "eval_steps_per_second": 10.007, "step": 1850 }, { "epoch": 75.0, "eval_loss": 0.2869337201118469, "eval_runtime": 2.4984, "eval_samples_per_second": 80.052, "eval_steps_per_second": 10.007, "step": 1875 }, { "epoch": 76.0, "eval_loss": 0.2843911647796631, "eval_runtime": 2.498, "eval_samples_per_second": 80.064, "eval_steps_per_second": 10.008, "step": 1900 }, { "epoch": 77.0, "eval_loss": 0.28588855266571045, "eval_runtime": 2.4985, "eval_samples_per_second": 80.047, "eval_steps_per_second": 10.006, "step": 1925 }, { "epoch": 78.0, "eval_loss": 0.2864097058773041, "eval_runtime": 2.4994, "eval_samples_per_second": 80.02, "eval_steps_per_second": 10.002, "step": 1950 }, { "epoch": 79.0, "eval_loss": 0.28731438517570496, "eval_runtime": 2.4981, "eval_samples_per_second": 80.061, "eval_steps_per_second": 10.008, "step": 1975 }, { "epoch": 80.0, "grad_norm": 1.4072085618972778, "learning_rate": 4.000000000000001e-06, "loss": 0.3199, "step": 2000 }, { "epoch": 80.0, "eval_loss": 0.2888672947883606, "eval_runtime": 2.4574, "eval_samples_per_second": 81.386, "eval_steps_per_second": 10.173, "step": 2000 }, { "epoch": 81.0, "eval_loss": 0.28676241636276245, "eval_runtime": 2.4848, "eval_samples_per_second": 80.49, "eval_steps_per_second": 10.061, "step": 2025 }, { "epoch": 82.0, "eval_loss": 0.28567585349082947, "eval_runtime": 2.4917, "eval_samples_per_second": 80.268, "eval_steps_per_second": 10.033, "step": 2050 }, { "epoch": 83.0, "eval_loss": 0.2843726873397827, "eval_runtime": 2.4926, "eval_samples_per_second": 80.236, "eval_steps_per_second": 10.03, "step": 2075 }, { "epoch": 84.0, "eval_loss": 0.28754809498786926, "eval_runtime": 2.4936, "eval_samples_per_second": 80.205, "eval_steps_per_second": 10.026, "step": 2100 }, { "epoch": 85.0, "eval_loss": 0.2854582667350769, "eval_runtime": 2.4941, "eval_samples_per_second": 80.188, "eval_steps_per_second": 10.023, "step": 2125 }, { "epoch": 86.0, "eval_loss": 0.2840147316455841, "eval_runtime": 2.4944, "eval_samples_per_second": 80.181, "eval_steps_per_second": 10.023, "step": 2150 }, { "epoch": 87.0, "eval_loss": 0.28520676493644714, "eval_runtime": 2.4963, "eval_samples_per_second": 80.119, "eval_steps_per_second": 10.015, "step": 2175 }, { "epoch": 88.0, "eval_loss": 0.28196609020233154, "eval_runtime": 2.4972, "eval_samples_per_second": 80.091, "eval_steps_per_second": 10.011, "step": 2200 }, { "epoch": 89.0, "eval_loss": 0.28386008739471436, "eval_runtime": 2.4982, "eval_samples_per_second": 80.057, "eval_steps_per_second": 10.007, "step": 2225 }, { "epoch": 90.0, "eval_loss": 0.2850269675254822, "eval_runtime": 2.4975, "eval_samples_per_second": 80.079, "eval_steps_per_second": 10.01, "step": 2250 }, { "epoch": 91.0, "eval_loss": 0.28362977504730225, "eval_runtime": 2.4982, "eval_samples_per_second": 80.057, "eval_steps_per_second": 10.007, "step": 2275 }, { "epoch": 92.0, "eval_loss": 0.2840833365917206, "eval_runtime": 2.4978, "eval_samples_per_second": 80.07, "eval_steps_per_second": 10.009, "step": 2300 }, { "epoch": 93.0, "eval_loss": 0.28477975726127625, "eval_runtime": 2.4997, "eval_samples_per_second": 80.01, "eval_steps_per_second": 10.001, "step": 2325 }, { "epoch": 94.0, "eval_loss": 0.2831202745437622, "eval_runtime": 2.4995, "eval_samples_per_second": 80.016, "eval_steps_per_second": 10.002, "step": 2350 }, { "epoch": 95.0, "eval_loss": 0.28298699855804443, "eval_runtime": 2.5008, "eval_samples_per_second": 79.975, "eval_steps_per_second": 9.997, "step": 2375 }, { "epoch": 96.0, "eval_loss": 0.2848021984100342, "eval_runtime": 2.4987, "eval_samples_per_second": 80.041, "eval_steps_per_second": 10.005, "step": 2400 }, { "epoch": 97.0, "eval_loss": 0.2818942368030548, "eval_runtime": 2.4999, "eval_samples_per_second": 80.004, "eval_steps_per_second": 10.0, "step": 2425 }, { "epoch": 98.0, "eval_loss": 0.28425753116607666, "eval_runtime": 2.5005, "eval_samples_per_second": 79.982, "eval_steps_per_second": 9.998, "step": 2450 }, { "epoch": 99.0, "eval_loss": 0.28273478150367737, "eval_runtime": 2.4987, "eval_samples_per_second": 80.042, "eval_steps_per_second": 10.005, "step": 2475 }, { "epoch": 100.0, "grad_norm": 1.4653393030166626, "learning_rate": 0.0, "loss": 0.3157, "step": 2500 } ], "logging_steps": 500, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "total_flos": 1306483752960000.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }