| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9997049277072882, | |
| "eval_steps": 500, | |
| "global_step": 847, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.011802891708468575, | |
| "grad_norm": 0.6382197141647339, | |
| "learning_rate": 5.294117647058824e-06, | |
| "loss": 1.7524, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.02360578341693715, | |
| "grad_norm": 0.5001206994056702, | |
| "learning_rate": 1.1176470588235295e-05, | |
| "loss": 1.3315, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.03540867512540572, | |
| "grad_norm": 0.41650518774986267, | |
| "learning_rate": 1.7058823529411767e-05, | |
| "loss": 1.1148, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0472115668338743, | |
| "grad_norm": 0.42574718594551086, | |
| "learning_rate": 2.235294117647059e-05, | |
| "loss": 1.0196, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.05901445854234287, | |
| "grad_norm": 0.3408316373825073, | |
| "learning_rate": 2.823529411764706e-05, | |
| "loss": 0.94, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.07081735025081144, | |
| "grad_norm": 0.39876773953437805, | |
| "learning_rate": 3.411764705882353e-05, | |
| "loss": 0.8918, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.08262024195928003, | |
| "grad_norm": 0.32425975799560547, | |
| "learning_rate": 4e-05, | |
| "loss": 0.8412, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0944231336677486, | |
| "grad_norm": 0.40873634815216064, | |
| "learning_rate": 4.588235294117647e-05, | |
| "loss": 0.887, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.10622602537621717, | |
| "grad_norm": 0.4909669756889343, | |
| "learning_rate": 4.9998087784700426e-05, | |
| "loss": 0.8888, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.11802891708468574, | |
| "grad_norm": 0.3897865414619446, | |
| "learning_rate": 4.996410098317137e-05, | |
| "loss": 0.8555, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.1298318087931543, | |
| "grad_norm": 0.3305865228176117, | |
| "learning_rate": 4.989723448187131e-05, | |
| "loss": 0.8424, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.14163470050162288, | |
| "grad_norm": 0.3554224669933319, | |
| "learning_rate": 4.9845268462432916e-05, | |
| "loss": 0.8445, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.15343759221009148, | |
| "grad_norm": 0.46097129583358765, | |
| "learning_rate": 4.970969070763177e-05, | |
| "loss": 0.8377, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.16524048391856005, | |
| "grad_norm": 0.3145534098148346, | |
| "learning_rate": 4.953211814536217e-05, | |
| "loss": 0.759, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.17704337562702863, | |
| "grad_norm": 0.42392656207084656, | |
| "learning_rate": 4.931285256513868e-05, | |
| "loss": 0.8121, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.1888462673354972, | |
| "grad_norm": 0.4339812994003296, | |
| "learning_rate": 4.905226661492095e-05, | |
| "loss": 0.7896, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.20064915904396577, | |
| "grad_norm": 0.44723227620124817, | |
| "learning_rate": 4.8750803167788136e-05, | |
| "loss": 0.8057, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.21245205075243434, | |
| "grad_norm": 0.46169158816337585, | |
| "learning_rate": 4.840897456926373e-05, | |
| "loss": 0.7724, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.2242549424609029, | |
| "grad_norm": 0.41829928755760193, | |
| "learning_rate": 4.8027361766570117e-05, | |
| "loss": 0.7458, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.23605783416937148, | |
| "grad_norm": 0.4120149612426758, | |
| "learning_rate": 4.760661332129254e-05, | |
| "loss": 0.7686, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.24786072587784008, | |
| "grad_norm": 0.3918631970882416, | |
| "learning_rate": 4.7147444307130686e-05, | |
| "loss": 0.769, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.2596636175863086, | |
| "grad_norm": 0.4276711642742157, | |
| "learning_rate": 4.665063509461097e-05, | |
| "loss": 0.7574, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.2714665092947772, | |
| "grad_norm": 0.42904192209243774, | |
| "learning_rate": 4.6117030024825114e-05, | |
| "loss": 0.7826, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.28326940100324577, | |
| "grad_norm": 0.5145927667617798, | |
| "learning_rate": 4.554753597444896e-05, | |
| "loss": 0.7954, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.29507229271171437, | |
| "grad_norm": 0.3549771010875702, | |
| "learning_rate": 4.494312081448029e-05, | |
| "loss": 0.7527, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.30687518442018297, | |
| "grad_norm": 0.4441188871860504, | |
| "learning_rate": 4.4304811765315105e-05, | |
| "loss": 0.7321, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.3186780761286515, | |
| "grad_norm": 0.3967060148715973, | |
| "learning_rate": 4.3633693650957976e-05, | |
| "loss": 0.7047, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.3304809678371201, | |
| "grad_norm": 0.44348135590553284, | |
| "learning_rate": 4.293090705533342e-05, | |
| "loss": 0.7431, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.34228385954558865, | |
| "grad_norm": 0.9141893982887268, | |
| "learning_rate": 4.219764638383177e-05, | |
| "loss": 0.7177, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.35408675125405725, | |
| "grad_norm": 0.45525214076042175, | |
| "learning_rate": 4.1435157833383955e-05, | |
| "loss": 0.7128, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.3658896429625258, | |
| "grad_norm": 0.537662148475647, | |
| "learning_rate": 4.06447372745151e-05, | |
| "loss": 0.7162, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.3776925346709944, | |
| "grad_norm": 0.4020293653011322, | |
| "learning_rate": 3.982772804897649e-05, | |
| "loss": 0.7212, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.389495426379463, | |
| "grad_norm": 0.6390876173973083, | |
| "learning_rate": 3.898551868669883e-05, | |
| "loss": 0.716, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.40129831808793154, | |
| "grad_norm": 0.47102075815200806, | |
| "learning_rate": 3.811954054594702e-05, | |
| "loss": 0.733, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.41310120979640014, | |
| "grad_norm": 0.5660268664360046, | |
| "learning_rate": 3.723126538068686e-05, | |
| "loss": 0.764, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.4249041015048687, | |
| "grad_norm": 0.595162570476532, | |
| "learning_rate": 3.632220283929822e-05, | |
| "loss": 0.7302, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.4367069932133373, | |
| "grad_norm": 0.5331649780273438, | |
| "learning_rate": 3.5393897898885606e-05, | |
| "loss": 0.7127, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.4485098849218058, | |
| "grad_norm": 0.4248451590538025, | |
| "learning_rate": 3.444792823954651e-05, | |
| "loss": 0.6933, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.4603127766302744, | |
| "grad_norm": 0.5570621490478516, | |
| "learning_rate": 3.348590156306017e-05, | |
| "loss": 0.7012, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.47211566833874297, | |
| "grad_norm": 0.41210871934890747, | |
| "learning_rate": 3.25094528605536e-05, | |
| "loss": 0.7006, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.48391856004721157, | |
| "grad_norm": 0.5020595788955688, | |
| "learning_rate": 3.152024163378867e-05, | |
| "loss": 0.7159, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.49572145175568016, | |
| "grad_norm": 0.5407310724258423, | |
| "learning_rate": 3.051994907479265e-05, | |
| "loss": 0.7002, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.5075243434641488, | |
| "grad_norm": 0.422695130109787, | |
| "learning_rate": 2.9510275208625522e-05, | |
| "loss": 0.6721, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.5193272351726173, | |
| "grad_norm": 0.4953523576259613, | |
| "learning_rate": 2.849293600414002e-05, | |
| "loss": 0.6612, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.5311301268810859, | |
| "grad_norm": 0.44490641355514526, | |
| "learning_rate": 2.7469660457644857e-05, | |
| "loss": 0.6786, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.5429330185895545, | |
| "grad_norm": 0.3714945912361145, | |
| "learning_rate": 2.644218765442728e-05, | |
| "loss": 0.6731, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.554735910298023, | |
| "grad_norm": 0.44450584053993225, | |
| "learning_rate": 2.541226381312924e-05, | |
| "loss": 0.6876, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.5665388020064915, | |
| "grad_norm": 0.4537455439567566, | |
| "learning_rate": 2.4381639318000126e-05, | |
| "loss": 0.6757, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.5783416937149601, | |
| "grad_norm": 0.4810272753238678, | |
| "learning_rate": 2.3352065744070072e-05, | |
| "loss": 0.7128, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.5901445854234287, | |
| "grad_norm": 0.49226102232933044, | |
| "learning_rate": 2.2325292880299335e-05, | |
| "loss": 0.6928, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.6019474771318973, | |
| "grad_norm": 0.46990668773651123, | |
| "learning_rate": 2.1303065755763277e-05, | |
| "loss": 0.6482, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.6137503688403659, | |
| "grad_norm": 0.43036311864852905, | |
| "learning_rate": 2.0287121673926828e-05, | |
| "loss": 0.6759, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.6255532605488344, | |
| "grad_norm": 0.373436838388443, | |
| "learning_rate": 1.92791872600489e-05, | |
| "loss": 0.674, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.637356152257303, | |
| "grad_norm": 0.4169735312461853, | |
| "learning_rate": 1.8280975526734657e-05, | |
| "loss": 0.6636, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.6491590439657716, | |
| "grad_norm": 0.3966214060783386, | |
| "learning_rate": 1.7294182962622846e-05, | |
| "loss": 0.658, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.6609619356742402, | |
| "grad_norm": 0.45455384254455566, | |
| "learning_rate": 1.632048664915622e-05, | |
| "loss": 0.6563, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.6727648273827088, | |
| "grad_norm": 0.513671875, | |
| "learning_rate": 1.536154141033482e-05, | |
| "loss": 0.6481, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.6845677190911773, | |
| "grad_norm": 0.4144147038459778, | |
| "learning_rate": 1.4418977000296552e-05, | |
| "loss": 0.681, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.6963706107996459, | |
| "grad_norm": 0.4277999997138977, | |
| "learning_rate": 1.3494395333504622e-05, | |
| "loss": 0.655, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.7081735025081145, | |
| "grad_norm": 0.4542660415172577, | |
| "learning_rate": 1.2589367762249347e-05, | |
| "loss": 0.6557, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.7199763942165831, | |
| "grad_norm": 0.518882155418396, | |
| "learning_rate": 1.1705432406091085e-05, | |
| "loss": 0.6504, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.7317792859250516, | |
| "grad_norm": 0.3764165937900543, | |
| "learning_rate": 1.0844091537783316e-05, | |
| "loss": 0.6509, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.7435821776335202, | |
| "grad_norm": 0.40605178475379944, | |
| "learning_rate": 1.0006809030118181e-05, | |
| "loss": 0.6619, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.7553850693419888, | |
| "grad_norm": 0.42034676671028137, | |
| "learning_rate": 9.195007868033933e-06, | |
| "loss": 0.6083, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.7671879610504574, | |
| "grad_norm": 0.4199008345603943, | |
| "learning_rate": 8.410067730212439e-06, | |
| "loss": 0.6464, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.778990852758926, | |
| "grad_norm": 0.4271228611469269, | |
| "learning_rate": 7.653322644276779e-06, | |
| "loss": 0.6342, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.7907937444673945, | |
| "grad_norm": 0.49036702513694763, | |
| "learning_rate": 6.926058719574207e-06, | |
| "loss": 0.6492, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.8025966361758631, | |
| "grad_norm": 0.4103890061378479, | |
| "learning_rate": 6.229511961397455e-06, | |
| "loss": 0.6294, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.8143995278843317, | |
| "grad_norm": 0.38033077120780945, | |
| "learning_rate": 5.564866170359351e-06, | |
| "loss": 0.638, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.8262024195928003, | |
| "grad_norm": 0.3652307987213135, | |
| "learning_rate": 4.933250930490715e-06, | |
| "loss": 0.6096, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.8380053113012688, | |
| "grad_norm": 0.5351826548576355, | |
| "learning_rate": 4.335739689480778e-06, | |
| "loss": 0.6285, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.8498082030097374, | |
| "grad_norm": 0.427626371383667, | |
| "learning_rate": 3.773347934323035e-06, | |
| "loss": 0.6257, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.861611094718206, | |
| "grad_norm": 0.46427205204963684, | |
| "learning_rate": 3.2470314654667487e-06, | |
| "loss": 0.6142, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.8734139864266746, | |
| "grad_norm": 0.5393053293228149, | |
| "learning_rate": 2.7576847724075123e-06, | |
| "loss": 0.6485, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.8852168781351432, | |
| "grad_norm": 0.4637604057788849, | |
| "learning_rate": 2.3061395134774038e-06, | |
| "loss": 0.6407, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.8970197698436116, | |
| "grad_norm": 0.40724095702171326, | |
| "learning_rate": 1.8931631024185327e-06, | |
| "loss": 0.6535, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.9088226615520802, | |
| "grad_norm": 0.4840000569820404, | |
| "learning_rate": 1.5194574041419802e-06, | |
| "loss": 0.642, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.9206255532605488, | |
| "grad_norm": 0.41105934977531433, | |
| "learning_rate": 1.185657541888857e-06, | |
| "loss": 0.617, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.9324284449690174, | |
| "grad_norm": 0.557059645652771, | |
| "learning_rate": 8.923308178206552e-07, | |
| "loss": 0.6415, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.9442313366774859, | |
| "grad_norm": 0.38617223501205444, | |
| "learning_rate": 6.39975748873431e-07, | |
| "loss": 0.6388, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.9560342283859545, | |
| "grad_norm": 0.4779140055179596, | |
| "learning_rate": 4.2902121951440834e-07, | |
| "loss": 0.6366, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.9678371200944231, | |
| "grad_norm": 0.4569835662841797, | |
| "learning_rate": 2.5982575284084486e-07, | |
| "loss": 0.6735, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.9796400118028917, | |
| "grad_norm": 0.4118465185165405, | |
| "learning_rate": 1.3267690126008425e-07, | |
| "loss": 0.6238, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.9914429035113603, | |
| "grad_norm": 0.4550204873085022, | |
| "learning_rate": 4.779075778620079e-08, | |
| "loss": 0.6613, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.9997049277072882, | |
| "step": 847, | |
| "total_flos": 5.491458012295987e+18, | |
| "train_loss": 0.7367874357185229, | |
| "train_runtime": 38132.292, | |
| "train_samples_per_second": 0.711, | |
| "train_steps_per_second": 0.022 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 847, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.491458012295987e+18, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |