-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfinetuning_mned_mbert_mid_layers.txt
512 lines (509 loc) · 160 KB
/
finetuning_mned_mbert_mid_layers.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
[2023-03-25 14:52:21,299][__main__][INFO] -
alg: mend
lr: 1.0e-06
edit_lr: 0.0001
seed: 0
debug: false
model_save_pt: 5000
edit_bs: 1
silent: false
max_iters: 1000000
log_interval: 100
val_interval: 5000
lr_lr: 0.0001
batch_size: 2
val_batch_size: 5
accumulate_bs: 10
cedit: 0.1
cloc: 1.0
cbase: 1.0
val_steps: 500
device: cuda
base_loss: distill
oracle: false
train: true
train_base: false
opt: Adam
single_batch: false
archive: null
grad_clip: 100.0
ref: null
early_stop_patience: 20000
early_stop_key: loss/total_edit_val
dropout: 0.0
tokenizer: null
results_dir: null
no_grad_layers: null
eval_only: false
half: false
save: false
model:
pt: null
name: bert-base-multilingual-uncased
class_name: BertForSequenceClassification
tokenizer_class: BertTokenizer
tokenizer_name: bert-base-multilingual-uncased
inner_params:
- bert.encoder.layer.7.intermediate.dense.weight
- bert.encoder.layer.7.output.dense.weight
- bert.encoder.layer.8.intermediate.dense.weight
- bert.encoder.layer.8.output.dense.weight
- bert.encoder.layer.9.intermediate.dense.weight
- bert.encoder.layer.9.output.dense.weight
data:
path: null
rephrase: true
zsre_nq: true
nq_path: ${hydra:runtime.cwd}/data/nq
wiki_webtext: true
n_edits: 1
eval:
verbose: true
log_interval: 100
final_eval: true
mend:
one_sided: false
n_hidden: 1
hidden_dim: null
init: id
norm: true
combine: true
x_only: false
delta_only: false
act: relu
rank: 1920
mlp_class: IDMLP
shared: true
task: fc
dataset: fever
train_set: fever/fever_train_1200 - spanish_1200.jsonl
val_set: fever/fever_dev_1200 - spanish_1200.jsonl
tests: false
[2023-03-25 14:52:21,299][__main__][INFO] - Project base directory: /home/anonymous-xme/mend/mend
[2023-03-25 14:52:21,339][models][INFO] - Loading model class <class 'transformers.models.bert.modeling_bert.BertForSequenceClassification'> with name bert-base-multilingual-uncased from cache dir /home/anonymous-xme/mend/mend/cache/
[2023-03-25 14:52:23,969][models][INFO] - Set 38 dropout modules to p=0.0
Data Size: 104422
Data Size: 10364
[2023-03-25 14:52:27,246][__main__][INFO] - Loading class MEND from module <module 'algs.mend' from '/home/anonymous-xme/mend/mend/algs/mend.py'>
[2023-03-25 14:52:27,246][algs.mend][INFO] - Hooked 6 modules
========== 768 3072
========== 3
[2023-03-25 14:52:27,248][algs.mend][INFO] - Building Gradient Transform with MLP class <class 'nn.IDMLP'>
[2023-03-25 14:52:27,248][nn][INFO] - Building IDMLP (id) [3840, 3840, 3840]
========== 3072 768
========== 3
[2023-03-25 14:52:27,350][algs.mend][INFO] - Building Gradient Transform with MLP class <class 'nn.IDMLP'>
[2023-03-25 14:52:27,350][nn][INFO] - Building IDMLP (id) [3840, 3840, 3840]
[2023-03-25 14:52:31,439][trainer][INFO] - Building optimizer <class 'torch.optim.adam.Adam'> with lr 1e-06
[2023-03-25 14:52:31,441][trainer][INFO] - Writing wandb run "fever - mend - bert-base-multilingual-uncased - 2023-03-25_14-52-21_2648014163" to /tmp/tmpe9jp1xn4
[2023-03-25 14:52:34,497][trainer][INFO] - Step 0:
[2023-03-25 14:52:34,498][trainer][INFO] - loss/edit_train: 1.57424; loss/loc_train: 0.08695; edit/acc_train: 0.00000; edit/log_prob_train: -1.57424; edit/prob_train: 0.20717; acc/pre_train: 0.00000; acc/post_train: 1.00000; nll/pre_train: 0.73266; perplexity/pre_train: 2.08060; nll/post_train: 0.24563; perplexity/post_train: 1.27843; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.12876; loss/total_train: 0.24438; loss/total_edit_train: 0.24438; memory/alloc_max_train: 2788374016.00000; memory/res_max_train: 3135242240.00000
[2023-03-25 14:53:34,949][trainer][INFO] - Step 0:
[2023-03-25 14:53:34,950][trainer][INFO] - loss/edit_val: 1.31823; loss/loc_val: 0.04613; edit/acc_val: 0.03400; edit/log_prob_val: -1.31823; edit/prob_val: 0.28041; acc/pre_val: 0.47600; acc/post_val: 0.50250; nll/pre_val: 0.69428; perplexity/pre_val: 2.00227; nll/post_val: 0.80314; perplexity/post_val: 2.23254; n_tokens/pre_val: 4.00000; n_tokens/post_val: 4.00000; time/edit_val: 0.08293; loss/total_val: 0.17795; loss/total_edit_val: 0.17795; memory/alloc_max_val: 2942155911.16800; memory/res_max_val: 3471038218.24000; eval_time/elapsed: 60.42646; eval_time/average: 0.12085
[2023-03-25 14:53:34,953][trainer][INFO] - Saving model to /home/anonymous-xme/mend/mend/outputs/2023-03-25_14-52-21_2648014163/models/bert-base-multilingual-uncased.2023-03-25_14-52-21_2648014163
[2023-03-25 14:53:35,319][trainer][INFO] - Write complete.
[2023-03-25 14:53:51,192][trainer][INFO] - Step 100:
[2023-03-25 14:53:51,193][trainer][INFO] - loss/edit_train: 1.24085; loss/loc_train: 0.02281; edit/acc_train: 0.02000; edit/log_prob_train: -1.24085; edit/prob_train: 0.29881; acc/pre_train: 0.45000; acc/post_train: 0.51000; nll/pre_train: 0.70310; perplexity/pre_train: 2.02001; nll/post_train: 0.72347; perplexity/post_train: 2.06158; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08683; loss/total_train: 0.14690; loss/total_edit_train: 0.14690; memory/alloc_max_train: 3299327319.04000; memory/res_max_train: 3777641840.64000; grad_train: 121.00130; lr/lr0_train: 0.00009; lr/lr1_train: 0.00009; lr/lr2_train: 0.00009; lr/lr3_train: 0.00009; lr/lr4_train: 0.00009; lr/lr5_train: 0.00009
[2023-03-25 14:54:07,064][trainer][INFO] - Step 200:
[2023-03-25 14:54:07,064][trainer][INFO] - loss/edit_train: 1.13571; loss/loc_train: 0.01015; edit/acc_train: 0.03000; edit/log_prob_train: -1.13571; edit/prob_train: 0.33108; acc/pre_train: 0.47000; acc/post_train: 0.48000; nll/pre_train: 0.68809; perplexity/pre_train: 1.98992; nll/post_train: 0.71120; perplexity/post_train: 2.03643; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08739; loss/total_train: 0.12372; loss/total_edit_train: 0.12372; memory/alloc_max_train: 3339538119.68000; memory/res_max_train: 3810525184.00000; grad_train: 70.15205; lr/lr0_train: 0.00009; lr/lr1_train: 0.00009; lr/lr2_train: 0.00009; lr/lr3_train: 0.00009; lr/lr4_train: 0.00009; lr/lr5_train: 0.00009
[2023-03-25 14:54:23,029][trainer][INFO] - Step 300:
[2023-03-25 14:54:23,030][trainer][INFO] - loss/edit_train: 1.01296; loss/loc_train: 0.00551; edit/acc_train: 0.07000; edit/log_prob_train: -1.01296; edit/prob_train: 0.37220; acc/pre_train: 0.43000; acc/post_train: 0.41000; nll/pre_train: 0.70086; perplexity/pre_train: 2.01549; nll/post_train: 0.72581; perplexity/post_train: 2.06640; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09024; loss/total_train: 0.10680; loss/total_edit_train: 0.10680; memory/alloc_max_train: 3339848704.00000; memory/res_max_train: 3810525184.00000; grad_train: 100.01915; lr/lr0_train: 0.00009; lr/lr1_train: 0.00009; lr/lr2_train: 0.00008; lr/lr3_train: 0.00008; lr/lr4_train: 0.00008; lr/lr5_train: 0.00008
[2023-03-25 14:54:39,461][trainer][INFO] - Step 400:
[2023-03-25 14:54:39,462][trainer][INFO] - loss/edit_train: 0.90485; loss/loc_train: 0.00325; edit/acc_train: 0.09000; edit/log_prob_train: -0.90485; edit/prob_train: 0.41239; acc/pre_train: 0.35000; acc/post_train: 0.40000; nll/pre_train: 0.70402; perplexity/pre_train: 2.02186; nll/post_train: 0.71467; perplexity/post_train: 2.04351; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08906; loss/total_train: 0.09373; loss/total_edit_train: 0.09373; memory/alloc_max_train: 3342222801.92000; memory/res_max_train: 3814677544.96000; grad_train: 116.74245; lr/lr0_train: 0.00009; lr/lr1_train: 0.00009; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00007; lr/lr5_train: 0.00008
[2023-03-25 14:54:55,709][trainer][INFO] - Step 500:
[2023-03-25 14:54:55,710][trainer][INFO] - loss/edit_train: 0.76108; loss/loc_train: 0.00280; edit/acc_train: 0.40000; edit/log_prob_train: -0.76108; edit/prob_train: 0.47595; acc/pre_train: 0.34000; acc/post_train: 0.43000; nll/pre_train: 0.70766; perplexity/pre_train: 2.02924; nll/post_train: 0.72476; perplexity/post_train: 2.06423; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08741; loss/total_train: 0.07891; loss/total_edit_train: 0.07891; memory/alloc_max_train: 3349710848.00000; memory/res_max_train: 3827302400.00000; grad_train: 104.66227; lr/lr0_train: 0.00010; lr/lr1_train: 0.00010; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00007; lr/lr5_train: 0.00008
[2023-03-25 14:55:11,717][trainer][INFO] - Step 600:
[2023-03-25 14:55:11,717][trainer][INFO] - loss/edit_train: 0.64189; loss/loc_train: 0.00578; edit/acc_train: 0.62000; edit/log_prob_train: -0.64189; edit/prob_train: 0.53773; acc/pre_train: 0.44000; acc/post_train: 0.50000; nll/pre_train: 0.69399; perplexity/pre_train: 2.00168; nll/post_train: 0.71089; perplexity/post_train: 2.03581; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08928; loss/total_train: 0.06997; loss/total_edit_train: 0.06997; memory/alloc_max_train: 3349710848.00000; memory/res_max_train: 3827302400.00000; grad_train: 198.17659; lr/lr0_train: 0.00010; lr/lr1_train: 0.00011; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00006; lr/lr5_train: 0.00009
[2023-03-25 14:55:27,729][trainer][INFO] - Step 700:
[2023-03-25 14:55:27,730][trainer][INFO] - loss/edit_train: 0.54347; loss/loc_train: 0.00997; edit/acc_train: 0.81000; edit/log_prob_train: -0.54347; edit/prob_train: 0.58930; acc/pre_train: 0.46000; acc/post_train: 0.56000; nll/pre_train: 0.69682; perplexity/pre_train: 2.00735; nll/post_train: 0.67896; perplexity/post_train: 1.97182; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09046; loss/total_train: 0.06432; loss/total_edit_train: 0.06432; memory/alloc_max_train: 3349710848.00000; memory/res_max_train: 3827302400.00000; grad_train: 53.98927; lr/lr0_train: 0.00011; lr/lr1_train: 0.00012; lr/lr2_train: 0.00005; lr/lr3_train: 0.00009; lr/lr4_train: 0.00006; lr/lr5_train: 0.00009
[2023-03-25 14:55:43,716][trainer][INFO] - Step 800:
[2023-03-25 14:55:43,716][trainer][INFO] - loss/edit_train: 0.52067; loss/loc_train: 0.00716; edit/acc_train: 0.88000; edit/log_prob_train: -0.52067; edit/prob_train: 0.60013; acc/pre_train: 0.39000; acc/post_train: 0.59000; nll/pre_train: 0.71805; perplexity/pre_train: 2.05044; nll/post_train: 0.65373; perplexity/post_train: 1.92269; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08685; loss/total_train: 0.05923; loss/total_edit_train: 0.05923; memory/alloc_max_train: 3349710848.00000; memory/res_max_train: 3827302400.00000; grad_train: 53.81109; lr/lr0_train: 0.00011; lr/lr1_train: 0.00012; lr/lr2_train: 0.00005; lr/lr3_train: 0.00009; lr/lr4_train: 0.00006; lr/lr5_train: 0.00009
[2023-03-25 14:55:59,689][trainer][INFO] - Step 900:
[2023-03-25 14:55:59,690][trainer][INFO] - loss/edit_train: 0.47896; loss/loc_train: 0.00889; edit/acc_train: 0.91000; edit/log_prob_train: -0.47896; edit/prob_train: 0.62586; acc/pre_train: 0.36000; acc/post_train: 0.46000; nll/pre_train: 0.72234; perplexity/pre_train: 2.05925; nll/post_train: 0.72446; perplexity/post_train: 2.06361; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08673; loss/total_train: 0.05678; loss/total_edit_train: 0.05678; memory/alloc_max_train: 3349710848.00000; memory/res_max_train: 3827302400.00000; grad_train: 54.05768; lr/lr0_train: 0.00012; lr/lr1_train: 0.00012; lr/lr2_train: 0.00005; lr/lr3_train: 0.00010; lr/lr4_train: 0.00006; lr/lr5_train: 0.00009
[2023-03-25 14:56:15,479][trainer][INFO] - Step 1000:
[2023-03-25 14:56:15,479][trainer][INFO] - loss/edit_train: 0.42981; loss/loc_train: 0.01025; edit/acc_train: 0.98000; edit/log_prob_train: -0.42981; edit/prob_train: 0.65573; acc/pre_train: 0.51000; acc/post_train: 0.50000; nll/pre_train: 0.68991; perplexity/pre_train: 1.99353; nll/post_train: 0.72456; perplexity/post_train: 2.06382; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08978; loss/total_train: 0.05323; loss/total_edit_train: 0.05323; memory/alloc_max_train: 3349710848.00000; memory/res_max_train: 3827302400.00000; grad_train: 79.10730; lr/lr0_train: 0.00012; lr/lr1_train: 0.00012; lr/lr2_train: 0.00004; lr/lr3_train: 0.00010; lr/lr4_train: 0.00006; lr/lr5_train: 0.00009
[2023-03-25 14:56:31,441][trainer][INFO] - Step 1100:
[2023-03-25 14:56:31,441][trainer][INFO] - loss/edit_train: 0.44805; loss/loc_train: 0.00917; edit/acc_train: 0.96000; edit/log_prob_train: -0.44805; edit/prob_train: 0.64328; acc/pre_train: 0.51000; acc/post_train: 0.44000; nll/pre_train: 0.69201; perplexity/pre_train: 1.99774; nll/post_train: 0.76416; perplexity/post_train: 2.14719; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08753; loss/total_train: 0.05398; loss/total_edit_train: 0.05398; memory/alloc_max_train: 3349710848.00000; memory/res_max_train: 3827302400.00000; grad_train: 41.78271; lr/lr0_train: 0.00013; lr/lr1_train: 0.00013; lr/lr2_train: 0.00004; lr/lr3_train: 0.00010; lr/lr4_train: 0.00006; lr/lr5_train: 0.00009
[2023-03-25 14:56:47,075][trainer][INFO] - Step 1200:
[2023-03-25 14:56:47,075][trainer][INFO] - loss/edit_train: 0.37413; loss/loc_train: 0.01162; edit/acc_train: 0.99000; edit/log_prob_train: -0.37413; edit/prob_train: 0.69243; acc/pre_train: 0.44000; acc/post_train: 0.55000; nll/pre_train: 0.68903; perplexity/pre_train: 1.99178; nll/post_train: 0.72660; perplexity/post_train: 2.06804; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08948; loss/total_train: 0.04903; loss/total_edit_train: 0.04903; memory/alloc_max_train: 3349710848.00000; memory/res_max_train: 3827302400.00000; grad_train: 103.50338; lr/lr0_train: 0.00013; lr/lr1_train: 0.00013; lr/lr2_train: 0.00004; lr/lr3_train: 0.00009; lr/lr4_train: 0.00006; lr/lr5_train: 0.00008
[2023-03-25 14:57:03,016][trainer][INFO] - Step 1300:
[2023-03-25 14:57:03,017][trainer][INFO] - loss/edit_train: 0.41145; loss/loc_train: 0.00803; edit/acc_train: 0.96000; edit/log_prob_train: -0.41145; edit/prob_train: 0.66926; acc/pre_train: 0.38000; acc/post_train: 0.43000; nll/pre_train: 0.70755; perplexity/pre_train: 2.02902; nll/post_train: 0.73364; perplexity/post_train: 2.08264; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08635; loss/total_train: 0.04917; loss/total_edit_train: 0.04917; memory/alloc_max_train: 3349710848.00000; memory/res_max_train: 3827302400.00000; grad_train: 74.68546; lr/lr0_train: 0.00013; lr/lr1_train: 0.00013; lr/lr2_train: 0.00005; lr/lr3_train: 0.00009; lr/lr4_train: 0.00006; lr/lr5_train: 0.00008
[2023-03-25 14:57:18,716][trainer][INFO] - Step 1400:
[2023-03-25 14:57:18,717][trainer][INFO] - loss/edit_train: 0.35852; loss/loc_train: 0.01000; edit/acc_train: 0.97000; edit/log_prob_train: -0.35852; edit/prob_train: 0.70302; acc/pre_train: 0.43000; acc/post_train: 0.49000; nll/pre_train: 0.70381; perplexity/pre_train: 2.02144; nll/post_train: 0.73641; perplexity/post_train: 2.08842; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08432; loss/total_train: 0.04585; loss/total_edit_train: 0.04585; memory/alloc_max_train: 3349710848.00000; memory/res_max_train: 3827302400.00000; grad_train: 79.15296; lr/lr0_train: 0.00013; lr/lr1_train: 0.00013; lr/lr2_train: 0.00004; lr/lr3_train: 0.00010; lr/lr4_train: 0.00006; lr/lr5_train: 0.00008
[2023-03-25 14:57:34,666][trainer][INFO] - Step 1500:
[2023-03-25 14:57:34,667][trainer][INFO] - loss/edit_train: 0.36239; loss/loc_train: 0.00792; edit/acc_train: 0.97000; edit/log_prob_train: -0.36239; edit/prob_train: 0.70217; acc/pre_train: 0.48000; acc/post_train: 0.38000; nll/pre_train: 0.69275; perplexity/pre_train: 1.99920; nll/post_train: 0.76830; perplexity/post_train: 2.15609; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08795; loss/total_train: 0.04416; loss/total_edit_train: 0.04416; memory/alloc_max_train: 3349710848.00000; memory/res_max_train: 3827302400.00000; grad_train: 60.18043; lr/lr0_train: 0.00014; lr/lr1_train: 0.00013; lr/lr2_train: 0.00004; lr/lr3_train: 0.00010; lr/lr4_train: 0.00006; lr/lr5_train: 0.00008
[2023-03-25 14:57:50,643][trainer][INFO] - Step 1600:
[2023-03-25 14:57:50,644][trainer][INFO] - loss/edit_train: 0.35232; loss/loc_train: 0.00797; edit/acc_train: 0.98000; edit/log_prob_train: -0.35232; edit/prob_train: 0.70841; acc/pre_train: 0.42000; acc/post_train: 0.47000; nll/pre_train: 0.71515; perplexity/pre_train: 2.04449; nll/post_train: 0.72715; perplexity/post_train: 2.06918; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08751; loss/total_train: 0.04320; loss/total_edit_train: 0.04320; memory/alloc_max_train: 3349710848.00000; memory/res_max_train: 3827302400.00000; grad_train: 81.94317; lr/lr0_train: 0.00014; lr/lr1_train: 0.00014; lr/lr2_train: 0.00004; lr/lr3_train: 0.00010; lr/lr4_train: 0.00006; lr/lr5_train: 0.00008
[2023-03-25 14:58:07,137][trainer][INFO] - Step 1700:
[2023-03-25 14:58:07,137][trainer][INFO] - loss/edit_train: 0.33723; loss/loc_train: 0.00769; edit/acc_train: 0.98000; edit/log_prob_train: -0.33723; edit/prob_train: 0.71850; acc/pre_train: 0.42000; acc/post_train: 0.41000; nll/pre_train: 0.70573; perplexity/pre_train: 2.02533; nll/post_train: 0.76037; perplexity/post_train: 2.13907; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08884; loss/total_train: 0.04142; loss/total_edit_train: 0.04142; memory/alloc_max_train: 3349710848.00000; memory/res_max_train: 3827302400.00000; grad_train: 51.64470; lr/lr0_train: 0.00014; lr/lr1_train: 0.00014; lr/lr2_train: 0.00005; lr/lr3_train: 0.00010; lr/lr4_train: 0.00006; lr/lr5_train: 0.00008
[2023-03-25 14:58:22,953][trainer][INFO] - Step 1800:
[2023-03-25 14:58:22,953][trainer][INFO] - loss/edit_train: 0.34393; loss/loc_train: 0.00721; edit/acc_train: 0.97000; edit/log_prob_train: -0.34393; edit/prob_train: 0.71515; acc/pre_train: 0.43000; acc/post_train: 0.36000; nll/pre_train: 0.69337; perplexity/pre_train: 2.00045; nll/post_train: 0.75224; perplexity/post_train: 2.12174; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08909; loss/total_train: 0.04161; loss/total_edit_train: 0.04161; memory/alloc_max_train: 3349710848.00000; memory/res_max_train: 3827302400.00000; grad_train: 68.90976; lr/lr0_train: 0.00015; lr/lr1_train: 0.00014; lr/lr2_train: 0.00005; lr/lr3_train: 0.00010; lr/lr4_train: 0.00006; lr/lr5_train: 0.00008
[2023-03-25 14:58:38,722][trainer][INFO] - Step 1900:
[2023-03-25 14:58:38,723][trainer][INFO] - loss/edit_train: 0.29755; loss/loc_train: 0.00872; edit/acc_train: 0.98000; edit/log_prob_train: -0.29755; edit/prob_train: 0.74770; acc/pre_train: 0.46000; acc/post_train: 0.55000; nll/pre_train: 0.69180; perplexity/pre_train: 1.99731; nll/post_train: 0.67916; perplexity/post_train: 1.97223; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08723; loss/total_train: 0.03848; loss/total_edit_train: 0.03848; memory/alloc_max_train: 3349710848.00000; memory/res_max_train: 3827302400.00000; grad_train: 190.57541; lr/lr0_train: 0.00015; lr/lr1_train: 0.00015; lr/lr2_train: 0.00004; lr/lr3_train: 0.00010; lr/lr4_train: 0.00006; lr/lr5_train: 0.00008
[2023-03-25 14:58:55,083][trainer][INFO] - Step 2000:
[2023-03-25 14:58:55,083][trainer][INFO] - loss/edit_train: 0.29726; loss/loc_train: 0.00849; edit/acc_train: 0.99000; edit/log_prob_train: -0.29726; edit/prob_train: 0.74701; acc/pre_train: 0.39000; acc/post_train: 0.42000; nll/pre_train: 0.69350; perplexity/pre_train: 2.00070; nll/post_train: 0.76394; perplexity/post_train: 2.14671; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09033; loss/total_train: 0.03821; loss/total_edit_train: 0.03821; memory/alloc_max_train: 3349710848.00000; memory/res_max_train: 3827302400.00000; grad_train: 73.38758; lr/lr0_train: 0.00015; lr/lr1_train: 0.00015; lr/lr2_train: 0.00004; lr/lr3_train: 0.00010; lr/lr4_train: 0.00006; lr/lr5_train: 0.00008
[2023-03-25 14:59:10,817][trainer][INFO] - Step 2100:
[2023-03-25 14:59:10,818][trainer][INFO] - loss/edit_train: 0.32102; loss/loc_train: 0.00890; edit/acc_train: 0.99000; edit/log_prob_train: -0.32102; edit/prob_train: 0.73134; acc/pre_train: 0.47000; acc/post_train: 0.45000; nll/pre_train: 0.67966; perplexity/pre_train: 1.97321; nll/post_train: 0.70563; perplexity/post_train: 2.02512; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08548; loss/total_train: 0.04100; loss/total_edit_train: 0.04100; memory/alloc_max_train: 3349711452.16000; memory/res_max_train: 3827302400.00000; grad_train: 107.96176; lr/lr0_train: 0.00015; lr/lr1_train: 0.00015; lr/lr2_train: 0.00004; lr/lr3_train: 0.00010; lr/lr4_train: 0.00007; lr/lr5_train: 0.00008
[2023-03-25 14:59:26,317][trainer][INFO] - Step 2200:
[2023-03-25 14:59:26,317][trainer][INFO] - loss/edit_train: 0.25856; loss/loc_train: 0.00578; edit/acc_train: 1.00000; edit/log_prob_train: -0.25856; edit/prob_train: 0.77547; acc/pre_train: 0.50000; acc/post_train: 0.46000; nll/pre_train: 0.68933; perplexity/pre_train: 1.99237; nll/post_train: 0.70989; perplexity/post_train: 2.03376; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08884; loss/total_train: 0.03164; loss/total_edit_train: 0.03164; memory/alloc_max_train: 3349717504.00000; memory/res_max_train: 3827302400.00000; grad_train: 61.11891; lr/lr0_train: 0.00016; lr/lr1_train: 0.00016; lr/lr2_train: 0.00004; lr/lr3_train: 0.00010; lr/lr4_train: 0.00007; lr/lr5_train: 0.00008
[2023-03-25 14:59:42,057][trainer][INFO] - Step 2300:
[2023-03-25 14:59:42,058][trainer][INFO] - loss/edit_train: 0.24710; loss/loc_train: 0.00618; edit/acc_train: 0.99000; edit/log_prob_train: -0.24710; edit/prob_train: 0.78492; acc/pre_train: 0.56000; acc/post_train: 0.50000; nll/pre_train: 0.68177; perplexity/pre_train: 1.97738; nll/post_train: 0.72116; perplexity/post_train: 2.05682; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08706; loss/total_train: 0.03089; loss/total_edit_train: 0.03089; memory/alloc_max_train: 3349717504.00000; memory/res_max_train: 3827302400.00000; grad_train: 127.70962; lr/lr0_train: 0.00016; lr/lr1_train: 0.00016; lr/lr2_train: 0.00004; lr/lr3_train: 0.00010; lr/lr4_train: 0.00007; lr/lr5_train: 0.00008
[2023-03-25 14:59:58,186][trainer][INFO] - Step 2400:
[2023-03-25 14:59:58,186][trainer][INFO] - loss/edit_train: 0.24230; loss/loc_train: 0.00540; edit/acc_train: 1.00000; edit/log_prob_train: -0.24230; edit/prob_train: 0.78792; acc/pre_train: 0.45000; acc/post_train: 0.46000; nll/pre_train: 0.70394; perplexity/pre_train: 2.02169; nll/post_train: 0.73563; perplexity/post_train: 2.08680; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08729; loss/total_train: 0.02963; loss/total_edit_train: 0.02963; memory/alloc_max_train: 3349717504.00000; memory/res_max_train: 3827302400.00000; grad_train: 83.37956; lr/lr0_train: 0.00016; lr/lr1_train: 0.00017; lr/lr2_train: 0.00004; lr/lr3_train: 0.00010; lr/lr4_train: 0.00007; lr/lr5_train: 0.00008
[2023-03-25 15:00:14,474][trainer][INFO] - Step 2500:
[2023-03-25 15:00:14,474][trainer][INFO] - loss/edit_train: 0.23413; loss/loc_train: 0.00671; edit/acc_train: 1.00000; edit/log_prob_train: -0.23413; edit/prob_train: 0.79483; acc/pre_train: 0.50000; acc/post_train: 0.40000; nll/pre_train: 0.69956; perplexity/pre_train: 2.01286; nll/post_train: 0.74584; perplexity/post_train: 2.10820; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09117; loss/total_train: 0.03012; loss/total_edit_train: 0.03012; memory/alloc_max_train: 3349717504.00000; memory/res_max_train: 3827302400.00000; grad_train: 86.51044; lr/lr0_train: 0.00017; lr/lr1_train: 0.00017; lr/lr2_train: 0.00004; lr/lr3_train: 0.00010; lr/lr4_train: 0.00007; lr/lr5_train: 0.00008
[2023-03-25 15:00:30,577][trainer][INFO] - Step 2600:
[2023-03-25 15:00:30,577][trainer][INFO] - loss/edit_train: 0.22603; loss/loc_train: 0.00485; edit/acc_train: 1.00000; edit/log_prob_train: -0.22603; edit/prob_train: 0.80003; acc/pre_train: 0.37000; acc/post_train: 0.40000; nll/pre_train: 0.70167; perplexity/pre_train: 2.01712; nll/post_train: 0.70650; perplexity/post_train: 2.02688; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08980; loss/total_train: 0.02746; loss/total_edit_train: 0.02746; memory/alloc_max_train: 3349717504.00000; memory/res_max_train: 3827302400.00000; grad_train: 78.62318; lr/lr0_train: 0.00017; lr/lr1_train: 0.00017; lr/lr2_train: 0.00004; lr/lr3_train: 0.00010; lr/lr4_train: 0.00008; lr/lr5_train: 0.00008
[2023-03-25 15:00:46,423][trainer][INFO] - Step 2700:
[2023-03-25 15:00:46,424][trainer][INFO] - loss/edit_train: 0.21965; loss/loc_train: 0.00475; edit/acc_train: 0.98000; edit/log_prob_train: -0.21965; edit/prob_train: 0.80666; acc/pre_train: 0.40000; acc/post_train: 0.39000; nll/pre_train: 0.71045; perplexity/pre_train: 2.03491; nll/post_train: 0.73541; perplexity/post_train: 2.08635; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08484; loss/total_train: 0.02672; loss/total_edit_train: 0.02672; memory/alloc_max_train: 3349717504.00000; memory/res_max_train: 3827302400.00000; grad_train: 62.73169; lr/lr0_train: 0.00017; lr/lr1_train: 0.00018; lr/lr2_train: 0.00005; lr/lr3_train: 0.00010; lr/lr4_train: 0.00008; lr/lr5_train: 0.00008
[2023-03-25 15:01:02,249][trainer][INFO] - Step 2800:
[2023-03-25 15:01:02,249][trainer][INFO] - loss/edit_train: 0.20888; loss/loc_train: 0.00419; edit/acc_train: 0.99000; edit/log_prob_train: -0.20888; edit/prob_train: 0.81532; acc/pre_train: 0.43000; acc/post_train: 0.54000; nll/pre_train: 0.69234; perplexity/pre_train: 1.99838; nll/post_train: 0.71319; perplexity/post_train: 2.04050; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08717; loss/total_train: 0.02508; loss/total_edit_train: 0.02508; memory/alloc_max_train: 3349717504.00000; memory/res_max_train: 3827302400.00000; grad_train: 52.69543; lr/lr0_train: 0.00018; lr/lr1_train: 0.00018; lr/lr2_train: 0.00005; lr/lr3_train: 0.00010; lr/lr4_train: 0.00007; lr/lr5_train: 0.00008
[2023-03-25 15:01:17,630][trainer][INFO] - Step 2900:
[2023-03-25 15:01:17,631][trainer][INFO] - loss/edit_train: 0.19041; loss/loc_train: 0.00611; edit/acc_train: 1.00000; edit/log_prob_train: -0.19041; edit/prob_train: 0.82911; acc/pre_train: 0.53000; acc/post_train: 0.43000; nll/pre_train: 0.68622; perplexity/pre_train: 1.98619; nll/post_train: 0.73859; perplexity/post_train: 2.09297; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08459; loss/total_train: 0.02515; loss/total_edit_train: 0.02515; memory/alloc_max_train: 3349717504.00000; memory/res_max_train: 3827302400.00000; grad_train: 76.03908; lr/lr0_train: 0.00018; lr/lr1_train: 0.00019; lr/lr2_train: 0.00005; lr/lr3_train: 0.00010; lr/lr4_train: 0.00007; lr/lr5_train: 0.00008
[2023-03-25 15:01:33,258][trainer][INFO] - Step 3000:
[2023-03-25 15:01:33,259][trainer][INFO] - loss/edit_train: 0.19486; loss/loc_train: 0.00367; edit/acc_train: 1.00000; edit/log_prob_train: -0.19486; edit/prob_train: 0.82537; acc/pre_train: 0.49000; acc/post_train: 0.54000; nll/pre_train: 0.68138; perplexity/pre_train: 1.97660; nll/post_train: 0.69347; perplexity/post_train: 2.00064; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08594; loss/total_train: 0.02316; loss/total_edit_train: 0.02316; memory/alloc_max_train: 3349717504.00000; memory/res_max_train: 3827302400.00000; grad_train: 76.16900; lr/lr0_train: 0.00018; lr/lr1_train: 0.00019; lr/lr2_train: 0.00005; lr/lr3_train: 0.00010; lr/lr4_train: 0.00008; lr/lr5_train: 0.00008
[2023-03-25 15:01:48,895][trainer][INFO] - Step 3100:
[2023-03-25 15:01:48,896][trainer][INFO] - loss/edit_train: 0.18401; loss/loc_train: 0.00702; edit/acc_train: 1.00000; edit/log_prob_train: -0.18401; edit/prob_train: 0.83396; acc/pre_train: 0.54000; acc/post_train: 0.52000; nll/pre_train: 0.68121; perplexity/pre_train: 1.97626; nll/post_train: 0.70631; perplexity/post_train: 2.02650; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08603; loss/total_train: 0.02542; loss/total_edit_train: 0.02542; memory/alloc_max_train: 3349717504.00000; memory/res_max_train: 3827302400.00000; grad_train: 85.80756; lr/lr0_train: 0.00018; lr/lr1_train: 0.00019; lr/lr2_train: 0.00005; lr/lr3_train: 0.00010; lr/lr4_train: 0.00008; lr/lr5_train: 0.00008
[2023-03-25 15:02:05,007][trainer][INFO] - Step 3200:
[2023-03-25 15:02:05,008][trainer][INFO] - loss/edit_train: 0.17995; loss/loc_train: 0.00315; edit/acc_train: 1.00000; edit/log_prob_train: -0.17995; edit/prob_train: 0.83720; acc/pre_train: 0.46000; acc/post_train: 0.52000; nll/pre_train: 0.69298; perplexity/pre_train: 1.99967; nll/post_train: 0.70408; perplexity/post_train: 2.02199; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08793; loss/total_train: 0.02114; loss/total_edit_train: 0.02114; memory/alloc_max_train: 3349717504.00000; memory/res_max_train: 3827302400.00000; grad_train: 53.25316; lr/lr0_train: 0.00019; lr/lr1_train: 0.00019; lr/lr2_train: 0.00005; lr/lr3_train: 0.00010; lr/lr4_train: 0.00008; lr/lr5_train: 0.00008
[2023-03-25 15:02:21,097][trainer][INFO] - Step 3300:
[2023-03-25 15:02:21,098][trainer][INFO] - loss/edit_train: 0.18579; loss/loc_train: 0.00378; edit/acc_train: 1.00000; edit/log_prob_train: -0.18579; edit/prob_train: 0.83350; acc/pre_train: 0.38000; acc/post_train: 0.53000; nll/pre_train: 0.71170; perplexity/pre_train: 2.03744; nll/post_train: 0.69327; perplexity/post_train: 2.00025; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08690; loss/total_train: 0.02236; loss/total_edit_train: 0.02236; memory/alloc_max_train: 3349717504.00000; memory/res_max_train: 3827302400.00000; grad_train: 55.84412; lr/lr0_train: 0.00019; lr/lr1_train: 0.00019; lr/lr2_train: 0.00005; lr/lr3_train: 0.00010; lr/lr4_train: 0.00008; lr/lr5_train: 0.00008
[2023-03-25 15:02:37,254][trainer][INFO] - Step 3400:
[2023-03-25 15:02:37,254][trainer][INFO] - loss/edit_train: 0.19023; loss/loc_train: 0.00360; edit/acc_train: 0.99000; edit/log_prob_train: -0.19023; edit/prob_train: 0.83120; acc/pre_train: 0.37000; acc/post_train: 0.41000; nll/pre_train: 0.69920; perplexity/pre_train: 2.01215; nll/post_train: 0.71128; perplexity/post_train: 2.03661; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08682; loss/total_train: 0.02262; loss/total_edit_train: 0.02262; memory/alloc_max_train: 3349717504.00000; memory/res_max_train: 3827302400.00000; grad_train: 48.00842; lr/lr0_train: 0.00019; lr/lr1_train: 0.00019; lr/lr2_train: 0.00005; lr/lr3_train: 0.00010; lr/lr4_train: 0.00008; lr/lr5_train: 0.00008
[2023-03-25 15:02:53,268][trainer][INFO] - Step 3500:
[2023-03-25 15:02:53,268][trainer][INFO] - loss/edit_train: 0.17019; loss/loc_train: 0.00411; edit/acc_train: 1.00000; edit/log_prob_train: -0.17019; edit/prob_train: 0.84525; acc/pre_train: 0.47000; acc/post_train: 0.52000; nll/pre_train: 0.70698; perplexity/pre_train: 2.02786; nll/post_train: 0.70366; perplexity/post_train: 2.02113; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08657; loss/total_train: 0.02113; loss/total_edit_train: 0.02113; memory/alloc_max_train: 3349717504.00000; memory/res_max_train: 3827302400.00000; grad_train: 39.28359; lr/lr0_train: 0.00019; lr/lr1_train: 0.00020; lr/lr2_train: 0.00005; lr/lr3_train: 0.00010; lr/lr4_train: 0.00008; lr/lr5_train: 0.00008
[2023-03-25 15:03:08,950][trainer][INFO] - Step 3600:
[2023-03-25 15:03:08,951][trainer][INFO] - loss/edit_train: 0.17870; loss/loc_train: 0.00306; edit/acc_train: 1.00000; edit/log_prob_train: -0.17870; edit/prob_train: 0.83969; acc/pre_train: 0.34000; acc/post_train: 0.58000; nll/pre_train: 0.72107; perplexity/pre_train: 2.05664; nll/post_train: 0.68852; perplexity/post_train: 1.99077; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08648; loss/total_train: 0.02093; loss/total_edit_train: 0.02093; memory/alloc_max_train: 3349717504.00000; memory/res_max_train: 3827302400.00000; grad_train: 50.91869; lr/lr0_train: 0.00019; lr/lr1_train: 0.00020; lr/lr2_train: 0.00005; lr/lr3_train: 0.00010; lr/lr4_train: 0.00008; lr/lr5_train: 0.00008
[2023-03-25 15:03:24,918][trainer][INFO] - Step 3700:
[2023-03-25 15:03:24,919][trainer][INFO] - loss/edit_train: 0.15548; loss/loc_train: 0.00303; edit/acc_train: 0.99000; edit/log_prob_train: -0.15548; edit/prob_train: 0.85934; acc/pre_train: 0.41000; acc/post_train: 0.57000; nll/pre_train: 0.70340; perplexity/pre_train: 2.02060; nll/post_train: 0.70089; perplexity/post_train: 2.01555; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08674; loss/total_train: 0.01858; loss/total_edit_train: 0.01858; memory/alloc_max_train: 3349717504.00000; memory/res_max_train: 3827302400.00000; grad_train: 54.35927; lr/lr0_train: 0.00020; lr/lr1_train: 0.00020; lr/lr2_train: 0.00005; lr/lr3_train: 0.00010; lr/lr4_train: 0.00008; lr/lr5_train: 0.00008
[2023-03-25 15:03:41,394][trainer][INFO] - Step 3800:
[2023-03-25 15:03:41,395][trainer][INFO] - loss/edit_train: 0.15331; loss/loc_train: 0.00310; edit/acc_train: 1.00000; edit/log_prob_train: -0.15331; edit/prob_train: 0.85984; acc/pre_train: 0.41000; acc/post_train: 0.54000; nll/pre_train: 0.70223; perplexity/pre_train: 2.01825; nll/post_train: 0.72389; perplexity/post_train: 2.06244; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08926; loss/total_train: 0.01843; loss/total_edit_train: 0.01843; memory/alloc_max_train: 3349717504.00000; memory/res_max_train: 3827302400.00000; grad_train: 74.82016; lr/lr0_train: 0.00020; lr/lr1_train: 0.00020; lr/lr2_train: 0.00005; lr/lr3_train: 0.00009; lr/lr4_train: 0.00008; lr/lr5_train: 0.00008
[2023-03-25 15:03:58,175][trainer][INFO] - Step 3900:
[2023-03-25 15:03:58,175][trainer][INFO] - loss/edit_train: 0.16137; loss/loc_train: 0.00219; edit/acc_train: 1.00000; edit/log_prob_train: -0.16137; edit/prob_train: 0.85375; acc/pre_train: 0.38000; acc/post_train: 0.57000; nll/pre_train: 0.70532; perplexity/pre_train: 2.02449; nll/post_train: 0.69116; perplexity/post_train: 1.99603; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09258; loss/total_train: 0.01833; loss/total_edit_train: 0.01833; memory/alloc_max_train: 3349717504.00000; memory/res_max_train: 3827302400.00000; grad_train: 41.32986; lr/lr0_train: 0.00020; lr/lr1_train: 0.00020; lr/lr2_train: 0.00005; lr/lr3_train: 0.00009; lr/lr4_train: 0.00008; lr/lr5_train: 0.00008
[2023-03-25 15:04:14,049][trainer][INFO] - Step 4000:
[2023-03-25 15:04:14,049][trainer][INFO] - loss/edit_train: 0.15125; loss/loc_train: 0.00513; edit/acc_train: 1.00000; edit/log_prob_train: -0.15125; edit/prob_train: 0.86142; acc/pre_train: 0.47000; acc/post_train: 0.62000; nll/pre_train: 0.71176; perplexity/pre_train: 2.03757; nll/post_train: 0.69395; perplexity/post_train: 2.00161; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08712; loss/total_train: 0.02025; loss/total_edit_train: 0.02025; memory/alloc_max_train: 3349717504.00000; memory/res_max_train: 3827302400.00000; grad_train: 40.02574; lr/lr0_train: 0.00020; lr/lr1_train: 0.00020; lr/lr2_train: 0.00005; lr/lr3_train: 0.00010; lr/lr4_train: 0.00008; lr/lr5_train: 0.00009
[2023-03-25 15:04:29,487][trainer][INFO] - Step 4100:
[2023-03-25 15:04:29,487][trainer][INFO] - loss/edit_train: 0.14472; loss/loc_train: 0.00457; edit/acc_train: 1.00000; edit/log_prob_train: -0.14472; edit/prob_train: 0.86707; acc/pre_train: 0.34000; acc/post_train: 0.66000; nll/pre_train: 0.71868; perplexity/pre_train: 2.05173; nll/post_train: 0.70107; perplexity/post_train: 2.01591; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08779; loss/total_train: 0.01905; loss/total_edit_train: 0.01905; memory/alloc_max_train: 3349717504.00000; memory/res_max_train: 3827302400.00000; grad_train: 47.45701; lr/lr0_train: 0.00020; lr/lr1_train: 0.00020; lr/lr2_train: 0.00005; lr/lr3_train: 0.00010; lr/lr4_train: 0.00009; lr/lr5_train: 0.00009
[2023-03-25 15:04:45,328][trainer][INFO] - Step 4200:
[2023-03-25 15:04:45,329][trainer][INFO] - loss/edit_train: 0.14233; loss/loc_train: 0.00173; edit/acc_train: 1.00000; edit/log_prob_train: -0.14233; edit/prob_train: 0.86870; acc/pre_train: 0.41000; acc/post_train: 0.53000; nll/pre_train: 0.70554; perplexity/pre_train: 2.02494; nll/post_train: 0.69786; perplexity/post_train: 2.00945; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08582; loss/total_train: 0.01596; loss/total_edit_train: 0.01596; memory/alloc_max_train: 3349717529.60000; memory/res_max_train: 3827302400.00000; grad_train: 46.68998; lr/lr0_train: 0.00020; lr/lr1_train: 0.00020; lr/lr2_train: 0.00005; lr/lr3_train: 0.00010; lr/lr4_train: 0.00009; lr/lr5_train: 0.00009
[2023-03-25 15:05:01,109][trainer][INFO] - Step 4300:
[2023-03-25 15:05:01,109][trainer][INFO] - loss/edit_train: 0.14402; loss/loc_train: 0.00409; edit/acc_train: 1.00000; edit/log_prob_train: -0.14402; edit/prob_train: 0.86692; acc/pre_train: 0.40000; acc/post_train: 0.49000; nll/pre_train: 0.69674; perplexity/pre_train: 2.00719; nll/post_train: 0.69493; perplexity/post_train: 2.00357; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08927; loss/total_train: 0.01849; loss/total_edit_train: 0.01849; memory/alloc_max_train: 3349720064.00000; memory/res_max_train: 3827302400.00000; grad_train: 52.27946; lr/lr0_train: 0.00020; lr/lr1_train: 0.00020; lr/lr2_train: 0.00005; lr/lr3_train: 0.00010; lr/lr4_train: 0.00009; lr/lr5_train: 0.00009
[2023-03-25 15:05:17,191][trainer][INFO] - Step 4400:
[2023-03-25 15:05:17,191][trainer][INFO] - loss/edit_train: 0.14029; loss/loc_train: 0.00202; edit/acc_train: 0.99000; edit/log_prob_train: -0.14029; edit/prob_train: 0.87170; acc/pre_train: 0.46000; acc/post_train: 0.58000; nll/pre_train: 0.69127; perplexity/pre_train: 1.99625; nll/post_train: 0.68648; perplexity/post_train: 1.98671; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08701; loss/total_train: 0.01605; loss/total_edit_train: 0.01605; memory/alloc_max_train: 3349720064.00000; memory/res_max_train: 3827302400.00000; grad_train: 31.21052; lr/lr0_train: 0.00020; lr/lr1_train: 0.00021; lr/lr2_train: 0.00005; lr/lr3_train: 0.00010; lr/lr4_train: 0.00009; lr/lr5_train: 0.00009
[2023-03-25 15:05:33,055][trainer][INFO] - Step 4500:
[2023-03-25 15:05:33,055][trainer][INFO] - loss/edit_train: 0.13629; loss/loc_train: 0.00393; edit/acc_train: 1.00000; edit/log_prob_train: -0.13629; edit/prob_train: 0.87486; acc/pre_train: 0.48000; acc/post_train: 0.45000; nll/pre_train: 0.67755; perplexity/pre_train: 1.96905; nll/post_train: 0.69231; perplexity/post_train: 1.99832; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08671; loss/total_train: 0.01756; loss/total_edit_train: 0.01756; memory/alloc_max_train: 3349720064.00000; memory/res_max_train: 3827302400.00000; grad_train: 54.64591; lr/lr0_train: 0.00020; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00010; lr/lr4_train: 0.00009; lr/lr5_train: 0.00009
[2023-03-25 15:05:49,287][trainer][INFO] - Step 4600:
[2023-03-25 15:05:49,288][trainer][INFO] - loss/edit_train: 0.12187; loss/loc_train: 0.00259; edit/acc_train: 1.00000; edit/log_prob_train: -0.12187; edit/prob_train: 0.88635; acc/pre_train: 0.42000; acc/post_train: 0.59000; nll/pre_train: 0.71096; perplexity/pre_train: 2.03595; nll/post_train: 0.68612; perplexity/post_train: 1.98599; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08844; loss/total_train: 0.01478; loss/total_edit_train: 0.01478; memory/alloc_max_train: 3349720064.00000; memory/res_max_train: 3827302400.00000; grad_train: 36.26397; lr/lr0_train: 0.00020; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00010; lr/lr4_train: 0.00009; lr/lr5_train: 0.00009
[2023-03-25 15:06:05,215][trainer][INFO] - Step 4700:
[2023-03-25 15:06:05,215][trainer][INFO] - loss/edit_train: 0.13721; loss/loc_train: 0.00275; edit/acc_train: 1.00000; edit/log_prob_train: -0.13721; edit/prob_train: 0.87517; acc/pre_train: 0.40000; acc/post_train: 0.59000; nll/pre_train: 0.70360; perplexity/pre_train: 2.02101; nll/post_train: 0.67911; perplexity/post_train: 1.97212; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08716; loss/total_train: 0.01647; loss/total_edit_train: 0.01647; memory/alloc_max_train: 3349720064.00000; memory/res_max_train: 3827302400.00000; grad_train: 44.95683; lr/lr0_train: 0.00020; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00010; lr/lr4_train: 0.00009; lr/lr5_train: 0.00009
[2023-03-25 15:06:21,127][trainer][INFO] - Step 4800:
[2023-03-25 15:06:21,127][trainer][INFO] - loss/edit_train: 0.12123; loss/loc_train: 0.00335; edit/acc_train: 1.00000; edit/log_prob_train: -0.12123; edit/prob_train: 0.88927; acc/pre_train: 0.40000; acc/post_train: 0.62000; nll/pre_train: 0.71012; perplexity/pre_train: 2.03423; nll/post_train: 0.69232; perplexity/post_train: 1.99835; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08743; loss/total_train: 0.01547; loss/total_edit_train: 0.01547; memory/alloc_max_train: 3349720064.00000; memory/res_max_train: 3827302400.00000; grad_train: 79.99891; lr/lr0_train: 0.00020; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00010; lr/lr4_train: 0.00009; lr/lr5_train: 0.00009
[2023-03-25 15:06:37,369][trainer][INFO] - Step 4900:
[2023-03-25 15:06:37,369][trainer][INFO] - loss/edit_train: 0.11220; loss/loc_train: 0.00298; edit/acc_train: 1.00000; edit/log_prob_train: -0.11220; edit/prob_train: 0.89489; acc/pre_train: 0.38000; acc/post_train: 0.59000; nll/pre_train: 0.70180; perplexity/pre_train: 2.01737; nll/post_train: 0.70158; perplexity/post_train: 2.01694; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08765; loss/total_train: 0.01420; loss/total_edit_train: 0.01420; memory/alloc_max_train: 3349720064.00000; memory/res_max_train: 3827302400.00000; grad_train: 52.11687; lr/lr0_train: 0.00020; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00010; lr/lr4_train: 0.00009; lr/lr5_train: 0.00009
[2023-03-25 15:06:53,470][trainer][INFO] - Step 5000:
[2023-03-25 15:06:53,470][trainer][INFO] - loss/edit_train: 0.10107; loss/loc_train: 0.00352; edit/acc_train: 1.00000; edit/log_prob_train: -0.10107; edit/prob_train: 0.90472; acc/pre_train: 0.42000; acc/post_train: 0.60000; nll/pre_train: 0.69762; perplexity/pre_train: 2.00897; nll/post_train: 0.68209; perplexity/post_train: 1.97801; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08704; loss/total_train: 0.01362; loss/total_edit_train: 0.01362; memory/alloc_max_train: 3349720064.00000; memory/res_max_train: 3827302400.00000; grad_train: 57.41734; lr/lr0_train: 0.00020; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00010; lr/lr4_train: 0.00009; lr/lr5_train: 0.00009
[2023-03-25 15:07:49,820][trainer][INFO] - Step 5000:
[2023-03-25 15:07:49,821][trainer][INFO] - loss/edit_val: 0.10759; loss/loc_val: 0.00492; edit/acc_val: 0.99800; edit/log_prob_val: -0.10759; edit/prob_val: 0.90035; acc/pre_val: 0.47600; acc/post_val: 0.49500; nll/pre_val: 0.69428; perplexity/pre_val: 2.00227; nll/post_val: 0.70558; perplexity/post_val: 2.02501; n_tokens/pre_val: 4.00000; n_tokens/post_val: 4.00000; time/edit_val: 0.07479; loss/total_val: 0.01568; loss/total_edit_val: 0.01568; memory/alloc_max_val: 3414117440.51200; memory/res_max_val: 3827302400.00000; eval_time/elapsed: 56.32468; eval_time/average: 0.11265
[2023-03-25 15:07:49,824][trainer][INFO] - Saving model to /home/anonymous-xme/mend/mend/outputs/2023-03-25_14-52-21_2648014163/models/bert-base-multilingual-uncased.2023-03-25_14-52-21_2648014163
[2023-03-25 15:07:49,824][trainer][INFO] - Moving old archive to /home/anonymous-xme/mend/mend/outputs/2023-03-25_14-52-21_2648014163/models/bert-base-multilingual-uncased.2023-03-25_14-52-21_2648014163.bk
[2023-03-25 15:07:50,861][trainer][INFO] - Write complete.
[2023-03-25 15:08:07,001][trainer][INFO] - Step 5100:
[2023-03-25 15:08:07,002][trainer][INFO] - loss/edit_train: 0.10324; loss/loc_train: 0.00325; edit/acc_train: 1.00000; edit/log_prob_train: -0.10324; edit/prob_train: 0.90303; acc/pre_train: 0.40000; acc/post_train: 0.56000; nll/pre_train: 0.70783; perplexity/pre_train: 2.02959; nll/post_train: 0.68149; perplexity/post_train: 1.97682; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08849; loss/total_train: 0.01357; loss/total_edit_train: 0.01357; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 34.96772; lr/lr0_train: 0.00020; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00010; lr/lr4_train: 0.00009; lr/lr5_train: 0.00009
[2023-03-25 15:08:22,317][trainer][INFO] - Step 5200:
[2023-03-25 15:08:22,318][trainer][INFO] - loss/edit_train: 0.10309; loss/loc_train: 0.00245; edit/acc_train: 1.00000; edit/log_prob_train: -0.10309; edit/prob_train: 0.90368; acc/pre_train: 0.42000; acc/post_train: 0.54000; nll/pre_train: 0.70494; perplexity/pre_train: 2.02372; nll/post_train: 0.69829; perplexity/post_train: 2.01032; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08371; loss/total_train: 0.01276; loss/total_edit_train: 0.01276; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 37.43516; lr/lr0_train: 0.00020; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00010; lr/lr4_train: 0.00009; lr/lr5_train: 0.00009
[2023-03-25 15:08:38,005][trainer][INFO] - Step 5300:
[2023-03-25 15:08:38,005][trainer][INFO] - loss/edit_train: 0.08863; loss/loc_train: 0.00548; edit/acc_train: 1.00000; edit/log_prob_train: -0.08863; edit/prob_train: 0.91558; acc/pre_train: 0.48000; acc/post_train: 0.61000; nll/pre_train: 0.69764; perplexity/pre_train: 2.00901; nll/post_train: 0.71641; perplexity/post_train: 2.04708; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09039; loss/total_train: 0.01434; loss/total_edit_train: 0.01434; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 52.08409; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00010; lr/lr4_train: 0.00009; lr/lr5_train: 0.00010
[2023-03-25 15:08:53,936][trainer][INFO] - Step 5400:
[2023-03-25 15:08:53,936][trainer][INFO] - loss/edit_train: 0.10747; loss/loc_train: 0.00433; edit/acc_train: 0.99000; edit/log_prob_train: -0.10747; edit/prob_train: 0.90318; acc/pre_train: 0.44000; acc/post_train: 0.56000; nll/pre_train: 0.69983; perplexity/pre_train: 2.01341; nll/post_train: 0.70007; perplexity/post_train: 2.01390; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08768; loss/total_train: 0.01508; loss/total_edit_train: 0.01508; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 61.49318; lr/lr0_train: 0.00020; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00011; lr/lr4_train: 0.00010; lr/lr5_train: 0.00010
[2023-03-25 15:09:10,033][trainer][INFO] - Step 5500:
[2023-03-25 15:09:10,033][trainer][INFO] - loss/edit_train: 0.09722; loss/loc_train: 0.00276; edit/acc_train: 1.00000; edit/log_prob_train: -0.09722; edit/prob_train: 0.90874; acc/pre_train: 0.34000; acc/post_train: 0.60000; nll/pre_train: 0.71484; perplexity/pre_train: 2.04385; nll/post_train: 0.70754; perplexity/post_train: 2.02899; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08937; loss/total_train: 0.01248; loss/total_edit_train: 0.01248; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 68.37945; lr/lr0_train: 0.00020; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00011; lr/lr4_train: 0.00010; lr/lr5_train: 0.00010
[2023-03-25 15:09:25,942][trainer][INFO] - Step 5600:
[2023-03-25 15:09:25,942][trainer][INFO] - loss/edit_train: 0.08372; loss/loc_train: 0.00436; edit/acc_train: 1.00000; edit/log_prob_train: -0.08372; edit/prob_train: 0.92020; acc/pre_train: 0.46000; acc/post_train: 0.57000; nll/pre_train: 0.69530; perplexity/pre_train: 2.00430; nll/post_train: 0.69003; perplexity/post_train: 1.99377; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08630; loss/total_train: 0.01273; loss/total_edit_train: 0.01273; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 80.93343; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00011; lr/lr4_train: 0.00010; lr/lr5_train: 0.00010
[2023-03-25 15:09:42,070][trainer][INFO] - Step 5700:
[2023-03-25 15:09:42,071][trainer][INFO] - loss/edit_train: 0.08955; loss/loc_train: 0.00378; edit/acc_train: 1.00000; edit/log_prob_train: -0.08955; edit/prob_train: 0.91552; acc/pre_train: 0.43000; acc/post_train: 0.56000; nll/pre_train: 0.70748; perplexity/pre_train: 2.02887; nll/post_train: 0.70982; perplexity/post_train: 2.03363; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08796; loss/total_train: 0.01274; loss/total_edit_train: 0.01274; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 72.50710; lr/lr0_train: 0.00020; lr/lr1_train: 0.00022; lr/lr2_train: 0.00007; lr/lr3_train: 0.00011; lr/lr4_train: 0.00010; lr/lr5_train: 0.00010
[2023-03-25 15:09:58,243][trainer][INFO] - Step 5800:
[2023-03-25 15:09:58,243][trainer][INFO] - loss/edit_train: 0.08844; loss/loc_train: 0.00292; edit/acc_train: 1.00000; edit/log_prob_train: -0.08844; edit/prob_train: 0.91719; acc/pre_train: 0.44000; acc/post_train: 0.49000; nll/pre_train: 0.70285; perplexity/pre_train: 2.01950; nll/post_train: 0.68745; perplexity/post_train: 1.98864; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08961; loss/total_train: 0.01177; loss/total_edit_train: 0.01177; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 39.31480; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00007; lr/lr3_train: 0.00010; lr/lr4_train: 0.00010; lr/lr5_train: 0.00009
[2023-03-25 15:10:14,173][trainer][INFO] - Step 5900:
[2023-03-25 15:10:14,174][trainer][INFO] - loss/edit_train: 0.08079; loss/loc_train: 0.00210; edit/acc_train: 1.00000; edit/log_prob_train: -0.08079; edit/prob_train: 0.92309; acc/pre_train: 0.38000; acc/post_train: 0.53000; nll/pre_train: 0.69581; perplexity/pre_train: 2.00532; nll/post_train: 0.69879; perplexity/post_train: 2.01131; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08606; loss/total_train: 0.01018; loss/total_edit_train: 0.01018; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 68.08007; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00007; lr/lr3_train: 0.00010; lr/lr4_train: 0.00010; lr/lr5_train: 0.00009
[2023-03-25 15:10:30,111][trainer][INFO] - Step 6000:
[2023-03-25 15:10:30,111][trainer][INFO] - loss/edit_train: 0.08113; loss/loc_train: 0.00417; edit/acc_train: 1.00000; edit/log_prob_train: -0.08113; edit/prob_train: 0.92331; acc/pre_train: 0.39000; acc/post_train: 0.45000; nll/pre_train: 0.70080; perplexity/pre_train: 2.01537; nll/post_train: 0.70450; perplexity/post_train: 2.02284; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08862; loss/total_train: 0.01228; loss/total_edit_train: 0.01228; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 41.02055; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00007; lr/lr3_train: 0.00010; lr/lr4_train: 0.00010; lr/lr5_train: 0.00010
[2023-03-25 15:10:45,564][trainer][INFO] - Step 6100:
[2023-03-25 15:10:45,564][trainer][INFO] - loss/edit_train: 0.09182; loss/loc_train: 0.00388; edit/acc_train: 1.00000; edit/log_prob_train: -0.09182; edit/prob_train: 0.91541; acc/pre_train: 0.44000; acc/post_train: 0.57000; nll/pre_train: 0.69792; perplexity/pre_train: 2.00957; nll/post_train: 0.69131; perplexity/post_train: 1.99633; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08878; loss/total_train: 0.01307; loss/total_edit_train: 0.01307; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 74.62936; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00007; lr/lr3_train: 0.00010; lr/lr4_train: 0.00010; lr/lr5_train: 0.00010
[2023-03-25 15:11:02,274][trainer][INFO] - Step 6200:
[2023-03-25 15:11:02,274][trainer][INFO] - loss/edit_train: 0.06790; loss/loc_train: 0.00317; edit/acc_train: 1.00000; edit/log_prob_train: -0.06790; edit/prob_train: 0.93460; acc/pre_train: 0.47000; acc/post_train: 0.58000; nll/pre_train: 0.69601; perplexity/pre_train: 2.00574; nll/post_train: 0.67303; perplexity/post_train: 1.96016; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09139; loss/total_train: 0.00996; loss/total_edit_train: 0.00996; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 48.45163; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00007; lr/lr3_train: 0.00010; lr/lr4_train: 0.00010; lr/lr5_train: 0.00010
[2023-03-25 15:11:19,089][trainer][INFO] - Step 6300:
[2023-03-25 15:11:19,089][trainer][INFO] - loss/edit_train: 0.07879; loss/loc_train: 0.00257; edit/acc_train: 1.00000; edit/log_prob_train: -0.07879; edit/prob_train: 0.92542; acc/pre_train: 0.37000; acc/post_train: 0.46000; nll/pre_train: 0.70368; perplexity/pre_train: 2.02118; nll/post_train: 0.71051; perplexity/post_train: 2.03503; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09375; loss/total_train: 0.01044; loss/total_edit_train: 0.01044; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 48.40431; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00007; lr/lr3_train: 0.00010; lr/lr4_train: 0.00010; lr/lr5_train: 0.00010
[2023-03-25 15:11:34,982][trainer][INFO] - Step 6400:
[2023-03-25 15:11:34,983][trainer][INFO] - loss/edit_train: 0.09473; loss/loc_train: 0.00196; edit/acc_train: 0.99000; edit/log_prob_train: -0.09473; edit/prob_train: 0.91736; acc/pre_train: 0.46000; acc/post_train: 0.48000; nll/pre_train: 0.69177; perplexity/pre_train: 1.99724; nll/post_train: 0.69437; perplexity/post_train: 2.00244; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08843; loss/total_train: 0.01143; loss/total_edit_train: 0.01143; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 38.59124; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00007; lr/lr3_train: 0.00010; lr/lr4_train: 0.00010; lr/lr5_train: 0.00010
[2023-03-25 15:11:51,237][trainer][INFO] - Step 6500:
[2023-03-25 15:11:51,238][trainer][INFO] - loss/edit_train: 0.11849; loss/loc_train: 0.00340; edit/acc_train: 0.96000; edit/log_prob_train: -0.11849; edit/prob_train: 0.89791; acc/pre_train: 0.44000; acc/post_train: 0.63000; nll/pre_train: 0.70310; perplexity/pre_train: 2.02000; nll/post_train: 0.68698; perplexity/post_train: 1.98771; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08901; loss/total_train: 0.01525; loss/total_edit_train: 0.01525; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 135.47466; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00007; lr/lr3_train: 0.00010; lr/lr4_train: 0.00010; lr/lr5_train: 0.00010
[2023-03-25 15:12:07,548][trainer][INFO] - Step 6600:
[2023-03-25 15:12:07,549][trainer][INFO] - loss/edit_train: 0.07894; loss/loc_train: 0.00383; edit/acc_train: 1.00000; edit/log_prob_train: -0.07894; edit/prob_train: 0.92531; acc/pre_train: 0.36000; acc/post_train: 0.51000; nll/pre_train: 0.70420; perplexity/pre_train: 2.02223; nll/post_train: 0.71370; perplexity/post_train: 2.04153; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09084; loss/total_train: 0.01172; loss/total_edit_train: 0.01172; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 43.77531; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00007; lr/lr3_train: 0.00010; lr/lr4_train: 0.00010; lr/lr5_train: 0.00010
[2023-03-25 15:12:23,449][trainer][INFO] - Step 6700:
[2023-03-25 15:12:23,449][trainer][INFO] - loss/edit_train: 0.06910; loss/loc_train: 0.00373; edit/acc_train: 1.00000; edit/log_prob_train: -0.06910; edit/prob_train: 0.93412; acc/pre_train: 0.54000; acc/post_train: 0.48000; nll/pre_train: 0.68784; perplexity/pre_train: 1.98942; nll/post_train: 0.71952; perplexity/post_train: 2.05346; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08719; loss/total_train: 0.01064; loss/total_edit_train: 0.01064; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 34.74631; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00008; lr/lr3_train: 0.00010; lr/lr4_train: 0.00010; lr/lr5_train: 0.00010
[2023-03-25 15:12:39,851][trainer][INFO] - Step 6800:
[2023-03-25 15:12:39,851][trainer][INFO] - loss/edit_train: 0.07464; loss/loc_train: 0.00328; edit/acc_train: 1.00000; edit/log_prob_train: -0.07464; edit/prob_train: 0.92908; acc/pre_train: 0.44000; acc/post_train: 0.61000; nll/pre_train: 0.69503; perplexity/pre_train: 2.00377; nll/post_train: 0.68050; perplexity/post_train: 1.97486; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08927; loss/total_train: 0.01074; loss/total_edit_train: 0.01074; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 34.44665; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00008; lr/lr3_train: 0.00010; lr/lr4_train: 0.00010; lr/lr5_train: 0.00010
[2023-03-25 15:12:55,560][trainer][INFO] - Step 6900:
[2023-03-25 15:12:55,560][trainer][INFO] - loss/edit_train: 0.07024; loss/loc_train: 0.00401; edit/acc_train: 1.00000; edit/log_prob_train: -0.07024; edit/prob_train: 0.93288; acc/pre_train: 0.44000; acc/post_train: 0.51000; nll/pre_train: 0.70030; perplexity/pre_train: 2.01435; nll/post_train: 0.71478; perplexity/post_train: 2.04374; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08821; loss/total_train: 0.01103; loss/total_edit_train: 0.01103; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 88.96743; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00008; lr/lr3_train: 0.00011; lr/lr4_train: 0.00010; lr/lr5_train: 0.00010
[2023-03-25 15:13:12,005][trainer][INFO] - Step 7000:
[2023-03-25 15:13:12,005][trainer][INFO] - loss/edit_train: 0.07356; loss/loc_train: 0.00150; edit/acc_train: 1.00000; edit/log_prob_train: -0.07356; edit/prob_train: 0.93124; acc/pre_train: 0.48000; acc/post_train: 0.44000; nll/pre_train: 0.69145; perplexity/pre_train: 1.99661; nll/post_train: 0.69935; perplexity/post_train: 2.01244; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08968; loss/total_train: 0.00885; loss/total_edit_train: 0.00885; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 43.00100; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00008; lr/lr3_train: 0.00010; lr/lr4_train: 0.00010; lr/lr5_train: 0.00010
[2023-03-25 15:13:28,031][trainer][INFO] - Step 7100:
[2023-03-25 15:13:28,032][trainer][INFO] - loss/edit_train: 0.07052; loss/loc_train: 0.00173; edit/acc_train: 1.00000; edit/log_prob_train: -0.07052; edit/prob_train: 0.93428; acc/pre_train: 0.47000; acc/post_train: 0.39000; nll/pre_train: 0.69989; perplexity/pre_train: 2.01354; nll/post_train: 0.70559; perplexity/post_train: 2.02504; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08492; loss/total_train: 0.00878; loss/total_edit_train: 0.00878; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 24.77347; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00008; lr/lr3_train: 0.00010; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:13:44,404][trainer][INFO] - Step 7200:
[2023-03-25 15:13:44,404][trainer][INFO] - loss/edit_train: 0.06072; loss/loc_train: 0.00279; edit/acc_train: 1.00000; edit/log_prob_train: -0.06072; edit/prob_train: 0.94129; acc/pre_train: 0.39000; acc/post_train: 0.54000; nll/pre_train: 0.71774; perplexity/pre_train: 2.04980; nll/post_train: 0.69321; perplexity/post_train: 2.00012; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08670; loss/total_train: 0.00886; loss/total_edit_train: 0.00886; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 53.98104; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00008; lr/lr3_train: 0.00010; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:14:00,571][trainer][INFO] - Step 7300:
[2023-03-25 15:14:00,572][trainer][INFO] - loss/edit_train: 0.06805; loss/loc_train: 0.00317; edit/acc_train: 1.00000; edit/log_prob_train: -0.06805; edit/prob_train: 0.93530; acc/pre_train: 0.53000; acc/post_train: 0.43000; nll/pre_train: 0.68144; perplexity/pre_train: 1.97672; nll/post_train: 0.70374; perplexity/post_train: 2.02130; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08747; loss/total_train: 0.00998; loss/total_edit_train: 0.00998; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 95.70555; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00008; lr/lr3_train: 0.00010; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:14:16,553][trainer][INFO] - Step 7400:
[2023-03-25 15:14:16,554][trainer][INFO] - loss/edit_train: 0.06648; loss/loc_train: 0.00141; edit/acc_train: 1.00000; edit/log_prob_train: -0.06648; edit/prob_train: 0.93752; acc/pre_train: 0.51000; acc/post_train: 0.52000; nll/pre_train: 0.70329; perplexity/pre_train: 2.02039; nll/post_train: 0.69380; perplexity/post_train: 2.00130; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08644; loss/total_train: 0.00806; loss/total_edit_train: 0.00806; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 24.30400; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00008; lr/lr3_train: 0.00010; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:14:32,597][trainer][INFO] - Step 7500:
[2023-03-25 15:14:32,597][trainer][INFO] - loss/edit_train: 0.07154; loss/loc_train: 0.00225; edit/acc_train: 1.00000; edit/log_prob_train: -0.07154; edit/prob_train: 0.93350; acc/pre_train: 0.40000; acc/post_train: 0.51000; nll/pre_train: 0.70229; perplexity/pre_train: 2.01838; nll/post_train: 0.69890; perplexity/post_train: 2.01154; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08688; loss/total_train: 0.00941; loss/total_edit_train: 0.00941; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 49.77812; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00007; lr/lr3_train: 0.00009; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:14:48,368][trainer][INFO] - Step 7600:
[2023-03-25 15:14:48,368][trainer][INFO] - loss/edit_train: 0.05656; loss/loc_train: 0.00546; edit/acc_train: 1.00000; edit/log_prob_train: -0.05656; edit/prob_train: 0.94523; acc/pre_train: 0.47000; acc/post_train: 0.65000; nll/pre_train: 0.68844; perplexity/pre_train: 1.99060; nll/post_train: 0.70406; perplexity/post_train: 2.02194; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08624; loss/total_train: 0.01112; loss/total_edit_train: 0.01112; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 102.92557; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00007; lr/lr3_train: 0.00009; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:15:03,997][trainer][INFO] - Step 7700:
[2023-03-25 15:15:03,998][trainer][INFO] - loss/edit_train: 0.06010; loss/loc_train: 0.00349; edit/acc_train: 1.00000; edit/log_prob_train: -0.06010; edit/prob_train: 0.94216; acc/pre_train: 0.43000; acc/post_train: 0.58000; nll/pre_train: 0.70719; perplexity/pre_train: 2.02829; nll/post_train: 0.71384; perplexity/post_train: 2.04182; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08575; loss/total_train: 0.00950; loss/total_edit_train: 0.00950; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 40.53953; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00007; lr/lr3_train: 0.00009; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:15:19,944][trainer][INFO] - Step 7800:
[2023-03-25 15:15:19,944][trainer][INFO] - loss/edit_train: 0.06712; loss/loc_train: 0.00375; edit/acc_train: 1.00000; edit/log_prob_train: -0.06712; edit/prob_train: 0.93571; acc/pre_train: 0.45000; acc/post_train: 0.51000; nll/pre_train: 0.68735; perplexity/pre_train: 1.98844; nll/post_train: 0.70903; perplexity/post_train: 2.03202; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08644; loss/total_train: 0.01046; loss/total_edit_train: 0.01046; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 59.96234; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:15:35,462][trainer][INFO] - Step 7900:
[2023-03-25 15:15:35,463][trainer][INFO] - loss/edit_train: 0.07161; loss/loc_train: 0.00311; edit/acc_train: 1.00000; edit/log_prob_train: -0.07161; edit/prob_train: 0.93338; acc/pre_train: 0.42000; acc/post_train: 0.52000; nll/pre_train: 0.70141; perplexity/pre_train: 2.01660; nll/post_train: 0.69517; perplexity/post_train: 2.00405; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08436; loss/total_train: 0.01027; loss/total_edit_train: 0.01027; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 60.56809; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:15:50,933][trainer][INFO] - Step 8000:
[2023-03-25 15:15:50,933][trainer][INFO] - loss/edit_train: 0.06087; loss/loc_train: 0.00184; edit/acc_train: 1.00000; edit/log_prob_train: -0.06087; edit/prob_train: 0.94157; acc/pre_train: 0.42000; acc/post_train: 0.56000; nll/pre_train: 0.70232; perplexity/pre_train: 2.01844; nll/post_train: 0.70681; perplexity/post_train: 2.02751; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08486; loss/total_train: 0.00793; loss/total_edit_train: 0.00793; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 32.75615; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00011; lr/lr5_train: 0.00011
[2023-03-25 15:16:06,619][trainer][INFO] - Step 8100:
[2023-03-25 15:16:06,619][trainer][INFO] - loss/edit_train: 0.06424; loss/loc_train: 0.00198; edit/acc_train: 1.00000; edit/log_prob_train: -0.06424; edit/prob_train: 0.93933; acc/pre_train: 0.46000; acc/post_train: 0.52000; nll/pre_train: 0.70772; perplexity/pre_train: 2.02936; nll/post_train: 0.69173; perplexity/post_train: 1.99717; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08519; loss/total_train: 0.00840; loss/total_edit_train: 0.00840; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 35.26064; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00011; lr/lr5_train: 0.00011
[2023-03-25 15:16:22,229][trainer][INFO] - Step 8200:
[2023-03-25 15:16:22,230][trainer][INFO] - loss/edit_train: 0.05707; loss/loc_train: 0.00375; edit/acc_train: 1.00000; edit/log_prob_train: -0.05707; edit/prob_train: 0.94509; acc/pre_train: 0.46000; acc/post_train: 0.49000; nll/pre_train: 0.69447; perplexity/pre_train: 2.00266; nll/post_train: 0.70487; perplexity/post_train: 2.02358; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08628; loss/total_train: 0.00946; loss/total_edit_train: 0.00946; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 81.45049; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00011; lr/lr5_train: 0.00011
[2023-03-25 15:16:37,772][trainer][INFO] - Step 8300:
[2023-03-25 15:16:37,773][trainer][INFO] - loss/edit_train: 0.07481; loss/loc_train: 0.00461; edit/acc_train: 1.00000; edit/log_prob_train: -0.07481; edit/prob_train: 0.93074; acc/pre_train: 0.46000; acc/post_train: 0.56000; nll/pre_train: 0.69413; perplexity/pre_train: 2.00197; nll/post_train: 0.70560; perplexity/post_train: 2.02507; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08854; loss/total_train: 0.01209; loss/total_edit_train: 0.01209; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 80.51074; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00011; lr/lr5_train: 0.00011
[2023-03-25 15:16:53,376][trainer][INFO] - Step 8400:
[2023-03-25 15:16:53,376][trainer][INFO] - loss/edit_train: 0.06308; loss/loc_train: 0.00284; edit/acc_train: 1.00000; edit/log_prob_train: -0.06308; edit/prob_train: 0.94092; acc/pre_train: 0.43000; acc/post_train: 0.63000; nll/pre_train: 0.70533; perplexity/pre_train: 2.02451; nll/post_train: 0.67652; perplexity/post_train: 1.96702; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08823; loss/total_train: 0.00915; loss/total_edit_train: 0.00915; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 46.07362; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:17:09,273][trainer][INFO] - Step 8500:
[2023-03-25 15:17:09,273][trainer][INFO] - loss/edit_train: 0.05784; loss/loc_train: 0.00722; edit/acc_train: 1.00000; edit/log_prob_train: -0.05784; edit/prob_train: 0.94444; acc/pre_train: 0.37000; acc/post_train: 0.68000; nll/pre_train: 0.71145; perplexity/pre_train: 2.03695; nll/post_train: 0.66189; perplexity/post_train: 1.93846; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08628; loss/total_train: 0.01301; loss/total_edit_train: 0.01301; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 105.46318; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:17:24,768][trainer][INFO] - Step 8600:
[2023-03-25 15:17:24,768][trainer][INFO] - loss/edit_train: 0.05842; loss/loc_train: 0.00427; edit/acc_train: 1.00000; edit/log_prob_train: -0.05842; edit/prob_train: 0.94398; acc/pre_train: 0.40000; acc/post_train: 0.54000; nll/pre_train: 0.70077; perplexity/pre_train: 2.01531; nll/post_train: 0.70414; perplexity/post_train: 2.02210; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08421; loss/total_train: 0.01011; loss/total_edit_train: 0.01011; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 93.29175; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:17:40,408][trainer][INFO] - Step 8700:
[2023-03-25 15:17:40,409][trainer][INFO] - loss/edit_train: 0.06503; loss/loc_train: 0.00227; edit/acc_train: 1.00000; edit/log_prob_train: -0.06503; edit/prob_train: 0.93926; acc/pre_train: 0.39000; acc/post_train: 0.52000; nll/pre_train: 0.70321; perplexity/pre_train: 2.02024; nll/post_train: 0.71020; perplexity/post_train: 2.03441; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08343; loss/total_train: 0.00877; loss/total_edit_train: 0.00877; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 46.61521; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:17:56,288][trainer][INFO] - Step 8800:
[2023-03-25 15:17:56,289][trainer][INFO] - loss/edit_train: 0.06049; loss/loc_train: 0.00171; edit/acc_train: 1.00000; edit/log_prob_train: -0.06049; edit/prob_train: 0.94323; acc/pre_train: 0.41000; acc/post_train: 0.57000; nll/pre_train: 0.70385; perplexity/pre_train: 2.02152; nll/post_train: 0.69474; perplexity/post_train: 2.00318; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08734; loss/total_train: 0.00776; loss/total_edit_train: 0.00776; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 38.79653; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:18:11,827][trainer][INFO] - Step 8900:
[2023-03-25 15:18:11,827][trainer][INFO] - loss/edit_train: 0.05525; loss/loc_train: 0.00196; edit/acc_train: 1.00000; edit/log_prob_train: -0.05525; edit/prob_train: 0.94734; acc/pre_train: 0.45000; acc/post_train: 0.50000; nll/pre_train: 0.69613; perplexity/pre_train: 2.00597; nll/post_train: 0.69473; perplexity/post_train: 2.00318; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08431; loss/total_train: 0.00748; loss/total_edit_train: 0.00748; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 48.76832; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:18:27,583][trainer][INFO] - Step 9000:
[2023-03-25 15:18:27,583][trainer][INFO] - loss/edit_train: 0.05253; loss/loc_train: 0.00231; edit/acc_train: 1.00000; edit/log_prob_train: -0.05253; edit/prob_train: 0.94969; acc/pre_train: 0.40000; acc/post_train: 0.48000; nll/pre_train: 0.70552; perplexity/pre_train: 2.02490; nll/post_train: 0.70599; perplexity/post_train: 2.02586; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08567; loss/total_train: 0.00756; loss/total_edit_train: 0.00756; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 32.84907; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:18:42,697][trainer][INFO] - Step 9100:
[2023-03-25 15:18:42,697][trainer][INFO] - loss/edit_train: 0.05539; loss/loc_train: 0.00168; edit/acc_train: 1.00000; edit/log_prob_train: -0.05539; edit/prob_train: 0.94728; acc/pre_train: 0.34000; acc/post_train: 0.44000; nll/pre_train: 0.71773; perplexity/pre_train: 2.04978; nll/post_train: 0.70560; perplexity/post_train: 2.02505; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08588; loss/total_train: 0.00721; loss/total_edit_train: 0.00721; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 14.34567; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00009; lr/lr5_train: 0.00011
[2023-03-25 15:18:58,376][trainer][INFO] - Step 9200:
[2023-03-25 15:18:58,376][trainer][INFO] - loss/edit_train: 0.05119; loss/loc_train: 0.00281; edit/acc_train: 1.00000; edit/log_prob_train: -0.05119; edit/prob_train: 0.95034; acc/pre_train: 0.28000; acc/post_train: 0.52000; nll/pre_train: 0.72331; perplexity/pre_train: 2.06125; nll/post_train: 0.68858; perplexity/post_train: 1.99089; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08529; loss/total_train: 0.00793; loss/total_edit_train: 0.00793; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 39.54311; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00009; lr/lr5_train: 0.00011
[2023-03-25 15:19:14,345][trainer][INFO] - Step 9300:
[2023-03-25 15:19:14,346][trainer][INFO] - loss/edit_train: 0.06608; loss/loc_train: 0.00186; edit/acc_train: 0.99000; edit/log_prob_train: -0.06608; edit/prob_train: 0.93982; acc/pre_train: 0.50000; acc/post_train: 0.57000; nll/pre_train: 0.68841; perplexity/pre_train: 1.99055; nll/post_train: 0.69378; perplexity/post_train: 2.00126; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08622; loss/total_train: 0.00847; loss/total_edit_train: 0.00847; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 33.27352; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00009; lr/lr5_train: 0.00011
[2023-03-25 15:19:30,069][trainer][INFO] - Step 9400:
[2023-03-25 15:19:30,070][trainer][INFO] - loss/edit_train: 0.04953; loss/loc_train: 0.00180; edit/acc_train: 1.00000; edit/log_prob_train: -0.04953; edit/prob_train: 0.95209; acc/pre_train: 0.50000; acc/post_train: 0.55000; nll/pre_train: 0.70052; perplexity/pre_train: 2.01480; nll/post_train: 0.69504; perplexity/post_train: 2.00378; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08498; loss/total_train: 0.00675; loss/total_edit_train: 0.00675; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 16.54836; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00009; lr/lr5_train: 0.00011
[2023-03-25 15:19:45,842][trainer][INFO] - Step 9500:
[2023-03-25 15:19:45,842][trainer][INFO] - loss/edit_train: 0.04842; loss/loc_train: 0.00647; edit/acc_train: 1.00000; edit/log_prob_train: -0.04842; edit/prob_train: 0.95345; acc/pre_train: 0.43000; acc/post_train: 0.58000; nll/pre_train: 0.69536; perplexity/pre_train: 2.00443; nll/post_train: 0.69873; perplexity/post_train: 2.01119; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08564; loss/total_train: 0.01131; loss/total_edit_train: 0.01131; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 57.31975; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00009; lr/lr5_train: 0.00011
[2023-03-25 15:20:01,660][trainer][INFO] - Step 9600:
[2023-03-25 15:20:01,661][trainer][INFO] - loss/edit_train: 0.05272; loss/loc_train: 0.00143; edit/acc_train: 1.00000; edit/log_prob_train: -0.05272; edit/prob_train: 0.94999; acc/pre_train: 0.53000; acc/post_train: 0.45000; nll/pre_train: 0.68785; perplexity/pre_train: 1.98942; nll/post_train: 0.69733; perplexity/post_train: 2.00838; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08573; loss/total_train: 0.00671; loss/total_edit_train: 0.00671; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 19.28100; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:20:17,493][trainer][INFO] - Step 9700:
[2023-03-25 15:20:17,494][trainer][INFO] - loss/edit_train: 0.05532; loss/loc_train: 0.00507; edit/acc_train: 1.00000; edit/log_prob_train: -0.05532; edit/prob_train: 0.94771; acc/pre_train: 0.42000; acc/post_train: 0.54000; nll/pre_train: 0.70297; perplexity/pre_train: 2.01973; nll/post_train: 0.68818; perplexity/post_train: 1.99010; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08621; loss/total_train: 0.01060; loss/total_edit_train: 0.01060; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 61.42047; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:20:33,610][trainer][INFO] - Step 9800:
[2023-03-25 15:20:33,610][trainer][INFO] - loss/edit_train: 0.04516; loss/loc_train: 0.00240; edit/acc_train: 1.00000; edit/log_prob_train: -0.04516; edit/prob_train: 0.95593; acc/pre_train: 0.49000; acc/post_train: 0.61000; nll/pre_train: 0.68234; perplexity/pre_train: 1.97851; nll/post_train: 0.68594; perplexity/post_train: 1.98564; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08694; loss/total_train: 0.00691; loss/total_edit_train: 0.00691; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 49.53128; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:20:49,126][trainer][INFO] - Step 9900:
[2023-03-25 15:20:49,127][trainer][INFO] - loss/edit_train: 0.04816; loss/loc_train: 0.00202; edit/acc_train: 1.00000; edit/log_prob_train: -0.04816; edit/prob_train: 0.95383; acc/pre_train: 0.47000; acc/post_train: 0.54000; nll/pre_train: 0.68654; perplexity/pre_train: 1.98683; nll/post_train: 0.69850; perplexity/post_train: 2.01073; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08285; loss/total_train: 0.00683; loss/total_edit_train: 0.00683; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 18.79694; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:21:05,026][trainer][INFO] - Step 10000:
[2023-03-25 15:21:05,027][trainer][INFO] - loss/edit_train: 0.04363; loss/loc_train: 0.00170; edit/acc_train: 1.00000; edit/log_prob_train: -0.04363; edit/prob_train: 0.95740; acc/pre_train: 0.46000; acc/post_train: 0.60000; nll/pre_train: 0.68513; perplexity/pre_train: 1.98402; nll/post_train: 0.68977; perplexity/post_train: 1.99326; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08691; loss/total_train: 0.00606; loss/total_edit_train: 0.00606; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 17.98979; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:22:01,576][trainer][INFO] - Step 10000:
[2023-03-25 15:22:01,576][trainer][INFO] - loss/edit_val: 0.04710; loss/loc_val: 0.00394; edit/acc_val: 0.99800; edit/log_prob_val: -0.04710; edit/prob_val: 0.95525; acc/pre_val: 0.47600; acc/post_val: 0.49850; nll/pre_val: 0.69428; perplexity/pre_val: 2.00227; nll/post_val: 0.70031; perplexity/post_val: 2.01438; n_tokens/pre_val: 4.00000; n_tokens/post_val: 4.00000; time/edit_val: 0.07527; loss/total_val: 0.00865; loss/total_edit_val: 0.00865; memory/alloc_max_val: 3416440320.00000; memory/res_max_val: 3827302400.00000; eval_time/elapsed: 56.52369; eval_time/average: 0.11305
[2023-03-25 15:22:01,580][trainer][INFO] - Saving model to /home/anonymous-xme/mend/mend/outputs/2023-03-25_14-52-21_2648014163/models/bert-base-multilingual-uncased.2023-03-25_14-52-21_2648014163
[2023-03-25 15:22:01,580][trainer][INFO] - Moving old archive to /home/anonymous-xme/mend/mend/outputs/2023-03-25_14-52-21_2648014163/models/bert-base-multilingual-uncased.2023-03-25_14-52-21_2648014163.bk
[2023-03-25 15:22:02,615][trainer][INFO] - Write complete.
[2023-03-25 15:22:18,648][trainer][INFO] - Step 10100:
[2023-03-25 15:22:18,649][trainer][INFO] - loss/edit_train: 0.04625; loss/loc_train: 0.00153; edit/acc_train: 1.00000; edit/log_prob_train: -0.04625; edit/prob_train: 0.95527; acc/pre_train: 0.41000; acc/post_train: 0.55000; nll/pre_train: 0.69988; perplexity/pre_train: 2.01351; nll/post_train: 0.69520; perplexity/post_train: 2.00412; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09082; loss/total_train: 0.00616; loss/total_edit_train: 0.00616; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 32.52811; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:22:34,338][trainer][INFO] - Step 10200:
[2023-03-25 15:22:34,339][trainer][INFO] - loss/edit_train: 0.04619; loss/loc_train: 0.00239; edit/acc_train: 1.00000; edit/log_prob_train: -0.04619; edit/prob_train: 0.95580; acc/pre_train: 0.44000; acc/post_train: 0.47000; nll/pre_train: 0.70409; perplexity/pre_train: 2.02201; nll/post_train: 0.69903; perplexity/post_train: 2.01180; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08849; loss/total_train: 0.00701; loss/total_edit_train: 0.00701; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 28.95489; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:22:50,754][trainer][INFO] - Step 10300:
[2023-03-25 15:22:50,755][trainer][INFO] - loss/edit_train: 0.04905; loss/loc_train: 0.00430; edit/acc_train: 1.00000; edit/log_prob_train: -0.04905; edit/prob_train: 0.95286; acc/pre_train: 0.39000; acc/post_train: 0.43000; nll/pre_train: 0.70447; perplexity/pre_train: 2.02278; nll/post_train: 0.72347; perplexity/post_train: 2.06157; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08994; loss/total_train: 0.00920; loss/total_edit_train: 0.00920; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 29.93169; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00010; lr/lr5_train: 0.00011
[2023-03-25 15:23:07,023][trainer][INFO] - Step 10400:
[2023-03-25 15:23:07,024][trainer][INFO] - loss/edit_train: 0.04730; loss/loc_train: 0.00350; edit/acc_train: 1.00000; edit/log_prob_train: -0.04730; edit/prob_train: 0.95442; acc/pre_train: 0.54000; acc/post_train: 0.67000; nll/pre_train: 0.68274; perplexity/pre_train: 1.97930; nll/post_train: 0.68844; perplexity/post_train: 1.99061; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08926; loss/total_train: 0.00823; loss/total_edit_train: 0.00823; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 67.50627; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00011; lr/lr5_train: 0.00011
[2023-03-25 15:23:22,663][trainer][INFO] - Step 10500:
[2023-03-25 15:23:22,663][trainer][INFO] - loss/edit_train: 0.05553; loss/loc_train: 0.00493; edit/acc_train: 1.00000; edit/log_prob_train: -0.05553; edit/prob_train: 0.94767; acc/pre_train: 0.41000; acc/post_train: 0.56000; nll/pre_train: 0.69426; perplexity/pre_train: 2.00223; nll/post_train: 0.71531; perplexity/post_train: 2.04483; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08886; loss/total_train: 0.01048; loss/total_edit_train: 0.01048; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 84.31052; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00011; lr/lr5_train: 0.00011
[2023-03-25 15:23:39,055][trainer][INFO] - Step 10600:
[2023-03-25 15:23:39,056][trainer][INFO] - loss/edit_train: 0.05306; loss/loc_train: 0.00139; edit/acc_train: 1.00000; edit/log_prob_train: -0.05306; edit/prob_train: 0.95101; acc/pre_train: 0.43000; acc/post_train: 0.49000; nll/pre_train: 0.69546; perplexity/pre_train: 2.00464; nll/post_train: 0.69328; perplexity/post_train: 2.00026; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09024; loss/total_train: 0.00669; loss/total_edit_train: 0.00669; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 18.11684; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00011; lr/lr5_train: 0.00011
[2023-03-25 15:23:54,963][trainer][INFO] - Step 10700:
[2023-03-25 15:23:54,963][trainer][INFO] - loss/edit_train: 0.04992; loss/loc_train: 0.00365; edit/acc_train: 1.00000; edit/log_prob_train: -0.04992; edit/prob_train: 0.95262; acc/pre_train: 0.46000; acc/post_train: 0.51000; nll/pre_train: 0.69896; perplexity/pre_train: 2.01165; nll/post_train: 0.68571; perplexity/post_train: 1.98519; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08591; loss/total_train: 0.00864; loss/total_edit_train: 0.00864; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 71.63263; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00011; lr/lr5_train: 0.00011
[2023-03-25 15:24:11,020][trainer][INFO] - Step 10800:
[2023-03-25 15:24:11,020][trainer][INFO] - loss/edit_train: 0.04717; loss/loc_train: 0.00171; edit/acc_train: 1.00000; edit/log_prob_train: -0.04717; edit/prob_train: 0.95457; acc/pre_train: 0.44000; acc/post_train: 0.53000; nll/pre_train: 0.69299; perplexity/pre_train: 1.99969; nll/post_train: 0.69637; perplexity/post_train: 2.00645; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08923; loss/total_train: 0.00642; loss/total_edit_train: 0.00642; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 36.58639; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00011; lr/lr5_train: 0.00011
[2023-03-25 15:24:26,844][trainer][INFO] - Step 10900:
[2023-03-25 15:24:26,845][trainer][INFO] - loss/edit_train: 0.05167; loss/loc_train: 0.00398; edit/acc_train: 0.99000; edit/log_prob_train: -0.05167; edit/prob_train: 0.95338; acc/pre_train: 0.47000; acc/post_train: 0.63000; nll/pre_train: 0.69647; perplexity/pre_train: 2.00665; nll/post_train: 0.70630; perplexity/post_train: 2.02647; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08676; loss/total_train: 0.00914; loss/total_edit_train: 0.00914; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 22.90640; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00011; lr/lr5_train: 0.00011
[2023-03-25 15:24:43,760][trainer][INFO] - Step 11000:
[2023-03-25 15:24:43,761][trainer][INFO] - loss/edit_train: 0.04109; loss/loc_train: 0.00279; edit/acc_train: 1.00000; edit/log_prob_train: -0.04109; edit/prob_train: 0.95987; acc/pre_train: 0.49000; acc/post_train: 0.51000; nll/pre_train: 0.69335; perplexity/pre_train: 2.00041; nll/post_train: 0.70901; perplexity/post_train: 2.03197; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09403; loss/total_train: 0.00689; loss/total_edit_train: 0.00689; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 28.69946; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00011; lr/lr5_train: 0.00011
[2023-03-25 15:25:00,452][trainer][INFO] - Step 11100:
[2023-03-25 15:25:00,453][trainer][INFO] - loss/edit_train: 0.04483; loss/loc_train: 0.00219; edit/acc_train: 1.00000; edit/log_prob_train: -0.04483; edit/prob_train: 0.95771; acc/pre_train: 0.44000; acc/post_train: 0.49000; nll/pre_train: 0.70255; perplexity/pre_train: 2.01890; nll/post_train: 0.69335; perplexity/post_train: 2.00040; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09187; loss/total_train: 0.00667; loss/total_edit_train: 0.00667; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 10.11548; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00007; lr/lr3_train: 0.00009; lr/lr4_train: 0.00011; lr/lr5_train: 0.00011
[2023-03-25 15:25:16,353][trainer][INFO] - Step 11200:
[2023-03-25 15:25:16,354][trainer][INFO] - loss/edit_train: 0.04600; loss/loc_train: 0.00385; edit/acc_train: 1.00000; edit/log_prob_train: -0.04600; edit/prob_train: 0.95630; acc/pre_train: 0.48000; acc/post_train: 0.53000; nll/pre_train: 0.69442; perplexity/pre_train: 2.00256; nll/post_train: 0.68646; perplexity/post_train: 1.98667; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08671; loss/total_train: 0.00845; loss/total_edit_train: 0.00845; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 89.54772; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00007; lr/lr3_train: 0.00009; lr/lr4_train: 0.00011; lr/lr5_train: 0.00011
[2023-03-25 15:25:33,274][trainer][INFO] - Step 11300:
[2023-03-25 15:25:33,274][trainer][INFO] - loss/edit_train: 0.03736; loss/loc_train: 0.00212; edit/acc_train: 1.00000; edit/log_prob_train: -0.03736; edit/prob_train: 0.96337; acc/pre_train: 0.50000; acc/post_train: 0.63000; nll/pre_train: 0.68722; perplexity/pre_train: 1.98819; nll/post_train: 0.68777; perplexity/post_train: 1.98928; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09296; loss/total_train: 0.00585; loss/total_edit_train: 0.00585; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 13.07360; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00007; lr/lr3_train: 0.00009; lr/lr4_train: 0.00011; lr/lr5_train: 0.00011
[2023-03-25 15:25:49,681][trainer][INFO] - Step 11400:
[2023-03-25 15:25:49,682][trainer][INFO] - loss/edit_train: 0.03899; loss/loc_train: 0.00266; edit/acc_train: 1.00000; edit/log_prob_train: -0.03899; edit/prob_train: 0.96185; acc/pre_train: 0.44000; acc/post_train: 0.63000; nll/pre_train: 0.70201; perplexity/pre_train: 2.01780; nll/post_train: 0.68909; perplexity/post_train: 1.99190; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09066; loss/total_train: 0.00655; loss/total_edit_train: 0.00655; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 30.41095; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00011; lr/lr5_train: 0.00011
[2023-03-25 15:26:06,124][trainer][INFO] - Step 11500:
[2023-03-25 15:26:06,125][trainer][INFO] - loss/edit_train: 0.05048; loss/loc_train: 0.00195; edit/acc_train: 1.00000; edit/log_prob_train: -0.05048; edit/prob_train: 0.95262; acc/pre_train: 0.47000; acc/post_train: 0.57000; nll/pre_train: 0.68739; perplexity/pre_train: 1.98851; nll/post_train: 0.69488; perplexity/post_train: 2.00346; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09323; loss/total_train: 0.00699; loss/total_edit_train: 0.00699; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 48.72007; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00011; lr/lr5_train: 0.00011
[2023-03-25 15:26:22,277][trainer][INFO] - Step 11600:
[2023-03-25 15:26:22,278][trainer][INFO] - loss/edit_train: 0.04176; loss/loc_train: 0.00263; edit/acc_train: 1.00000; edit/log_prob_train: -0.04176; edit/prob_train: 0.95964; acc/pre_train: 0.41000; acc/post_train: 0.54000; nll/pre_train: 0.71121; perplexity/pre_train: 2.03646; nll/post_train: 0.68552; perplexity/post_train: 1.98481; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08823; loss/total_train: 0.00680; loss/total_edit_train: 0.00680; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 34.28872; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00011; lr/lr5_train: 0.00011
[2023-03-25 15:26:38,548][trainer][INFO] - Step 11700:
[2023-03-25 15:26:38,548][trainer][INFO] - loss/edit_train: 0.04077; loss/loc_train: 0.00235; edit/acc_train: 1.00000; edit/log_prob_train: -0.04077; edit/prob_train: 0.96067; acc/pre_train: 0.43000; acc/post_train: 0.58000; nll/pre_train: 0.69025; perplexity/pre_train: 1.99421; nll/post_train: 0.69809; perplexity/post_train: 2.00992; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09216; loss/total_train: 0.00642; loss/total_edit_train: 0.00642; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 27.52243; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00005; lr/lr3_train: 0.00008; lr/lr4_train: 0.00011; lr/lr5_train: 0.00011
[2023-03-25 15:26:54,204][trainer][INFO] - Step 11800:
[2023-03-25 15:26:54,204][trainer][INFO] - loss/edit_train: 0.05975; loss/loc_train: 0.00148; edit/acc_train: 0.99000; edit/log_prob_train: -0.05975; edit/prob_train: 0.94739; acc/pre_train: 0.50000; acc/post_train: 0.57000; nll/pre_train: 0.68600; perplexity/pre_train: 1.98576; nll/post_train: 0.68886; perplexity/post_train: 1.99144; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08939; loss/total_train: 0.00746; loss/total_edit_train: 0.00746; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 37.40082; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00005; lr/lr3_train: 0.00008; lr/lr4_train: 0.00011; lr/lr5_train: 0.00011
[2023-03-25 15:27:10,297][trainer][INFO] - Step 11900:
[2023-03-25 15:27:10,297][trainer][INFO] - loss/edit_train: 0.04598; loss/loc_train: 0.00118; edit/acc_train: 1.00000; edit/log_prob_train: -0.04598; edit/prob_train: 0.95633; acc/pre_train: 0.48000; acc/post_train: 0.53000; nll/pre_train: 0.70446; perplexity/pre_train: 2.02276; nll/post_train: 0.69105; perplexity/post_train: 1.99580; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08778; loss/total_train: 0.00577; loss/total_edit_train: 0.00577; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 41.83706; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00005; lr/lr3_train: 0.00008; lr/lr4_train: 0.00011; lr/lr5_train: 0.00011
[2023-03-25 15:27:26,601][trainer][INFO] - Step 12000:
[2023-03-25 15:27:26,601][trainer][INFO] - loss/edit_train: 0.03476; loss/loc_train: 0.00457; edit/acc_train: 1.00000; edit/log_prob_train: -0.03476; edit/prob_train: 0.96586; acc/pre_train: 0.46000; acc/post_train: 0.51000; nll/pre_train: 0.69941; perplexity/pre_train: 2.01257; nll/post_train: 0.68374; perplexity/post_train: 1.98127; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08987; loss/total_train: 0.00805; loss/total_edit_train: 0.00805; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 42.17034; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00005; lr/lr3_train: 0.00008; lr/lr4_train: 0.00011; lr/lr5_train: 0.00012
[2023-03-25 15:27:42,929][trainer][INFO] - Step 12100:
[2023-03-25 15:27:42,930][trainer][INFO] - loss/edit_train: 0.03710; loss/loc_train: 0.00170; edit/acc_train: 1.00000; edit/log_prob_train: -0.03710; edit/prob_train: 0.96387; acc/pre_train: 0.39000; acc/post_train: 0.46000; nll/pre_train: 0.70438; perplexity/pre_train: 2.02260; nll/post_train: 0.69135; perplexity/post_train: 1.99641; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08867; loss/total_train: 0.00541; loss/total_edit_train: 0.00541; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 22.90703; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00005; lr/lr3_train: 0.00008; lr/lr4_train: 0.00011; lr/lr5_train: 0.00012
[2023-03-25 15:27:59,383][trainer][INFO] - Step 12200:
[2023-03-25 15:27:59,384][trainer][INFO] - loss/edit_train: 0.04155; loss/loc_train: 0.00200; edit/acc_train: 1.00000; edit/log_prob_train: -0.04155; edit/prob_train: 0.96065; acc/pre_train: 0.56000; acc/post_train: 0.55000; nll/pre_train: 0.68907; perplexity/pre_train: 1.99187; nll/post_train: 0.69246; perplexity/post_train: 1.99862; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09126; loss/total_train: 0.00616; loss/total_edit_train: 0.00616; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 19.32526; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00005; lr/lr3_train: 0.00008; lr/lr4_train: 0.00011; lr/lr5_train: 0.00012
[2023-03-25 15:28:15,358][trainer][INFO] - Step 12300:
[2023-03-25 15:28:15,359][trainer][INFO] - loss/edit_train: 0.03777; loss/loc_train: 0.00300; edit/acc_train: 1.00000; edit/log_prob_train: -0.03777; edit/prob_train: 0.96324; acc/pre_train: 0.50000; acc/post_train: 0.57000; nll/pre_train: 0.69775; perplexity/pre_train: 2.00922; nll/post_train: 0.68215; perplexity/post_train: 1.97813; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08648; loss/total_train: 0.00678; loss/total_edit_train: 0.00678; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 40.72613; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00005; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:28:31,502][trainer][INFO] - Step 12400:
[2023-03-25 15:28:31,503][trainer][INFO] - loss/edit_train: 0.03393; loss/loc_train: 0.00168; edit/acc_train: 1.00000; edit/log_prob_train: -0.03393; edit/prob_train: 0.96671; acc/pre_train: 0.55000; acc/post_train: 0.55000; nll/pre_train: 0.70041; perplexity/pre_train: 2.01457; nll/post_train: 0.69312; perplexity/post_train: 1.99994; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08982; loss/total_train: 0.00507; loss/total_edit_train: 0.00507; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 18.42461; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00005; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:28:47,587][trainer][INFO] - Step 12500:
[2023-03-25 15:28:47,587][trainer][INFO] - loss/edit_train: 0.04145; loss/loc_train: 0.00550; edit/acc_train: 1.00000; edit/log_prob_train: -0.04145; edit/prob_train: 0.96012; acc/pre_train: 0.41000; acc/post_train: 0.37000; nll/pre_train: 0.70106; perplexity/pre_train: 2.01588; nll/post_train: 0.72873; perplexity/post_train: 2.07244; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08841; loss/total_train: 0.00964; loss/total_edit_train: 0.00964; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 34.02083; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:29:03,794][trainer][INFO] - Step 12600:
[2023-03-25 15:29:03,794][trainer][INFO] - loss/edit_train: 0.04590; loss/loc_train: 0.00257; edit/acc_train: 1.00000; edit/log_prob_train: -0.04590; edit/prob_train: 0.95676; acc/pre_train: 0.53000; acc/post_train: 0.55000; nll/pre_train: 0.67714; perplexity/pre_train: 1.96825; nll/post_train: 0.68766; perplexity/post_train: 1.98906; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09161; loss/total_train: 0.00716; loss/total_edit_train: 0.00716; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 33.50100; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:29:20,104][trainer][INFO] - Step 12700:
[2023-03-25 15:29:20,104][trainer][INFO] - loss/edit_train: 0.04262; loss/loc_train: 0.00178; edit/acc_train: 1.00000; edit/log_prob_train: -0.04262; edit/prob_train: 0.95907; acc/pre_train: 0.41000; acc/post_train: 0.48000; nll/pre_train: 0.70701; perplexity/pre_train: 2.02791; nll/post_train: 0.68828; perplexity/post_train: 1.99029; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08839; loss/total_train: 0.00604; loss/total_edit_train: 0.00604; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 40.84652; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00005; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:29:36,842][trainer][INFO] - Step 12800:
[2023-03-25 15:29:36,843][trainer][INFO] - loss/edit_train: 0.03807; loss/loc_train: 0.00181; edit/acc_train: 1.00000; edit/log_prob_train: -0.03807; edit/prob_train: 0.96287; acc/pre_train: 0.47000; acc/post_train: 0.66000; nll/pre_train: 0.69026; perplexity/pre_train: 1.99423; nll/post_train: 0.67904; perplexity/post_train: 1.97199; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09102; loss/total_train: 0.00562; loss/total_edit_train: 0.00562; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 30.16872; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:29:54,089][trainer][INFO] - Step 12900:
[2023-03-25 15:29:54,089][trainer][INFO] - loss/edit_train: 0.04430; loss/loc_train: 0.00161; edit/acc_train: 1.00000; edit/log_prob_train: -0.04430; edit/prob_train: 0.95869; acc/pre_train: 0.41000; acc/post_train: 0.57000; nll/pre_train: 0.69871; perplexity/pre_train: 2.01116; nll/post_train: 0.69375; perplexity/post_train: 2.00121; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09554; loss/total_train: 0.00604; loss/total_edit_train: 0.00604; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 25.51903; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:30:09,609][trainer][INFO] - Step 13000:
[2023-03-25 15:30:09,609][trainer][INFO] - loss/edit_train: 0.03232; loss/loc_train: 0.00589; edit/acc_train: 1.00000; edit/log_prob_train: -0.03232; edit/prob_train: 0.96821; acc/pre_train: 0.40000; acc/post_train: 0.59000; nll/pre_train: 0.69232; perplexity/pre_train: 1.99834; nll/post_train: 0.70096; perplexity/post_train: 2.01568; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08488; loss/total_train: 0.00912; loss/total_edit_train: 0.00912; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 98.52996; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:30:25,924][trainer][INFO] - Step 13100:
[2023-03-25 15:30:25,924][trainer][INFO] - loss/edit_train: 0.04655; loss/loc_train: 0.00785; edit/acc_train: 1.00000; edit/log_prob_train: -0.04655; edit/prob_train: 0.95631; acc/pre_train: 0.33000; acc/post_train: 0.56000; nll/pre_train: 0.72692; perplexity/pre_train: 2.06869; nll/post_train: 0.69496; perplexity/post_train: 2.00363; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09104; loss/total_train: 0.01251; loss/total_edit_train: 0.01251; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 142.89004; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:30:42,037][trainer][INFO] - Step 13200:
[2023-03-25 15:30:42,037][trainer][INFO] - loss/edit_train: 0.06118; loss/loc_train: 0.00184; edit/acc_train: 0.99000; edit/log_prob_train: -0.06118; edit/prob_train: 0.94501; acc/pre_train: 0.48000; acc/post_train: 0.48000; nll/pre_train: 0.68998; perplexity/pre_train: 1.99367; nll/post_train: 0.69656; perplexity/post_train: 2.00684; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08848; loss/total_train: 0.00796; loss/total_edit_train: 0.00796; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 42.12997; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:30:58,716][trainer][INFO] - Step 13300:
[2023-03-25 15:30:58,716][trainer][INFO] - loss/edit_train: 0.04479; loss/loc_train: 0.00135; edit/acc_train: 1.00000; edit/log_prob_train: -0.04479; edit/prob_train: 0.95681; acc/pre_train: 0.52000; acc/post_train: 0.57000; nll/pre_train: 0.69030; perplexity/pre_train: 1.99430; nll/post_train: 0.68626; perplexity/post_train: 1.98628; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09269; loss/total_train: 0.00583; loss/total_edit_train: 0.00583; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 32.15701; lr/lr0_train: 0.00022; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00009; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:31:14,155][trainer][INFO] - Step 13400:
[2023-03-25 15:31:14,156][trainer][INFO] - loss/edit_train: 0.03219; loss/loc_train: 0.00212; edit/acc_train: 1.00000; edit/log_prob_train: -0.03219; edit/prob_train: 0.96834; acc/pre_train: 0.46000; acc/post_train: 0.51000; nll/pre_train: 0.69963; perplexity/pre_train: 2.01300; nll/post_train: 0.70149; perplexity/post_train: 2.01675; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08931; loss/total_train: 0.00534; loss/total_edit_train: 0.00534; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 43.58421; lr/lr0_train: 0.00022; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:31:29,649][trainer][INFO] - Step 13500:
[2023-03-25 15:31:29,650][trainer][INFO] - loss/edit_train: 0.04117; loss/loc_train: 0.00295; edit/acc_train: 0.99000; edit/log_prob_train: -0.04117; edit/prob_train: 0.96181; acc/pre_train: 0.47000; acc/post_train: 0.47000; nll/pre_train: 0.68756; perplexity/pre_train: 1.98885; nll/post_train: 0.69737; perplexity/post_train: 2.00846; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08838; loss/total_train: 0.00707; loss/total_edit_train: 0.00707; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 51.53561; lr/lr0_train: 0.00022; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:31:45,437][trainer][INFO] - Step 13600:
[2023-03-25 15:31:45,438][trainer][INFO] - loss/edit_train: 0.03401; loss/loc_train: 0.00386; edit/acc_train: 1.00000; edit/log_prob_train: -0.03401; edit/prob_train: 0.96664; acc/pre_train: 0.40000; acc/post_train: 0.56000; nll/pre_train: 0.70529; perplexity/pre_train: 2.02444; nll/post_train: 0.68539; perplexity/post_train: 1.98455; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09181; loss/total_train: 0.00727; loss/total_edit_train: 0.00727; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 54.87839; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:32:01,091][trainer][INFO] - Step 13700:
[2023-03-25 15:32:01,092][trainer][INFO] - loss/edit_train: 0.05232; loss/loc_train: 0.00206; edit/acc_train: 1.00000; edit/log_prob_train: -0.05232; edit/prob_train: 0.95198; acc/pre_train: 0.36000; acc/post_train: 0.53000; nll/pre_train: 0.70945; perplexity/pre_train: 2.03286; nll/post_train: 0.69181; perplexity/post_train: 1.99733; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08692; loss/total_train: 0.00729; loss/total_edit_train: 0.00729; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 35.81268; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:32:17,581][trainer][INFO] - Step 13800:
[2023-03-25 15:32:17,581][trainer][INFO] - loss/edit_train: 0.03349; loss/loc_train: 0.00139; edit/acc_train: 1.00000; edit/log_prob_train: -0.03349; edit/prob_train: 0.96721; acc/pre_train: 0.39000; acc/post_train: 0.46000; nll/pre_train: 0.70909; perplexity/pre_train: 2.03215; nll/post_train: 0.69849; perplexity/post_train: 2.01071; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09007; loss/total_train: 0.00474; loss/total_edit_train: 0.00474; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 17.24405; lr/lr0_train: 0.00022; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:32:33,494][trainer][INFO] - Step 13900:
[2023-03-25 15:32:33,494][trainer][INFO] - loss/edit_train: 0.03172; loss/loc_train: 0.00396; edit/acc_train: 1.00000; edit/log_prob_train: -0.03172; edit/prob_train: 0.96882; acc/pre_train: 0.49000; acc/post_train: 0.55000; nll/pre_train: 0.69389; perplexity/pre_train: 2.00148; nll/post_train: 0.69896; perplexity/post_train: 2.01165; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08716; loss/total_train: 0.00713; loss/total_edit_train: 0.00713; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 95.15112; lr/lr0_train: 0.00022; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:32:49,906][trainer][INFO] - Step 14000:
[2023-03-25 15:32:49,906][trainer][INFO] - loss/edit_train: 0.04445; loss/loc_train: 0.00198; edit/acc_train: 1.00000; edit/log_prob_train: -0.04445; edit/prob_train: 0.95856; acc/pre_train: 0.45000; acc/post_train: 0.51000; nll/pre_train: 0.70030; perplexity/pre_train: 2.01435; nll/post_train: 0.69095; perplexity/post_train: 1.99561; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08983; loss/total_train: 0.00643; loss/total_edit_train: 0.00643; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 28.14179; lr/lr0_train: 0.00022; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:33:05,769][trainer][INFO] - Step 14100:
[2023-03-25 15:33:05,770][trainer][INFO] - loss/edit_train: 0.04079; loss/loc_train: 0.00480; edit/acc_train: 1.00000; edit/log_prob_train: -0.04079; edit/prob_train: 0.96089; acc/pre_train: 0.43000; acc/post_train: 0.64000; nll/pre_train: 0.69135; perplexity/pre_train: 1.99642; nll/post_train: 0.71604; perplexity/post_train: 2.04631; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08729; loss/total_train: 0.00888; loss/total_edit_train: 0.00888; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 68.58513; lr/lr0_train: 0.00022; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:33:22,449][trainer][INFO] - Step 14200:
[2023-03-25 15:33:22,450][trainer][INFO] - loss/edit_train: 0.04190; loss/loc_train: 0.00146; edit/acc_train: 0.99000; edit/log_prob_train: -0.04190; edit/prob_train: 0.96135; acc/pre_train: 0.40000; acc/post_train: 0.54000; nll/pre_train: 0.71190; perplexity/pre_train: 2.03785; nll/post_train: 0.68818; perplexity/post_train: 1.99009; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09109; loss/total_train: 0.00566; loss/total_edit_train: 0.00566; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 23.84282; lr/lr0_train: 0.00022; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:33:38,919][trainer][INFO] - Step 14300:
[2023-03-25 15:33:38,920][trainer][INFO] - loss/edit_train: 0.03919; loss/loc_train: 0.00755; edit/acc_train: 1.00000; edit/log_prob_train: -0.03919; edit/prob_train: 0.96297; acc/pre_train: 0.49000; acc/post_train: 0.55000; nll/pre_train: 0.69133; perplexity/pre_train: 1.99636; nll/post_train: 0.67445; perplexity/post_train: 1.96294; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09039; loss/total_train: 0.01147; loss/total_edit_train: 0.01147; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 127.61595; lr/lr0_train: 0.00022; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:33:56,004][trainer][INFO] - Step 14400:
[2023-03-25 15:33:56,004][trainer][INFO] - loss/edit_train: 0.03899; loss/loc_train: 0.00291; edit/acc_train: 1.00000; edit/log_prob_train: -0.03899; edit/prob_train: 0.96353; acc/pre_train: 0.41000; acc/post_train: 0.51000; nll/pre_train: 0.70537; perplexity/pre_train: 2.02459; nll/post_train: 0.68644; perplexity/post_train: 1.98664; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09237; loss/total_train: 0.00681; loss/total_edit_train: 0.00681; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 79.16713; lr/lr0_train: 0.00022; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:34:13,599][trainer][INFO] - Step 14500:
[2023-03-25 15:34:13,600][trainer][INFO] - loss/edit_train: 0.03402; loss/loc_train: 0.00172; edit/acc_train: 1.00000; edit/log_prob_train: -0.03402; edit/prob_train: 0.96677; acc/pre_train: 0.43000; acc/post_train: 0.42000; nll/pre_train: 0.70219; perplexity/pre_train: 2.01816; nll/post_train: 0.69806; perplexity/post_train: 2.00985; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09623; loss/total_train: 0.00512; loss/total_edit_train: 0.00512; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 26.58667; lr/lr0_train: 0.00022; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00013; lr/lr5_train: 0.00012
[2023-03-25 15:34:30,548][trainer][INFO] - Step 14600:
[2023-03-25 15:34:30,548][trainer][INFO] - loss/edit_train: 0.03432; loss/loc_train: 0.00177; edit/acc_train: 1.00000; edit/log_prob_train: -0.03432; edit/prob_train: 0.96636; acc/pre_train: 0.51000; acc/post_train: 0.50000; nll/pre_train: 0.69229; perplexity/pre_train: 1.99828; nll/post_train: 0.70083; perplexity/post_train: 2.01542; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09377; loss/total_train: 0.00520; loss/total_edit_train: 0.00520; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 25.72074; lr/lr0_train: 0.00022; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:34:47,951][trainer][INFO] - Step 14700:
[2023-03-25 15:34:47,952][trainer][INFO] - loss/edit_train: 0.04151; loss/loc_train: 0.00133; edit/acc_train: 1.00000; edit/log_prob_train: -0.04151; edit/prob_train: 0.96088; acc/pre_train: 0.48000; acc/post_train: 0.52000; nll/pre_train: 0.68715; perplexity/pre_train: 1.98805; nll/post_train: 0.69637; perplexity/post_train: 2.00646; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09528; loss/total_train: 0.00549; loss/total_edit_train: 0.00549; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 20.60950; lr/lr0_train: 0.00022; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00007; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:35:04,730][trainer][INFO] - Step 14800:
[2023-03-25 15:35:04,730][trainer][INFO] - loss/edit_train: 0.03665; loss/loc_train: 0.00115; edit/acc_train: 1.00000; edit/log_prob_train: -0.03665; edit/prob_train: 0.96454; acc/pre_train: 0.47000; acc/post_train: 0.51000; nll/pre_train: 0.69692; perplexity/pre_train: 2.00756; nll/post_train: 0.69033; perplexity/post_train: 1.99438; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09334; loss/total_train: 0.00481; loss/total_edit_train: 0.00481; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 14.83740; lr/lr0_train: 0.00022; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00007; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:35:21,726][trainer][INFO] - Step 14900:
[2023-03-25 15:35:21,726][trainer][INFO] - loss/edit_train: 0.03158; loss/loc_train: 0.00117; edit/acc_train: 1.00000; edit/log_prob_train: -0.03158; edit/prob_train: 0.96904; acc/pre_train: 0.43000; acc/post_train: 0.47000; nll/pre_train: 0.69789; perplexity/pre_train: 2.00950; nll/post_train: 0.69594; perplexity/post_train: 2.00560; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09300; loss/total_train: 0.00433; loss/total_edit_train: 0.00433; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 8.41818; lr/lr0_train: 0.00022; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:35:38,472][trainer][INFO] - Step 15000:
[2023-03-25 15:35:38,472][trainer][INFO] - loss/edit_train: 0.03179; loss/loc_train: 0.00121; edit/acc_train: 1.00000; edit/log_prob_train: -0.03179; edit/prob_train: 0.96881; acc/pre_train: 0.45000; acc/post_train: 0.55000; nll/pre_train: 0.69379; perplexity/pre_train: 2.00129; nll/post_train: 0.69145; perplexity/post_train: 1.99662; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09488; loss/total_train: 0.00439; loss/total_edit_train: 0.00439; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 10.08949; lr/lr0_train: 0.00022; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:36:39,853][trainer][INFO] - Step 15000:
[2023-03-25 15:36:39,854][trainer][INFO] - loss/edit_val: 0.03365; loss/loc_val: 0.00328; edit/acc_val: 0.99800; edit/log_prob_val: -0.03365; edit/prob_val: 0.96775; acc/pre_val: 0.47600; acc/post_val: 0.49250; nll/pre_val: 0.69428; perplexity/pre_val: 2.00227; nll/post_val: 0.70091; perplexity/post_val: 2.01558; n_tokens/pre_val: 4.00000; n_tokens/post_val: 4.00000; time/edit_val: 0.08176; loss/total_val: 0.00665; loss/total_edit_val: 0.00665; memory/alloc_max_val: 3416440320.00000; memory/res_max_val: 3827302400.00000; eval_time/elapsed: 61.35466; eval_time/average: 0.12271
[2023-03-25 15:36:39,858][trainer][INFO] - Saving model to /home/anonymous-xme/mend/mend/outputs/2023-03-25_14-52-21_2648014163/models/bert-base-multilingual-uncased.2023-03-25_14-52-21_2648014163
[2023-03-25 15:36:39,858][trainer][INFO] - Moving old archive to /home/anonymous-xme/mend/mend/outputs/2023-03-25_14-52-21_2648014163/models/bert-base-multilingual-uncased.2023-03-25_14-52-21_2648014163.bk
[2023-03-25 15:36:40,901][trainer][INFO] - Write complete.
[2023-03-25 15:36:57,319][trainer][INFO] - Step 15100:
[2023-03-25 15:36:57,319][trainer][INFO] - loss/edit_train: 0.03537; loss/loc_train: 0.00130; edit/acc_train: 1.00000; edit/log_prob_train: -0.03537; edit/prob_train: 0.96596; acc/pre_train: 0.46000; acc/post_train: 0.48000; nll/pre_train: 0.68583; perplexity/pre_train: 1.98542; nll/post_train: 0.69243; perplexity/post_train: 1.99857; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08901; loss/total_train: 0.00484; loss/total_edit_train: 0.00484; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 21.69231; lr/lr0_train: 0.00022; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:37:13,873][trainer][INFO] - Step 15200:
[2023-03-25 15:37:13,874][trainer][INFO] - loss/edit_train: 0.03365; loss/loc_train: 0.00164; edit/acc_train: 1.00000; edit/log_prob_train: -0.03365; edit/prob_train: 0.96770; acc/pre_train: 0.37000; acc/post_train: 0.55000; nll/pre_train: 0.70208; perplexity/pre_train: 2.01794; nll/post_train: 0.68864; perplexity/post_train: 1.99101; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09009; loss/total_train: 0.00500; loss/total_edit_train: 0.00500; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 14.20462; lr/lr0_train: 0.00022; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:37:31,128][trainer][INFO] - Step 15300:
[2023-03-25 15:37:31,129][trainer][INFO] - loss/edit_train: 0.03068; loss/loc_train: 0.00918; edit/acc_train: 1.00000; edit/log_prob_train: -0.03068; edit/prob_train: 0.96985; acc/pre_train: 0.38000; acc/post_train: 0.57000; nll/pre_train: 0.71198; perplexity/pre_train: 2.03803; nll/post_train: 0.67422; perplexity/post_train: 1.96251; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09374; loss/total_train: 0.01225; loss/total_edit_train: 0.01225; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 105.29348; lr/lr0_train: 0.00022; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:37:48,354][trainer][INFO] - Step 15400:
[2023-03-25 15:37:48,354][trainer][INFO] - loss/edit_train: 0.03431; loss/loc_train: 0.00128; edit/acc_train: 1.00000; edit/log_prob_train: -0.03431; edit/prob_train: 0.96716; acc/pre_train: 0.50000; acc/post_train: 0.57000; nll/pre_train: 0.69170; perplexity/pre_train: 1.99710; nll/post_train: 0.69234; perplexity/post_train: 1.99839; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09384; loss/total_train: 0.00472; loss/total_edit_train: 0.00472; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 20.40612; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00007; lr/lr3_train: 0.00007; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:38:05,501][trainer][INFO] - Step 15500:
[2023-03-25 15:38:05,502][trainer][INFO] - loss/edit_train: 0.04067; loss/loc_train: 0.00187; edit/acc_train: 1.00000; edit/log_prob_train: -0.04067; edit/prob_train: 0.96176; acc/pre_train: 0.44000; acc/post_train: 0.49000; nll/pre_train: 0.70315; perplexity/pre_train: 2.02011; nll/post_train: 0.70539; perplexity/post_train: 2.02464; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09320; loss/total_train: 0.00594; loss/total_edit_train: 0.00594; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 59.09172; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:38:22,008][trainer][INFO] - Step 15600:
[2023-03-25 15:38:22,009][trainer][INFO] - loss/edit_train: 0.03598; loss/loc_train: 0.00378; edit/acc_train: 1.00000; edit/log_prob_train: -0.03598; edit/prob_train: 0.96588; acc/pre_train: 0.38000; acc/post_train: 0.50000; nll/pre_train: 0.71278; perplexity/pre_train: 2.03965; nll/post_train: 0.69921; perplexity/post_train: 2.01217; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08945; loss/total_train: 0.00737; loss/total_edit_train: 0.00737; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 80.74686; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:38:38,973][trainer][INFO] - Step 15700:
[2023-03-25 15:38:38,973][trainer][INFO] - loss/edit_train: 0.03845; loss/loc_train: 0.00636; edit/acc_train: 1.00000; edit/log_prob_train: -0.03845; edit/prob_train: 0.96300; acc/pre_train: 0.49000; acc/post_train: 0.53000; nll/pre_train: 0.69425; perplexity/pre_train: 2.00222; nll/post_train: 0.71035; perplexity/post_train: 2.03471; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09330; loss/total_train: 0.01020; loss/total_edit_train: 0.01020; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 61.41496; lr/lr0_train: 0.00022; lr/lr1_train: 0.00022; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:38:56,331][trainer][INFO] - Step 15800:
[2023-03-25 15:38:56,331][trainer][INFO] - loss/edit_train: 0.03431; loss/loc_train: 0.00146; edit/acc_train: 1.00000; edit/log_prob_train: -0.03431; edit/prob_train: 0.96634; acc/pre_train: 0.50000; acc/post_train: 0.50000; nll/pre_train: 0.68616; perplexity/pre_train: 1.98607; nll/post_train: 0.69389; perplexity/post_train: 2.00149; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09386; loss/total_train: 0.00489; loss/total_edit_train: 0.00489; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 19.42468; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:39:13,731][trainer][INFO] - Step 15900:
[2023-03-25 15:39:13,732][trainer][INFO] - loss/edit_train: 0.03527; loss/loc_train: 0.00177; edit/acc_train: 1.00000; edit/log_prob_train: -0.03527; edit/prob_train: 0.96570; acc/pre_train: 0.39000; acc/post_train: 0.41000; nll/pre_train: 0.71449; perplexity/pre_train: 2.04314; nll/post_train: 0.69667; perplexity/post_train: 2.00707; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09419; loss/total_train: 0.00530; loss/total_edit_train: 0.00530; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 10.84535; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:39:30,361][trainer][INFO] - Step 16000:
[2023-03-25 15:39:30,362][trainer][INFO] - loss/edit_train: 0.03363; loss/loc_train: 0.00362; edit/acc_train: 1.00000; edit/log_prob_train: -0.03363; edit/prob_train: 0.96732; acc/pre_train: 0.44000; acc/post_train: 0.58000; nll/pre_train: 0.69990; perplexity/pre_train: 2.01355; nll/post_train: 0.70307; perplexity/post_train: 2.01994; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08892; loss/total_train: 0.00698; loss/total_edit_train: 0.00698; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 83.10251; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:39:47,323][trainer][INFO] - Step 16100:
[2023-03-25 15:39:47,323][trainer][INFO] - loss/edit_train: 0.03097; loss/loc_train: 0.00529; edit/acc_train: 1.00000; edit/log_prob_train: -0.03097; edit/prob_train: 0.96965; acc/pre_train: 0.52000; acc/post_train: 0.56000; nll/pre_train: 0.67861; perplexity/pre_train: 1.97114; nll/post_train: 0.67531; perplexity/post_train: 1.96465; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09161; loss/total_train: 0.00838; loss/total_edit_train: 0.00838; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 30.06519; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:40:05,004][trainer][INFO] - Step 16200:
[2023-03-25 15:40:05,004][trainer][INFO] - loss/edit_train: 0.02871; loss/loc_train: 0.00222; edit/acc_train: 1.00000; edit/log_prob_train: -0.02871; edit/prob_train: 0.97171; acc/pre_train: 0.46000; acc/post_train: 0.64000; nll/pre_train: 0.69592; perplexity/pre_train: 2.00555; nll/post_train: 0.69566; perplexity/post_train: 2.00503; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09539; loss/total_train: 0.00509; loss/total_edit_train: 0.00509; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 42.99205; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:40:21,917][trainer][INFO] - Step 16300:
[2023-03-25 15:40:21,918][trainer][INFO] - loss/edit_train: 0.02954; loss/loc_train: 0.00173; edit/acc_train: 1.00000; edit/log_prob_train: -0.02954; edit/prob_train: 0.97099; acc/pre_train: 0.41000; acc/post_train: 0.55000; nll/pre_train: 0.70549; perplexity/pre_train: 2.02484; nll/post_train: 0.69326; perplexity/post_train: 2.00023; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09155; loss/total_train: 0.00468; loss/total_edit_train: 0.00468; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 25.07428; lr/lr0_train: 0.00021; lr/lr1_train: 0.00022; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:40:38,851][trainer][INFO] - Step 16400:
[2023-03-25 15:40:38,852][trainer][INFO] - loss/edit_train: 0.02921; loss/loc_train: 0.00184; edit/acc_train: 1.00000; edit/log_prob_train: -0.02921; edit/prob_train: 0.97124; acc/pre_train: 0.54000; acc/post_train: 0.56000; nll/pre_train: 0.67862; perplexity/pre_train: 1.97116; nll/post_train: 0.69057; perplexity/post_train: 1.99485; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09249; loss/total_train: 0.00476; loss/total_edit_train: 0.00476; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 12.27008; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:40:56,162][trainer][INFO] - Step 16500:
[2023-03-25 15:40:56,163][trainer][INFO] - loss/edit_train: 0.03412; loss/loc_train: 0.00158; edit/acc_train: 1.00000; edit/log_prob_train: -0.03412; edit/prob_train: 0.96710; acc/pre_train: 0.47000; acc/post_train: 0.68000; nll/pre_train: 0.68354; perplexity/pre_train: 1.98088; nll/post_train: 0.67962; perplexity/post_train: 1.97312; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09328; loss/total_train: 0.00500; loss/total_edit_train: 0.00500; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 23.70365; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:41:12,756][trainer][INFO] - Step 16600:
[2023-03-25 15:41:12,757][trainer][INFO] - loss/edit_train: 0.02786; loss/loc_train: 0.00181; edit/acc_train: 1.00000; edit/log_prob_train: -0.02786; edit/prob_train: 0.97253; acc/pre_train: 0.41000; acc/post_train: 0.51000; nll/pre_train: 0.70067; perplexity/pre_train: 2.01509; nll/post_train: 0.69034; perplexity/post_train: 1.99439; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09358; loss/total_train: 0.00460; loss/total_edit_train: 0.00460; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 10.62050; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:41:30,032][trainer][INFO] - Step 16700:
[2023-03-25 15:41:30,032][trainer][INFO] - loss/edit_train: 0.02879; loss/loc_train: 0.00488; edit/acc_train: 1.00000; edit/log_prob_train: -0.02879; edit/prob_train: 0.97172; acc/pre_train: 0.45000; acc/post_train: 0.56000; nll/pre_train: 0.69561; perplexity/pre_train: 2.00493; nll/post_train: 0.71207; perplexity/post_train: 2.03820; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09579; loss/total_train: 0.00776; loss/total_edit_train: 0.00776; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 27.00548; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:41:46,897][trainer][INFO] - Step 16800:
[2023-03-25 15:41:46,898][trainer][INFO] - loss/edit_train: 0.02775; loss/loc_train: 0.00143; edit/acc_train: 1.00000; edit/log_prob_train: -0.02775; edit/prob_train: 0.97265; acc/pre_train: 0.43000; acc/post_train: 0.62000; nll/pre_train: 0.69389; perplexity/pre_train: 2.00149; nll/post_train: 0.68608; perplexity/post_train: 1.98591; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09386; loss/total_train: 0.00420; loss/total_edit_train: 0.00420; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 9.45413; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00007; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:42:04,252][trainer][INFO] - Step 16900:
[2023-03-25 15:42:04,253][trainer][INFO] - loss/edit_train: 0.03404; loss/loc_train: 0.00246; edit/acc_train: 1.00000; edit/log_prob_train: -0.03404; edit/prob_train: 0.96775; acc/pre_train: 0.46000; acc/post_train: 0.51000; nll/pre_train: 0.69167; perplexity/pre_train: 1.99704; nll/post_train: 0.70253; perplexity/post_train: 2.01886; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09494; loss/total_train: 0.00587; loss/total_edit_train: 0.00587; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 66.86182; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00007; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:42:21,424][trainer][INFO] - Step 17000:
[2023-03-25 15:42:21,424][trainer][INFO] - loss/edit_train: 0.03132; loss/loc_train: 0.00118; edit/acc_train: 1.00000; edit/log_prob_train: -0.03132; edit/prob_train: 0.96973; acc/pre_train: 0.47000; acc/post_train: 0.50000; nll/pre_train: 0.69850; perplexity/pre_train: 2.01074; nll/post_train: 0.69396; perplexity/post_train: 2.00162; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09504; loss/total_train: 0.00431; loss/total_edit_train: 0.00431; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 13.65332; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00007; lr/lr4_train: 0.00013; lr/lr5_train: 0.00012
[2023-03-25 15:42:38,718][trainer][INFO] - Step 17100:
[2023-03-25 15:42:38,718][trainer][INFO] - loss/edit_train: 0.03251; loss/loc_train: 0.00185; edit/acc_train: 1.00000; edit/log_prob_train: -0.03251; edit/prob_train: 0.96870; acc/pre_train: 0.41000; acc/post_train: 0.49000; nll/pre_train: 0.69355; perplexity/pre_train: 2.00081; nll/post_train: 0.69979; perplexity/post_train: 2.01333; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09588; loss/total_train: 0.00510; loss/total_edit_train: 0.00510; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 26.11268; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00007; lr/lr4_train: 0.00013; lr/lr5_train: 0.00012
[2023-03-25 15:42:56,513][trainer][INFO] - Step 17200:
[2023-03-25 15:42:56,514][trainer][INFO] - loss/edit_train: 0.03027; loss/loc_train: 0.00139; edit/acc_train: 1.00000; edit/log_prob_train: -0.03027; edit/prob_train: 0.97026; acc/pre_train: 0.44000; acc/post_train: 0.49000; nll/pre_train: 0.70201; perplexity/pre_train: 2.01780; nll/post_train: 0.69447; perplexity/post_train: 2.00265; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09784; loss/total_train: 0.00442; loss/total_edit_train: 0.00442; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 106.71594; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00007; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:43:13,870][trainer][INFO] - Step 17300:
[2023-03-25 15:43:13,871][trainer][INFO] - loss/edit_train: 0.03777; loss/loc_train: 0.00193; edit/acc_train: 1.00000; edit/log_prob_train: -0.03777; edit/prob_train: 0.96436; acc/pre_train: 0.48000; acc/post_train: 0.60000; nll/pre_train: 0.68629; perplexity/pre_train: 1.98633; nll/post_train: 0.68477; perplexity/post_train: 1.98332; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09504; loss/total_train: 0.00570; loss/total_edit_train: 0.00570; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 53.85475; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00007; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:43:31,194][trainer][INFO] - Step 17400:
[2023-03-25 15:43:31,195][trainer][INFO] - loss/edit_train: 0.03198; loss/loc_train: 0.00258; edit/acc_train: 1.00000; edit/log_prob_train: -0.03198; edit/prob_train: 0.96889; acc/pre_train: 0.39000; acc/post_train: 0.53000; nll/pre_train: 0.70453; perplexity/pre_train: 2.02289; nll/post_train: 0.68395; perplexity/post_train: 1.98168; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09520; loss/total_train: 0.00578; loss/total_edit_train: 0.00578; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 51.85790; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00007; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:43:48,558][trainer][INFO] - Step 17500:
[2023-03-25 15:43:48,559][trainer][INFO] - loss/edit_train: 0.03803; loss/loc_train: 0.00157; edit/acc_train: 1.00000; edit/log_prob_train: -0.03803; edit/prob_train: 0.96402; acc/pre_train: 0.46000; acc/post_train: 0.43000; nll/pre_train: 0.69809; perplexity/pre_train: 2.00991; nll/post_train: 0.69594; perplexity/post_train: 2.00559; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09392; loss/total_train: 0.00537; loss/total_edit_train: 0.00537; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 36.53233; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00007; lr/lr4_train: 0.00013; lr/lr5_train: 0.00012
[2023-03-25 15:44:05,070][trainer][INFO] - Step 17600:
[2023-03-25 15:44:05,071][trainer][INFO] - loss/edit_train: 0.03316; loss/loc_train: 0.00151; edit/acc_train: 1.00000; edit/log_prob_train: -0.03316; edit/prob_train: 0.96857; acc/pre_train: 0.48000; acc/post_train: 0.56000; nll/pre_train: 0.68578; perplexity/pre_train: 1.98531; nll/post_train: 0.69571; perplexity/post_train: 2.00513; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09139; loss/total_train: 0.00483; loss/total_edit_train: 0.00483; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 28.96820; lr/lr0_train: 0.00022; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00007; lr/lr4_train: 0.00013; lr/lr5_train: 0.00012
[2023-03-25 15:44:21,351][trainer][INFO] - Step 17700:
[2023-03-25 15:44:21,351][trainer][INFO] - loss/edit_train: 0.03004; loss/loc_train: 0.00652; edit/acc_train: 1.00000; edit/log_prob_train: -0.03004; edit/prob_train: 0.97075; acc/pre_train: 0.39000; acc/post_train: 0.47000; nll/pre_train: 0.71203; perplexity/pre_train: 2.03813; nll/post_train: 0.69240; perplexity/post_train: 1.99850; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08998; loss/total_train: 0.00952; loss/total_edit_train: 0.00952; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 106.18262; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:44:38,408][trainer][INFO] - Step 17800:
[2023-03-25 15:44:38,408][trainer][INFO] - loss/edit_train: 0.03165; loss/loc_train: 0.00142; edit/acc_train: 1.00000; edit/log_prob_train: -0.03165; edit/prob_train: 0.96929; acc/pre_train: 0.45000; acc/post_train: 0.52000; nll/pre_train: 0.70320; perplexity/pre_train: 2.02021; nll/post_train: 0.69215; perplexity/post_train: 1.99801; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09205; loss/total_train: 0.00458; loss/total_edit_train: 0.00458; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 12.51283; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:44:54,903][trainer][INFO] - Step 17900:
[2023-03-25 15:44:54,903][trainer][INFO] - loss/edit_train: 0.03281; loss/loc_train: 0.00129; edit/acc_train: 1.00000; edit/log_prob_train: -0.03281; edit/prob_train: 0.96857; acc/pre_train: 0.41000; acc/post_train: 0.54000; nll/pre_train: 0.69884; perplexity/pre_train: 2.01142; nll/post_train: 0.68890; perplexity/post_train: 1.99152; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09062; loss/total_train: 0.00458; loss/total_edit_train: 0.00458; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 21.57016; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:45:11,454][trainer][INFO] - Step 18000:
[2023-03-25 15:45:11,454][trainer][INFO] - loss/edit_train: 0.02792; loss/loc_train: 0.00135; edit/acc_train: 1.00000; edit/log_prob_train: -0.02792; edit/prob_train: 0.97266; acc/pre_train: 0.43000; acc/post_train: 0.46000; nll/pre_train: 0.71322; perplexity/pre_train: 2.04055; nll/post_train: 0.70133; perplexity/post_train: 2.01643; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09078; loss/total_train: 0.00414; loss/total_edit_train: 0.00414; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 11.02223; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:45:28,612][trainer][INFO] - Step 18100:
[2023-03-25 15:45:28,613][trainer][INFO] - loss/edit_train: 0.03032; loss/loc_train: 0.00405; edit/acc_train: 1.00000; edit/log_prob_train: -0.03032; edit/prob_train: 0.97051; acc/pre_train: 0.42000; acc/post_train: 0.47000; nll/pre_train: 0.70483; perplexity/pre_train: 2.02350; nll/post_train: 0.70093; perplexity/post_train: 2.01563; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09420; loss/total_train: 0.00708; loss/total_edit_train: 0.00708; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 101.02784; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:45:45,112][trainer][INFO] - Step 18200:
[2023-03-25 15:45:45,112][trainer][INFO] - loss/edit_train: 0.02907; loss/loc_train: 0.00334; edit/acc_train: 1.00000; edit/log_prob_train: -0.02907; edit/prob_train: 0.97181; acc/pre_train: 0.47000; acc/post_train: 0.49000; nll/pre_train: 0.70017; perplexity/pre_train: 2.01409; nll/post_train: 0.68873; perplexity/post_train: 1.99118; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09046; loss/total_train: 0.00624; loss/total_edit_train: 0.00624; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 63.47589; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:46:01,538][trainer][INFO] - Step 18300:
[2023-03-25 15:46:01,538][trainer][INFO] - loss/edit_train: 0.03308; loss/loc_train: 0.00312; edit/acc_train: 1.00000; edit/log_prob_train: -0.03308; edit/prob_train: 0.96829; acc/pre_train: 0.43000; acc/post_train: 0.61000; nll/pre_train: 0.68298; perplexity/pre_train: 1.97977; nll/post_train: 0.69533; perplexity/post_train: 2.00436; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09036; loss/total_train: 0.00643; loss/total_edit_train: 0.00643; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 55.03699; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00007; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:46:18,252][trainer][INFO] - Step 18400:
[2023-03-25 15:46:18,253][trainer][INFO] - loss/edit_train: 0.03385; loss/loc_train: 0.00148; edit/acc_train: 1.00000; edit/log_prob_train: -0.03385; edit/prob_train: 0.96784; acc/pre_train: 0.48000; acc/post_train: 0.60000; nll/pre_train: 0.67997; perplexity/pre_train: 1.97383; nll/post_train: 0.68407; perplexity/post_train: 1.98192; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09062; loss/total_train: 0.00487; loss/total_edit_train: 0.00487; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 10.38802; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:46:35,315][trainer][INFO] - Step 18500:
[2023-03-25 15:46:35,315][trainer][INFO] - loss/edit_train: 0.03046; loss/loc_train: 0.00183; edit/acc_train: 1.00000; edit/log_prob_train: -0.03046; edit/prob_train: 0.97095; acc/pre_train: 0.50000; acc/post_train: 0.60000; nll/pre_train: 0.68814; perplexity/pre_train: 1.99001; nll/post_train: 0.68214; perplexity/post_train: 1.97810; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09454; loss/total_train: 0.00487; loss/total_edit_train: 0.00487; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 49.76709; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:46:52,400][trainer][INFO] - Step 18600:
[2023-03-25 15:46:52,400][trainer][INFO] - loss/edit_train: 0.02577; loss/loc_train: 0.00158; edit/acc_train: 1.00000; edit/log_prob_train: -0.02577; edit/prob_train: 0.97457; acc/pre_train: 0.45000; acc/post_train: 0.50000; nll/pre_train: 0.68862; perplexity/pre_train: 1.99096; nll/post_train: 0.69420; perplexity/post_train: 2.00211; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09270; loss/total_train: 0.00416; loss/total_edit_train: 0.00416; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 5.03609; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:47:09,339][trainer][INFO] - Step 18700:
[2023-03-25 15:47:09,339][trainer][INFO] - loss/edit_train: 0.02916; loss/loc_train: 0.00430; edit/acc_train: 1.00000; edit/log_prob_train: -0.02916; edit/prob_train: 0.97174; acc/pre_train: 0.50000; acc/post_train: 0.58000; nll/pre_train: 0.69846; perplexity/pre_train: 2.01065; nll/post_train: 0.70778; perplexity/post_train: 2.02947; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09258; loss/total_train: 0.00722; loss/total_edit_train: 0.00722; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 70.10202; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00007; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:47:26,129][trainer][INFO] - Step 18800:
[2023-03-25 15:47:26,129][trainer][INFO] - loss/edit_train: 0.03364; loss/loc_train: 0.00149; edit/acc_train: 1.00000; edit/log_prob_train: -0.03364; edit/prob_train: 0.96828; acc/pre_train: 0.40000; acc/post_train: 0.64000; nll/pre_train: 0.69448; perplexity/pre_train: 2.00266; nll/post_train: 0.68277; perplexity/post_train: 1.97936; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09495; loss/total_train: 0.00486; loss/total_edit_train: 0.00486; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 18.80243; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00007; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:47:42,591][trainer][INFO] - Step 18900:
[2023-03-25 15:47:42,592][trainer][INFO] - loss/edit_train: 0.04092; loss/loc_train: 0.00129; edit/acc_train: 1.00000; edit/log_prob_train: -0.04092; edit/prob_train: 0.96299; acc/pre_train: 0.40000; acc/post_train: 0.57000; nll/pre_train: 0.70133; perplexity/pre_train: 2.01644; nll/post_train: 0.69280; perplexity/post_train: 1.99930; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.09081; loss/total_train: 0.00538; loss/total_edit_train: 0.00538; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 25.14632; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00007; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:47:58,426][trainer][INFO] - Step 19000:
[2023-03-25 15:47:58,426][trainer][INFO] - loss/edit_train: 0.02873; loss/loc_train: 0.00142; edit/acc_train: 1.00000; edit/log_prob_train: -0.02873; edit/prob_train: 0.97243; acc/pre_train: 0.47000; acc/post_train: 0.45000; nll/pre_train: 0.69560; perplexity/pre_train: 2.00491; nll/post_train: 0.69607; perplexity/post_train: 2.00586; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08859; loss/total_train: 0.00429; loss/total_edit_train: 0.00429; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 31.86860; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00007; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:48:14,353][trainer][INFO] - Step 19100:
[2023-03-25 15:48:14,353][trainer][INFO] - loss/edit_train: 0.02841; loss/loc_train: 0.00191; edit/acc_train: 1.00000; edit/log_prob_train: -0.02841; edit/prob_train: 0.97223; acc/pre_train: 0.38000; acc/post_train: 0.47000; nll/pre_train: 0.70082; perplexity/pre_train: 2.01541; nll/post_train: 0.68935; perplexity/post_train: 1.99243; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08937; loss/total_train: 0.00475; loss/total_edit_train: 0.00475; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 39.84640; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00007; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:48:30,820][trainer][INFO] - Step 19200:
[2023-03-25 15:48:30,821][trainer][INFO] - loss/edit_train: 0.03210; loss/loc_train: 0.00408; edit/acc_train: 1.00000; edit/log_prob_train: -0.03210; edit/prob_train: 0.96943; acc/pre_train: 0.32000; acc/post_train: 0.42000; nll/pre_train: 0.71616; perplexity/pre_train: 2.04656; nll/post_train: 0.72861; perplexity/post_train: 2.07219; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08939; loss/total_train: 0.00729; loss/total_edit_train: 0.00729; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 123.44946; lr/lr0_train: 0.00021; lr/lr1_train: 0.00021; lr/lr2_train: 0.00006; lr/lr3_train: 0.00007; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:48:47,304][trainer][INFO] - Step 19300:
[2023-03-25 15:48:47,305][trainer][INFO] - loss/edit_train: 0.04069; loss/loc_train: 0.00110; edit/acc_train: 1.00000; edit/log_prob_train: -0.04069; edit/prob_train: 0.96197; acc/pre_train: 0.37000; acc/post_train: 0.59000; nll/pre_train: 0.71343; perplexity/pre_train: 2.04099; nll/post_train: 0.68593; perplexity/post_train: 1.98562; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08895; loss/total_train: 0.00516; loss/total_edit_train: 0.00516; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 38.52404; lr/lr0_train: 0.00021; lr/lr1_train: 0.00020; lr/lr2_train: 0.00006; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:49:03,389][trainer][INFO] - Step 19400:
[2023-03-25 15:49:03,389][trainer][INFO] - loss/edit_train: 0.04068; loss/loc_train: 0.00229; edit/acc_train: 1.00000; edit/log_prob_train: -0.04068; edit/prob_train: 0.96186; acc/pre_train: 0.46000; acc/post_train: 0.50000; nll/pre_train: 0.69527; perplexity/pre_train: 2.00425; nll/post_train: 0.70052; perplexity/post_train: 2.01479; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08927; loss/total_train: 0.00635; loss/total_edit_train: 0.00635; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 45.48400; lr/lr0_train: 0.00021; lr/lr1_train: 0.00020; lr/lr2_train: 0.00005; lr/lr3_train: 0.00008; lr/lr4_train: 0.00012; lr/lr5_train: 0.00012
[2023-03-25 15:49:19,691][trainer][INFO] - Step 19500:
[2023-03-25 15:49:19,691][trainer][INFO] - loss/edit_train: 0.03130; loss/loc_train: 0.00375; edit/acc_train: 1.00000; edit/log_prob_train: -0.03130; edit/prob_train: 0.96939; acc/pre_train: 0.42000; acc/post_train: 0.56000; nll/pre_train: 0.69865; perplexity/pre_train: 2.01103; nll/post_train: 0.68401; perplexity/post_train: 1.98180; n_tokens/pre_train: 1.00000; n_tokens/post_train: 1.00000; time/edit_train: 0.08966; loss/total_train: 0.00688; loss/total_edit_train: 0.00688; memory/alloc_max_train: 3416440320.00000; memory/res_max_train: 3827302400.00000; grad_train: 24.79106; lr/lr0_train: 0.00021; lr/lr1_train: 0.00020; lr/lr2_train: 0.00005; lr/lr3_train: 0.00008; lr/lr4_train: 0.00011; lr/lr5_train: 0.00012